def instances(fi):
    """Yield ``(xseq, yseq)`` pairs parsed from labelled, TAB-separated lines.

    Every non-empty line has the form ``LABEL<TAB>attr[:weight]<TAB>...``.
    An empty line terminates the current sequence.  A final sequence that is
    not followed by an empty line is yielded as well, so input lacking the
    trailing blank line no longer loses its last sequence.

    @type   fi:     iterable of str
    @param  fi:     The input lines (e.g. an open file).
    @rtype          generator of (crfsuite.ItemSequence, tuple of str)
    @return         One pair of items and labels per sequence.
    """
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    # True while the current sequence holds items that were not yielded yet.
    pending = False

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq, tuple(yseq)
            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()
            pending = False
            continue

        # Split the line on TAB characters: label first, attributes after.
        fields = line.split('\t')

        # Append attributes to the item.
        item = crfsuite.Item()
        for field in fields[1:]:
            p = field.rfind(':')
            if p == -1:
                # Unweighted (weight=1) attribute.
                item.append(crfsuite.Attribute(field))
            else:
                # Weighted attribute: text after the last ':' is the weight.
                item.append(crfsuite.Attribute(field[:p],
                                               float(field[p + 1:])))

        # Append the item to the item sequence.
        xseq.append(item)
        # Append the label to the label sequence.
        yseq.append(fields[0])
        pending = True

    # Flush the last sequence when the input did not end with an empty line.
    if pending:
        yield xseq, tuple(yseq)
Example #2
0
def instances(fi):
    """Yield one ``crfsuite.ItemSequence`` per sequence found in ``fi``.

    Every non-empty line has the form ``LABEL<TAB>attr[:weight]<TAB>...``;
    the label column is ignored here.  An empty line terminates the current
    sequence, and a final sequence without a trailing empty line is yielded
    too.

    The previous version overwrote ``fi`` with the hard-coded debug value
    ``'If the'.split(' ')`` and printed a banner for every line, so the
    caller's input was never read; both leftovers are removed.

    @type   fi:     iterable of str
    @param  fi:     The input lines (e.g. an open file).
    @rtype          generator of crfsuite.ItemSequence
    @return         The item sequences, in input order.
    """
    xseq = crfsuite.ItemSequence()
    # True while the current sequence holds items that were not yielded yet.
    pending = False

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq
            xseq = crfsuite.ItemSequence()
            pending = False
            continue

        # Split the line on TAB characters; fields[0] (the label) is unused.
        fields = line.split('\t')
        item = crfsuite.Item()
        for field in fields[1:]:
            p = field.rfind(':')
            if p == -1:
                # Unweighted (weight=1) attribute.
                item.append(crfsuite.Attribute(field))
            else:
                # Weighted attribute: text after the last ':' is the weight.
                item.append(crfsuite.Attribute(field[:p],
                                               float(field[p + 1:])))

        # Append the item to the item sequence.
        xseq.append(item)
        pending = True

    # Flush the last sequence when the input did not end with an empty line.
    if pending:
        yield xseq
Example #3
0
 def append_raw(self, features_iter, check=False):
     '''
     Wrap raw features in crfsuite types and append one item per entry.

     @features_iter is an iterable of iterables, of tuples or strings.
         type: [[(str, float) | str]], where [] is an iterable
     @check: when true, every feature is first passed through utf8str.
     '''
     for raw_features in features_iter:
         prepared = map(utf8str, raw_features) if check else raw_features
         item = crfsuite.Item()
         for feat in prepared:
             if not isinstance(feat, tuple):
                 # Bare feature: the attribute is built from the name alone.
                 item.append(crfsuite.Attribute(feat))
             else:
                 # Tuple feature: its elements become the constructor args.
                 item.append(crfsuite.Attribute(*feat))
         self.append(item)
Example #4
0
    def tag_raw(self, data):
        # Wrap plain Python feature lists in crfsuite types, then tag them.
        # `data` is a list of lists (possibly of length 1), one per token:
        #   data = [['The'], ['man'], ['barked']]
        # A sublist may also hold tuples (string -> float pairs):
        #   data = [['The', ('first', 1)], ['man', 'human', ('first', 0)], ...]
        sequence = crfsuite.ItemSequence()
        for token_features in data:
            item = crfsuite.Item()
            for feat in token_features:
                if isinstance(feat, tuple):
                    attribute = crfsuite.Attribute(*feat)
                else:
                    attribute = crfsuite.Attribute(feat)
                item.append(attribute)
            sequence.append(item)

        return self.tag(sequence)
Example #5
0
    def append_raw(self, features_seq, labels):
        # Wrap one sequence of features and its labels in crfsuite types and
        # append them to self.
        # len(labels) = len(features_seq) = length of the sentence / sequence.
        # labels is a tuple of strings; features_seq is a tuple/list of lists
        # whose entries are strings or tuples.
        sequence = crfsuite.ItemSequence()
        for token_features in features_seq:
            item = crfsuite.Item()
            for feat in token_features:
                if not isinstance(feat, tuple):
                    item.append(crfsuite.Attribute(feat))
                else:
                    item.append(crfsuite.Attribute(*feat))
            sequence.append(item)

        # The labels are handed over as a plain tuple, not a StringList.
        label_tuple = tuple(labels)
        self.append(sequence, label_tuple, 0)
    def instances(self, fi):
        """Yield one ``crfsuite.ItemSequence`` per sequence found in ``fi``.

        Every non-empty line has the form ``LABEL<TAB>attr[:weight]<TAB>...``;
        the label column is ignored.  An empty line terminates the current
        sequence.  A final sequence that is not followed by an empty line is
        yielded as well instead of being silently dropped.

        @type   fi:     iterable of str
        @param  fi:     The input lines (e.g. an open file).
        @rtype          generator of crfsuite.ItemSequence
        @return         The item sequences, in input order.
        """
        xseq = crfsuite.ItemSequence()
        # True while the current sequence holds items not yielded yet.
        pending = False

        for line in fi:
            line = line.strip('\n')
            if not line:
                # An empty line marks the end of a sequence.
                yield xseq
                xseq = crfsuite.ItemSequence()
                pending = False
                continue

            item = crfsuite.Item()
            # fields[0] is the label column and is not used here.
            fields = line.split('\t')
            for field in fields[1:]:
                p = field.rfind(':')
                if p == -1:
                    # No ':' in the field: attribute without explicit weight.
                    item.append(crfsuite.Attribute(field))
                else:
                    # Text after the last ':' is parsed as the weight.
                    item.append(
                        crfsuite.Attribute(field[:p], float(field[p + 1:])))
            xseq.append(item)
            pending = True

        # Flush the last sequence when the input lacks a trailing empty line.
        if pending:
            yield xseq
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence out of a sequence of feature mappings.

    Every element of X contributes one item made from its 'F' entry.  A
    feature that is a str becomes an attribute built from its escaped text;
    any other feature is indexed as (name, value) and the name is escaped.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    import crfsuite
    sequence = crfsuite.ItemSequence()
    for element in X:
        item = crfsuite.Item()
        for feature in element['F']:
            if not isinstance(feature, str):
                attribute = crfsuite.Attribute(escape(feature[0]), feature[1])
            else:
                attribute = crfsuite.Attribute(escape(feature))
            item.append(attribute)
        sequence.append(item)
    return sequence
def read_file_to_crfsuite(crf_input_file, crf_trainer, feature_inclusion_list,
                          participant_list):
    """Feed the sequences of a TAB-separated file to ``crf_trainer``.

    Line handling, in order of precedence:
      * blank lines are ignored;
      * a line containing "label" is the header: it selects the feature
        columns via get_feature_index_list() and provides attribute names;
      * a line containing "START" is skipped;
      * a line containing "END" closes the current sequence, which is
        appended to the trainer with the participant's index in
        ``participant_list`` as group;
      * any other line is a data row: column 0 is the label, column 1 the
        participant, and the selected columns become float attributes
        ('NA' is mapped to 0).

    Rows of participants missing from ``participant_list`` are reported and
    skipped.  The input file is now closed deterministically, also when a
    row fails to parse (it used to be left open).

    @param crf_input_file:          path of the file to read
    @param crf_trainer:             object whose append(xseq, yseq, group)
                                    receives every finished sequence
    @param feature_inclusion_list:  passed through to
                                    get_feature_index_list()
    @param participant_list:        list of accepted participant names
    """
    import crfsuite
    feature_index_list = []
    header = []
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    # The context manager guarantees the handle is released on every path.
    with open(crf_input_file, 'r') as f:
        for line in f:
            # ignore blank lines
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                print('found END')
                # `participant` / `participant_group` come from the most
                # recent data row.
                if participant not in participant_list:
                    continue
                crf_trainer.append(xseq, yseq, participant_group)
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
            else:
                item = crfsuite.Item()
                fields = line.split('\t')
                participant = fields[1].strip('"')
                if participant not in participant_list:
                    print('participant ' + participant + ' not found')
                    continue
                participant_group = participant_list.index(participant)
                for i, raw_value in enumerate(fields):
                    if i in feature_index_list:
                        attribute_name = header[i]
                        if raw_value == 'NA':
                            attribute_val = 0
                        else:
                            attribute_val = float(raw_value)
                        item.append(
                            crfsuite.Attribute(attribute_name, attribute_val))
                xseq.append(item)
                yseq.append(fields[0].strip('"'))
Example #9
0
def read_svm_format(lines):
    # Parses lines such as
    #   Y 15:0.4 16:0.01 19:3.4
    # or
    #   X 4 9 23
    # grouped into sentences by group_by_newline (sentences are separated by
    # a whitespace-only line), and yields one pair per sentence:
    #   (crfsuite.ItemSequence([
    #      crfsuite.Item([crfsuite.Attribute("15"->0.4), crfsuite.Attribute("16"->0.01), crfsuite.Attribute("19"->3.4)]),
    #      crfsuite.Item([crfsuite.Attribute("4"), crfsuite.Attribute("9"), crfsuite.Attribute("23")])
    #    ]), ("X", "Y", ...))
    # crfsuite.Attribute has 2 properties: attr, and value
    for sentence_lines in group_by_newline(lines):
        items = crfsuite.ItemSequence()
        tags = crfsuite.StringList()
        for raw_line in sentence_lines:
            # Cells are separated by single spaces; the first is the label.
            cells = raw_line.strip().split(' ')
            item = crfsuite.Item()
            for cell in cells[1:]:
                if cell == ':':
                    # A field that is just a colon is a literal attribute
                    # name, not a name/weight separator.
                    item.append(crfsuite.Attribute(cell))
                    continue
                pieces = cell.rsplit(':', 1)
                if len(pieces) > 1:
                    # The text after the last colon is the optional weight.
                    item.append(
                        crfsuite.Attribute(pieces[0], float(pieces[1])))
                else:
                    # No colon: the weight is left at its default.
                    item.append(crfsuite.Attribute(pieces[0]))

            # One item and one label per line.
            items.append(item)
            tags.append(cells[0])
        # The end of a group is a sentence boundary.
        yield (items, tuple(tags))
Example #10
0
def selectActiveData(unselected, selected, model, num):
    """Score every line of ``unselected`` by its summed tag entropy.

    For each line the boundary features are generated, the sequence is
    tagged with the CRF model stored at ``model``, and the entropy of the
    marginal distribution over the five boundary tags is summed over all
    positions.  The ``(line, entropy)`` pairs are collected in ``entropy``.

    Fixes over the previous version:
      * ``instances`` is reset for every line, so a failed feature
        generation no longer reuses the previous line's instances (or dies
        with an unbound name on the first line);
      * a marginal probability of 0 contributes 0 to the entropy instead of
        raising ZeroDivisionError and terminating the process;
      * ``except Exception as e`` replaces the Python-2-only comma form.

    NOTE(review): nothing is returned in the code visible here, and
    ``selected``, ``num``, ``select`` are unused -- the function looks
    unfinished; confirm against the full source.

    @param unselected:  iterable of raw sample lines
    @param selected:    unused in the visible code
    @param model:       path of the CRF model opened by the tagger
    @param num:         unused in the visible code
    """
    select = []
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    entropy = []
    for line in unselected:
        sentence, entities_in_sentence = processing.generateSenEntities(
            line, '')
        new_sentence, new_entities = processing.symbolProcess(
            sentence, entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = processing.generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        # Start from an empty list so a failure below cannot leak the
        # instances of the previous line into this one.
        instances = []
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            print('feature_string:%s.' % line)
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            # Columns 0 and 1 are skipped; the rest are attribute names.
            features.append(fields[2:])
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)

        yseq = []
        ie_entity = 0.0
        for i in range(length):
            yseq.append(yseq_b[i])

        for j in range(len(yseq)):
            ie = 0.0  # information entropy at position j
            for ent_tag in bieso:
                try:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    # p * log(1/p) tends to 0 as p -> 0, so a zero marginal
                    # adds nothing (and must not divide by zero).
                    if tag_prob > 0:
                        ie += tag_prob * math.log(1 / tag_prob, 2)
                except Exception as e:
                    # Kept from the original: report the line and stop.
                    print(line)
                    exit(0)
            ie_entity += ie
        entropy.append((line, ie_entity))
Example #11
0
def predictValue(feature_str):
    """Return the probability of the Viterbi labelling for ``feature_str``.

    ``feature_str`` holds one instance per line, TAB-separated; the first
    two columns are skipped and the remaining ones are used as attribute
    names.  The module-level tagger ``tagger_b`` does the tagging.

    When ``feature_str`` is not a string, the problem is reported and an
    empty sequence is tagged, matching how the sibling functions in this
    file pre-initialise ``instances``.  Previously the function went on to
    fail with an unbound ``instances``.

    @param feature_str: feature block for one sentence
    @return             value of tagger_b.probability() for the Viterbi path
    """
    instances = []
    try:
        instances = feature_str.strip().split('\n')
    except AttributeError as e:
        print('feature_string:%s.' % feature_str)
    xseq = crfsuite.ItemSequence()
    for instance in instances:
        fields = instance.split('\t')
        item = crfsuite.Item()
        for field in fields[2:]:  # S3tag\tS1tag\tFeatures
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    tagger_b.set(xseq)

    yseq_b = tagger_b.viterbi()
    prob_b = tagger_b.probability(yseq_b)
    return prob_b
Example #12
0
def read_file_to_crfsuite(crf_input_file, feature_inclusion_list, crf_tagger, output, options_dict):
    """Tag every sequence of a TAB-separated file and write the predictions.

    Line handling, in order of precedence:
      * blank lines are ignored;
      * a line containing "label" is the header: it selects the feature
        columns and provides the attribute names;
      * a line containing "START" is skipped;
      * a line containing "END" closes the current sequence: it is tagged
        with ``crf_tagger`` and the result is either handed to
        write_prediction_to_file() (when the sliding window length is not
        0) or written to ``output`` as "prediction,label" lines;
      * any other line is a data row: column 0 is the label and the
        selected columns become float attributes ('NA' is mapped to 0).

    Fixes over the previous version: the input file is closed
    deterministically, and ``feature_index_list`` / ``header`` start out
    empty (as in the training-side reader) so a data row preceding the
    header no longer fails on an unbound name.

    @param crf_input_file:          path of the file to read
    @param feature_inclusion_list:  passed to
                                    crf_train.get_feature_index_list()
    @param crf_tagger:              object whose tag(xseq) returns the
                                    predicted labels
    @param output:                  writable object receiving the results
    @param options_dict:            must contain 'sliding_window_length'
    """
    sliding_window_length = options_dict['sliding_window_length']
    import crfsuite
    feature_index_list = []
    header = []
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    overlapped_predictions = []
    # The context manager guarantees the handle is released on every path.
    with open(crf_input_file, 'r') as f:
        for line in f:
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = crf_train.get_feature_index_list(line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                prediction_seq = crf_tagger.tag(xseq)
                label_seq = util.convert_to_python_list(yseq)
                if sliding_window_length != 0:
                    overlapped_predictions = write_prediction_to_file(prediction_seq, label_seq, overlapped_predictions, output, options_dict)
                else:
                    # Walk the gold labels in step with the predictions.
                    y_itr = yseq.iterator()
                    for prediction in prediction_seq:
                        label = y_itr.next()
                        output.write(prediction.strip() + "," + label.strip() + "\n")

                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
            else:
                item = crfsuite.Item()
                fields = line.split('\t')
                for i, raw_value in enumerate(fields):
                    if i in feature_index_list:
                        attribute_name = header[i]
                        if raw_value == 'NA':
                            attribute_val = 0
                        else:
                            attribute_val = float(raw_value)
                        item.append(crfsuite.Attribute(attribute_name, attribute_val))
                xseq.append(item)
                yseq.append(fields[0].strip('"'))
Example #13
0
 def __instances(self,fileRead, wordVectors, windowSize, useManualFeature):
     '''
     Generate one (xseq, tuple(yseq)) pair per line of the file fileRead.

     Each line is split on single spaces; every non-empty piece is split
     from the right on self.__tokenLabelSeparator into a token and its label.
     For every token an item is built from:
       - when useManualFeature is true: digit / capital / hyphen flags,
         prefixes and suffixes of up to 4 characters, and one feature per
         entry of CRFSuite.__featuresTemplates;
       - for each of the windowSize positions around the token, the
         components of the token's vector in every mapping of wordVectors,
         appended as weighted attributes.

     @fileRead: path of the input file, opened as UTF-8
     @wordVectors: iterable of mappings from token to a sequence of numbers
     @windowSize: number of window positions considered per token
     @useManualFeature: whether the hand-crafted features are added
     '''
     xseq = crfsuite.ItemSequence()
     yseq = crfsuite.StringList()
     defval = u''
     # NOTE(review): the file is never closed inside this method.
     dataset = codecs.open(fileRead, 'r', 'utf-8')
     
     
     for line in dataset:
         i = 0
         tokens = []
         labels = []
         tokensWithLabels = line.rstrip().split(' ')
         
 #         currentTime = calendar.timegm(time.gmtime())
         
 #         if currentTime - instances.lastTimePrintedMsg > 30.0:
 #             instances.lastTimePrintedMsg = currentTime
 #             logger.info("Processing File. Memory usage: " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
 
         for token in tokensWithLabels:
             # Skip the empty pieces produced by consecutive spaces.
             if token.isspace() or not token:
                 continue
             
             # Split on the LAST separator only, so the token may contain it.
             t = token.rsplit(self.__tokenLabelSeparator,1)
             
             # NOTE(review): when the separator is absent, t has a single
             # element and t[1] raises IndexError here, before the
             # try/except below is reached.
             if len(t[1]) == 0:
                 logging.getLogger("Logger").warn("It was not found the label from "\
                              "the token " + token + ". We give to this token "\
                              " a label equal to"\
                              " the tokenLabelSeparator( " + self.__tokenLabelSeparator +")" )
                   
                 # An empty label is replaced by the separator itself.
                 t[1] = self.__tokenLabelSeparator
             
             try:
                 tokens.append(t[0])
                 labels.append(t[1])
             except Exception:
                 print t
                 print line
                 
                 
         
         # NOTE(review): the print statements above indicate Python 2, where
         # `/` on two ints truncates -- confirm windowSize is an int.
         halfWindowSize = windowSize / 2
         
         for i in range(len(tokens)):
             # Index of the first window position; may be negative.
             beginIndex = i - halfWindowSize
             item = crfsuite.Item()
             
             
             if useManualFeature:
                 # Flags: contains a digit, contains an upper-case character,
                 # contains a hyphen.
                 item.append(crfsuite.Attribute(self.__createFeature("num" , str(int(re.search('\d', tokens[i]) is not None)))))
                 item.append(crfsuite.Attribute(self.__createFeature("cap" , str(any(c.isupper() for c in tokens[i])))))
                 item.append(crfsuite.Attribute(self.__createFeature("hyp" , str(int(re.search('-', tokens[i]) is not None)))))
              
                 # prefixes (defval when the token is too short)
                 item.append(crfsuite.Attribute(self.__createFeature("p1", tokens[i][0] if len(tokens[i]) >= 1 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("p2", tokens[i][:2] if len(tokens[i]) >= 2 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("p3", tokens[i][:3] if len(tokens[i]) >= 3 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("p4", tokens[i][:4] if len(tokens[i]) >= 4 else defval)))
                  
                 # suffixes (defval when the token is too short)
                 item.append(crfsuite.Attribute(self.__createFeature("s1", tokens[i][-1] if len(tokens[i]) >= 1 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("s2", tokens[i][-2:] if len(tokens[i]) >= 2 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("s3", tokens[i][-3:] if len(tokens[i]) >= 3 else defval)))
                 item.append(crfsuite.Attribute(self.__createFeature("s4", tokens[i][-4:] if len(tokens[i]) >= 4 else defval)))
                  
                  
                 # Each template is a list of (name, offset) pairs; the
                 # feature joins the names and the tokens found at those
                 # offsets (defval outside the sentence) with '|'.
                 for featureTemplate, indexFeature in itertools.izip(CRFSuite.__featuresTemplates, range(len(CRFSuite.__featuresTemplates))):
                     namesFeature = []
                     valuesFeature = []
                      
                     for name, index in featureTemplate:
                         namesFeature.append(name + "[" + str(index) + "]")
                         valuesFeature.append(tokens[i + index] if i + index >= 0 and i + index < len(tokens)  else defval)
                      
                     names = "|".join(namesFeature)
                     values = "|".join(valuesFeature)
                      
                     item.append(crfsuite.Attribute(self.__createFeature(names, values)))
     
             for j in range(windowSize):            
                 index = beginIndex + j
                 # Attribute names are prefixed with the window position.
                 label = str(j) + u'|'
                 
                 # Positions outside the sentence use the boundary symbols;
                 # real tokens are passed through every filter first.
                 if index < 0:
                     token = self.__startSentenceSymbol
                 elif index >= len(tokens):
                     token = self.__endSentenceSymbol
                 else:
                     token = tokens[index]
                     
                     for filter in self.__filters:
                         token = filter.filter(token)
                 
                 # k numbers the vector components across all mappings.
                 k = 0
                 for wordvector in wordVectors:
                     if token in wordvector:
                         wv = wordvector[token]
                     else:
                         # Fall back to the first unknown-token entry present.
                         # NOTE(review): if none is present, wv keeps its
                         # previous value (or is unbound on first use).
                         for unknownToken in self.__unknownTokens:
                             if unknownToken in wordvector:
                                 wv = wordvector[unknownToken]
                                 break;
                     
                     for number in wv:
                         item.append(crfsuite.Attribute(self.__createFeature(label + str(k),'_'), number))
                         
                         k += 1
                         
             xseq.append(item)
             yseq.append(unicodeToSrt(labels[i]))
             
         # One sequence per input line; start fresh containers afterwards.
         yield xseq, tuple(yseq)
         xseq = crfsuite.ItemSequence()
         yseq = crfsuite.StringList()
Example #14
0
def semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic):
    """Collect feature strings of samples whose tagged entities are uncertain.

    Every line of ``fsamples`` is tagged with the CRF model stored at
    ``model_previous``.  For each entity found in the predicted tags, the
    entropy of the marginal distribution over the five boundary tags is
    summed over the entity's positions.  When that sum exceeds ``ie_value``
    and the entity lies inside an entity derived from the line's own
    annotation, the annotated entity is kept.  Lines with at least one kept
    entity are re-encoded as "tag<TAB>features" lines and collected.

    @param model_previous:  path of the CRF model opened by the tagger
    @param fsamples:        iterable of raw sample lines
    @param ie_value:        entropy threshold an entity has to exceed
    @param ebao_dic:        passed to the feature generator; presumably a
                            lexicon -- verify against the caller
    @return                 (list of feature strings, number of entries)
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model_previous)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    cdd4training_semi = []
    cdd4training_semi_number = 0
    for line in fsamples:
        # Filter the samples using the recognised entities
        sentence, entities_in_sentence = generateSenEntities(line)
        new_sentence, new_entities = symbolProcess(sentence,
                                                   entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        # NOTE(review): when the AttributeError is caught, `instances` keeps
        # the value from the previous line (or is unbound on the first one).
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            print 'feature_string:%s.' % line
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            # Columns 0 and 1 are skipped; the rest are attribute names.
            features.append(fields[2:])
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)

        yseq = []
        for i in range(length):
            yseq.append(yseq_b[i])
        # Tag refinement: index 1 = predicted tags, index 2 = tags derived
        # from the line's own entities.
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        sen_ent_list2, start_end_list2 = evaluation.generateEntList([tag_seq])
        tagged_ents_length = len(start_end_list1[0])
        if tagged_ents_length == 0: continue

        ents = []
        selected_entity = 0
        # End offset of the last kept entity; predicted entities starting
        # before it are skipped.
        ent_index = 0
        for i in range(tagged_ents_length):
            ent_start = start_end_list1[0][i][0]
            if ent_start < ent_index: continue
            flag = 0
            ent_end = start_end_list1[0][i][1]
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            ie_entity = 0.0
            for j in range(ent_start, ent_end):
                ie = 0.0  # information entropy at position j
                for ent_tag in bieso:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    # NOTE(review): a marginal of exactly 0 raises
                    # ZeroDivisionError here.
                    ie += tag_prob * math.log(1 / tag_prob, 2)
                ie_entity += ie
            # ie_ave = ie_entity / (ent_end - ent_start)
            # if ebao_dic.has_key(ent_content) and ie_ave > ie_value:
            if ie_entity > ie_value:
                # Look for an annotated entity that fully covers the
                # predicted one and keep the annotated span instead.
                for k in range(len(start_end_list2[0])):
                    start_m = start_end_list2[0][k][0]
                    end_m = start_end_list2[0][k][1]
                    if ent_start >= start_m and ent_end <= end_m:
                        # if end_m - start_m < 3: break
                        ents.append(
                            Entity(
                                sentence_unicode[start_m:end_m].encode(
                                    'utf-8'), int(start_m), int(end_m),
                                'entity'))
                        ent_index = end_m
                        flag = 1
                        break
                if flag == 0:
                    continue

                    # NOTE(review): everything below, up to the end of this
                    # `if flag == 0` body, is unreachable because it follows
                    # the `continue` above.
                    if not ebao_dic.has_key(ent_content): continue
                    ents.append(
                        Entity(ent_content, int(ent_start), int(ent_end),
                               'entity'))
                    ent_index = end_m
                selected_entity += 1

        if selected_entity == 0: continue

        char_entity_tag_list = generateFeature.getCharEntityFPTag(
            sentence_unicode, ents, '1')
        char_entity_tag_list = generateFeature.getCharEntityPartialTag(
            char_entity_tag_list)

        # Rebuild the feature block: new tag first, then the original
        # attribute columns of every position.
        new_feature_str = ''
        for j in range(length):
            new_feature_str += '%s\t%s\n' % (char_entity_tag_list[j][1][0],
                                             '\t'.join(features[j]))

        cdd4training_semi.append(new_feature_str.strip())
        cdd4training_semi_number += 1
    return cdd4training_semi, cdd4training_semi_number
Example #15
0
def mainfunction(sen, postProcess, texttype, index):
    """Run the two-layer (boundary, then class) tagging on one sentence.

    The boundary model finds entity spans, the class model assigns a type
    to every span, and the result is re-encoded as one B-/I-/E-/S-/O tag per
    character of the decoded sentence.  With ``postProcess == '1'`` the tags
    additionally go through postProcessing.twoProcessings().

    @param sen:         the sentence; decoded as UTF-8 inside the function
    @param postProcess: '1' enables the post-processing step
    @param texttype:    passed to the class feature generator and to the
                        post-processing
    @param index:       suffix of the model file names below ``root``
    @return             (entity_list, new_yseq): a string with one
                        "content[label]" line per entity, and the per
                        character tag list
    """
    model_b = os.path.join(root, './models/boundarymodel-' + index)
    model_c = os.path.join(root, './models/classmodel-' + index)

    ner_lines = ''

    tagger_b = crfsuite.Tagger()
    tagger_b.open(model_b)
    tagger_c = crfsuite.Tagger()
    tagger_c.open(model_c)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']

    line = sen.strip()

    # model_2_layer
    # boundary
    feature_string = ''
    # Pre-initialised so the loop below sees an empty list when the
    # AttributeError handler fires.
    instances = []
    feature_string, tags = generateFeature.boundaryFeatureGeneration(
        line, [], ebao_dic, 'demo', '0')
    try:
        instances = feature_string.strip().split('\n')
    except AttributeError as e:
        print 'feature_string:%s.' % feature_string
    xseq = crfsuite.ItemSequence()
    for instance in instances:
        fields = instance.split('\t')
        item = crfsuite.Item()
        # Columns 0 and 1 are skipped; the rest are attribute names.
        for field in fields[2:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    tagger_b.set(xseq)
    yseq_b = tagger_b.viterbi()

    line_unicode = line.decode('utf-8')

    model_chosen = '2layer'
    # class
    sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])

    length = len(sen_ent_list1[0])
    # when length is 0 the loop below does not run and entities stays empty
    entities = []
    new_entities = []
    for j in range(length):
        ent_start = sen_ent_list1[0][j][0]
        ent_end = sen_ent_list1[0][j][1]
        ent_type = sen_ent_list1[0][j][2]
        # Offsets index the decoded text; the content is re-encoded.
        ent_content = line_unicode[ent_start:ent_end].encode('utf-8')
        entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
    feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
        line, entities, ebao_dic, texttype)
    # Instances are separated by blank lines; each one is tagged as a
    # one-item sequence.
    instances = feature_c.strip().split('\n\n')
    ents_type = []
    for instance in instances:
        xseq = crfsuite.ItemSequence()
        fields = instance.split('\t')
        item = crfsuite.Item()
        # Column 0 is skipped; the rest are attribute names.
        for field in fields[1:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
        tagger_c.set(xseq)
        yseq_c = tagger_c.viterbi()
        ents_type.append(yseq_c[0])
    # postProcessing

    # Re-encode the typed entities as one tag per character: 'S-' for a
    # single-character entity, otherwise 'B-' ... 'I-' ... 'E-'.
    new_yseq = ['O' for i in range(len(line_unicode))]
    for j in range(len(entities)):
        start = entities[j].start_pos
        end = entities[j].end_pos
        enttype = ents_type[j]
        if start + 1 == end:
            new_yseq[start] = 'S-' + enttype
            continue
        new_yseq[start] = 'B-' + enttype
        for k in range(start + 1, end - 1):
            new_yseq[k] = 'I-' + enttype
        new_yseq[end - 1] = 'E-' + enttype

    if postProcess == '1':  # the start_end_list used in evaluation is not adjusted
        new_yseq = postProcessing.twoProcessings(line_unicode, new_yseq,
                                                 ebao_dic, texttype)

    ents1, s_e_list1 = evaluation.generateEntList([new_yseq])
    new_entities = ents1[0]

    entity_list = ''
    length = len(new_entities)
    for i in range(length):
        content = line_unicode[new_entities[i][0]:new_entities[i][1]]
        enttype = new_entities[i][2]
        # Report entities that ended up without a type.
        if enttype == '':
            print line_unicode.encode('utf8'), line_unicode[
                new_entities[i][0]:new_entities[i][1]].encode('utf8')
        # presumably en_cn_dic maps the type to its display label -- verify
        entity_list += content.encode(
            'utf8') + '[' + en_cn_dic[enttype] + ']\n'
    return entity_list, new_yseq
Example #16
0
def predictClassAfterBoundaryAndEval(boundary_result, sentence_list,
                                     sen_tags_list, classmodel_file, ebao_dic,
                                     post_processing, texttype):
    """Classify the entities found by the boundary model and evaluate them.

    For every sentence the spans taken from ``boundary_result`` are typed
    with the class model stored at ``classmodel_file``.  With
    ``post_processing == '1'`` the typed spans are re-encoded as per
    character tags, passed through postProcessing.twoProcessings() and
    converted back to entities.  The resulting entities are compared with
    those derived from ``sen_tags_list`` and the counts are handed to
    evaluation.measurePRF().  Nothing is returned.

    @param boundary_result:  output of the boundary model, converted with
                             evaluation.generateTagList()
    @param sentence_list:    the sentences, indexable in parallel with the
                             boundary result; decoded as UTF-8
    @param sen_tags_list:    reference tag sequences
    @param classmodel_file:  path of the class model opened by the tagger
    @param ebao_dic:         passed to the feature generator and the
                             post-processing
    @param post_processing:  '1' enables the post-processing step
    @param texttype:         passed to the feature generator and the
                             post-processing
    """
    tagger = crfsuite.Tagger()
    tagger.open(classmodel_file)
    result_tags_list = evaluation.generateTagList(boundary_result)
    # 1 is the system output, 2 is the gold-standard data
    sen_ent_list1, start_end_list1 = evaluation.generateEntList(
        result_tags_list)  # only a single 'entity' class
    sen_ent_list2, start_end_list2 = evaluation.generateEntList(
        sen_tags_list)  # multiple classes
    length = len(sen_ent_list1)
    new_sen_ent_list1 = []
    sen_inside_ent_list = []
    for i in range(length):
        # build the corresponding entity list
        sentence = sentence_list[i]
        sentence_unicode = sentence.decode('utf-8')
        entities = []
        new_entities = []
        s_e_list = []
        # Sentences without predicted entities get placeholder entries and
        # skip the class model.
        if len(sen_ent_list1[i]) == 0:
            sen_inside_ent_list.append([['']])
            new_sen_ent_list1.append(sen_ent_list1[i])
            continue
        for j in range(len(sen_ent_list1[i])):
            ent_start = sen_ent_list1[i][j][0]
            ent_end = sen_ent_list1[i][j][1]
            ent_type = sen_ent_list1[i][j][2]
            # Offsets index the decoded text; the content is re-encoded.
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
            s_e_list.append([sentence, ent_content])
        sen_inside_ent_list.append(s_e_list)
        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            sentence, entities, ebao_dic, texttype)

        # Instances are separated by blank lines; each one is tagged as a
        # one-item sequence.
        instances = feature_c.strip().split('\n\n')
        ents_type = []
        for instance in instances:
            xseq = crfsuite.ItemSequence()
            fields = instance.split('\t')
            item = crfsuite.Item()
            # Column 0 is skipped; the rest are attribute names.
            for field in fields[1:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
            tagger.set(xseq)
            yseq = tagger.viterbi()
            ents_type.append(yseq[0])
        # postProcessing
        if post_processing == '1':  # the start_end_list used in evaluation is not adjusted
            # NOTE(review): under Python 2 the comprehension variable `i`
            # below leaks and overwrites the outer loop index for the rest
            # of this iteration; it is not read again in this branch.
            new_yseq = ['O' for i in range(len(sentence_unicode))]
            # 'S-' for a single-character entity, otherwise 'B-' ... 'E-'.
            for j in range(len(entities)):
                start = entities[j].start_pos
                end = entities[j].end_pos
                enttype = ents_type[j]
                if start + 1 == end:
                    new_yseq[start] = 'S-' + enttype
                    continue
                new_yseq[start] = 'B-' + enttype
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + enttype
                new_yseq[end - 1] = 'E-' + enttype
            new_yseq1 = postProcessing.twoProcessings(sentence_unicode,
                                                      new_yseq, ebao_dic,
                                                      texttype)
            tag_list1 = []
            tag_list1.append(new_yseq1)
            ents1, s_e_list1 = evaluation.generateEntList(tag_list1)
            new_entities = ents1[0]
        else:
            # Without post-processing the boundary spans are kept and only
            # the predicted type is attached.
            for k in range(len(ents_type)):
                try:
                    new_entities.append((sen_ent_list1[i][k][0],
                                         sen_ent_list1[i][k][1], ents_type[k]))
                except Exception as e:
                    # Diagnostic output for a span/type count mismatch.
                    print e
                    print len(sen_ent_list1[i]), len(ents_type)
                    print sentence
                    print feature_c
        new_sen_ent_list1.append(new_entities)

    # error analysis
    ent_count_result, ent_count_result_o = evaluation.countEntList(
        new_sen_ent_list1, sen_ent_list2, start_end_list1, start_end_list2,
        sen_inside_ent_list)
    evaluation.measurePRF(ent_count_result_o)
Example #17
0
def getNerResult(inputstring, tagger_b, tagger_c, bieso):
    """Run the two-layer tagging on every sentence of ``inputstring``.

    The input is split with tools.sentence_split(); for every non-empty
    sentence the boundary tagger finds entity spans, the class tagger types
    them, and the per character tags are handed to generateNerInSentence().
    The results of all sentences are concatenated.

    @param inputstring: the text to analyse; sentences are decoded as UTF-8
    @param tagger_b:    tagger providing set()/viterbi()/probability() for
                        the boundary model
    @param tagger_c:    tagger providing set()/viterbi() for the class model
    @param bieso:       not used inside this function
    @return             concatenation of the generateNerInSentence() results
    """
    # inputstring = unicode(inputstring)
    # inputsentence = tools.uniformSignal(inputstring.encode('utf8'))
    lines = tools.sentence_split(inputstring)

    ent_list = ''
    for line in lines:
        line = line.strip()
        # Dropping markup (lines starting with '<' and ending with '>') is
        # disabled; only empty lines are skipped.
        #if line == '' or line[0] == '<' and line[-1] == '>' : continue
        if line == '': continue

        # model_2_layer
        # boundary
        feature_string = ''
        # Pre-initialised so the loop below sees an empty list when the
        # AttributeError handler fires.
        instances = []
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            line, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            print 'feature_string:%s.' % feature_string
        xseq = crfsuite.ItemSequence()
        for instance in instances:
            fields = instance.split('\t')
            item = crfsuite.Item()
            # Columns 0 and 1 are skipped; the rest are attribute names.
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_b.set(xseq)

        yseq_b = tagger_b.viterbi()
        # prob_b is computed but not used below.
        prob_b = tagger_b.probability(yseq_b)
        line_unicode = line.decode('utf-8')

        model_chosen = '2layer'
        # class
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])

        length = len(sen_ent_list1[0])
        # when length is 0 the loop below does not run and entities stays empty
        sentence = line
        entities = []
        for j in range(length):
            ent_start = sen_ent_list1[0][j][0]
            ent_end = sen_ent_list1[0][j][1]
            ent_type = sen_ent_list1[0][j][2]
            # Offsets index the decoded text; the content is re-encoded.
            ent_content = sentence.decode('utf-8')[ent_start:ent_end].encode(
                'utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            sentence, entities, ebao_dic, '')
        # Instances are separated by blank lines; each one is tagged as a
        # one-item sequence.
        instances = feature_c.strip().split('\n\n')
        ents_type = []
        for instance in instances:
            xseq = crfsuite.ItemSequence()
            fields = instance.split('\t')
            item = crfsuite.Item()
            # Column 0 is skipped; the rest are attribute names.
            for field in fields[1:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
            tagger_c.set(xseq)
            yseq_c = tagger_c.viterbi()
            ents_type.append(yseq_c[0])
        # Re-encode the typed entities as one tag per character: 'S-' for a
        # single-character entity, otherwise 'B-' ... 'I-' ... 'E-'.
        new_yseq = ['O' for i in range(len(line_unicode))]
        for j in range(len(entities)):
            start = entities[j].start_pos
            end = entities[j].end_pos
            if start + 1 == end:
                new_yseq[start] = 'S-' + ents_type[j]
                continue
            new_yseq[start] = 'B-' + ents_type[j]
            for k in range(start + 1, end - 1):
                new_yseq[k] = 'I-' + ents_type[j]
            new_yseq[end - 1] = 'E-' + ents_type[j]

        ents = generateNerInSentence(line_unicode, new_yseq, model_chosen,
                                     ebao_dic)
        ent_list += ents
    return ent_list
Example #18
0
def _items_to_sequence(rows, first_attr):
    """Build a crfsuite.ItemSequence holding one item per feature row.

    Each row is a TAB-separated string. Columns before index `first_attr`
    are skipped; every remaining column is appended to the row's item as a
    single-argument crfsuite.Attribute.
    """
    xseq = crfsuite.ItemSequence()
    for row in rows:
        item = crfsuite.Item()
        for field in row.split('\t')[first_attr:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    return xseq


def _predict_entity_types(feature_c, taggerc):
    """Return the first Viterbi label of `taggerc` for every feature block.

    `feature_c` holds feature blocks separated by blank lines; each block is
    tagged on its own as a one-item sequence (its first column is skipped).
    """
    ents_type = []
    for block in feature_c.strip().split('\n\n'):
        taggerc.set(_items_to_sequence([block], 1))
        ents_type.append(taggerc.viterbi()[0])
    return ents_type


def _bieso_labels(length, entities, ents_type):
    """Lay entity spans out as a B-/I-/E-/S-/O label list of size `length`.

    `entities[i]` covers positions start_pos .. end_pos - 1 (end exclusive)
    and is labelled with `ents_type[i]`. A span of width one gets 'S-';
    wider spans get 'B-' on the first slot, 'E-' on the last and 'I-' on
    the slots in between. Every other position stays 'O'.
    """
    labels = ['O'] * length
    for idx, entity in enumerate(entities):
        # Indexed (not zipped) so a missing type still raises IndexError.
        ent_type = ents_type[idx]
        start = entity.start_pos
        end = entity.end_pos
        if start + 1 == end:
            labels[start] = 'S-' + ent_type
            continue
        labels[start] = 'B-' + ent_type
        for k in range(start + 1, end - 1):
            labels[k] = 'I-' + ent_type
        labels[end - 1] = 'E-' + ent_type
    return labels


def mainfunction(inputstring, taggerb, taggerc):
    """Run the two-layer NER pipeline over `inputstring` and render HTML.

    The input is normalised with tools.uniformSignal and split on newlines;
    each line is split into sentences with tools.sentence_split. Empty
    sentences and sentences that start with '<' and end with '>' are
    skipped. For every remaining sentence:

    1. boundary layer: features from
       generateFeature.boundaryFeatureGeneration are tagged by `taggerb`
       and turned into entity spans by evaluation.generateEntList;
    2. class layer: features from generateFeature.classFeatureGeneration
       are tagged by `taggerc`, one entity at a time, to get entity types;
    3. the spans and types are merged into one B/I/E/S/O label sequence
       and handed to generateNerInSentence.

    Args:
        inputstring: text to analyse. Sentences are decoded with
            .decode('utf-8'), so presumably a UTF-8 byte string -- confirm
            against callers.
        taggerb: boundary tagger; must provide set(xseq) and viterbi().
        taggerc: entity-type tagger; must provide set(xseq) and viterbi().

    Returns:
        A 2-tuple (ner_lines, new_term_list). `ner_lines` contains, for
        every input line, '<p>' + marked-up sentences + '</p>', then
        '<p>' + that line's term list + '</p>', then '<br/>'.
        `new_term_list` is the concatenation of all term lists.

        NOTE(review): for an empty `inputstring` a single prompt string is
        returned instead of a 2-tuple; callers have to handle both shapes.
    """
    if inputstring == '':
        return '请输入句子'

    # Sentence pre-processing.
    inputsentence = tools.uniformSignal(inputstring)

    model_chosen = '2layer'
    ner_lines = ''
    new_term_list = ''

    for single_line in inputsentence.split('\n'):
        ner_line = ''
        term_list = ''
        for line in tools.sentence_split(single_line):
            line = line.strip()
            # Drop markup: anything that starts with '<' and ends with '>'.
            if line == '' or (line[0] == '<' and line[-1] == '>'):
                continue

            # ---- Layer 1: entity boundaries ----
            feature_string, _tags = generateFeature.boundaryFeatureGeneration(
                line, [], ebao_dic, 'demo', '0')
            try:
                rows = feature_string.strip().split('\n')
            except AttributeError:
                # Best effort: report the unusable feature value and tag an
                # empty sequence rather than aborting the whole request.
                print('feature_string:%s.' % feature_string)
                rows = []
            taggerb.set(_items_to_sequence(rows, 2))
            yseq_b = taggerb.viterbi()

            # Decode once; entity offsets index into the decoded text.
            line_unicode = line.decode('utf-8')

            # ---- Layer 2: entity types ----
            sen_ent_list1, _start_end = evaluation.generateEntList([yseq_b])
            entities = []
            for span in sen_ent_list1[0]:
                ent_start = span[0]
                ent_end = span[1]
                ent_content = line_unicode[ent_start:ent_end].encode('utf-8')
                entities.append(
                    Entity(ent_content, ent_start, ent_end, span[2]))

            # NOTE(review): `texttype` is not defined in this function; it
            # is presumably a module-level global -- confirm.
            feature_c, _sen_ent4error = (
                generateFeature.classFeatureGeneration(
                    line, entities, ebao_dic, texttype))
            ents_type = _predict_entity_types(feature_c, taggerc)

            new_yseq = _bieso_labels(len(line_unicode), entities, ents_type)
            sen_ent_colored, ent_list = generateNerInSentence(
                line_unicode, new_yseq, model_chosen, ebao_dic)

            new_term_list += ent_list

            # NOTE(review): the fallback embeds the raw sentence in the HTML
            # output without escaping -- check whether input is trusted.
            if sen_ent_colored == '':
                sen_ent_colored = line
            ner_line += sen_ent_colored
            term_list += ent_list

        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'

    return ner_lines, new_term_list