# NOTE: these test methods assume the enclosing module imports demjson, json,
# numpy as np, and the project's nerstat helpers at module level.
def test1(self):
    json_ltp = demjson.decode_file("./" + "ltp_0_test.json")
    data_ltp = json_ltp["data"]
    number_of_phrase = len(data_ltp)
    # only the first 3 phrases are inspected here
    for j in range(3):
        ltp_entities_info = nerstat.find_entity_info(data_ltp[j], "ltp")
        entities = ltp_entities_info["entity"]
        ner_tags = ltp_entities_info["entity_type"]
        entity_unicode_lens = ltp_entities_info["entity_unicode_len"]
        startposes = ltp_entities_info["startpos"]
        endposes = ltp_entities_info["endpos"]
        num = len(entities)
        for i in range(num):
            print entities[i].decode("utf-8")
            print ner_tags[i]
            print startposes[i], endposes[i], entity_unicode_lens[i]
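# Hedged sketch of the entity-info contract these tests rely on. This shape is
# inferred solely from the keys read above and below; the authoritative
# structure is whatever nerstat.find_entity_info actually returns. All lists
# are parallel, one element per recognized entity:
#
#     {
#         "phrase_id": 0,                  # id of the analyzed phrase
#         "entity": ["北京"],              # entity strings (UTF-8 bytes)
#         "entity_type": ["LOCATION"],     # PERSON / LOCATION / ORGANIZATION
#         "entity_unicode_len": [2],       # entity length in unicode chars
#         "startpos": [0],                 # start offset within the phrase
#         "endpos": [2],                   # end offset within the phrase
#     }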
def test2(self):
    jsonObject = {}
    jsonObjArray = []
    # create a phrase_id <--> doc_id dictionary so doc_id can be looked up
    # by phrase_id for boson NER
    myDocDic = nerstat.setupDoc_ID_Dic()
    # number of input files
    for i in range(1):  # there is only 1 file for boson
        # for i in range(22):
        print i
        json_corenlp = demjson.decode_file(
            "./" + "corenlp_" + str(i) + "_test.json", encoding="utf8")
        json_ltp = demjson.decode_file(
            "./" + "ltp_" + str(i) + "_test.json", encoding="utf8")
        json_hanlp = demjson.decode_file(
            "./" + "hanlp_" + str(i) + "_test.json", encoding="utf8")
        json_fnlp = demjson.decode_file(
            "./" + "fnlp_" + str(i) + "_test.json", encoding="utf8")
        json_fool = demjson.decode_file(
            "./" + "foolnltk_" + str(i) + "_test.json", encoding="utf8")
        # data arrays
        data_corenlp = json_corenlp["data"]
        data_ltp = json_ltp["data"]
        data_hanlp = json_hanlp["data"]
        data_fnlp = json_fnlp["data"]
        data_fool = json_fool["data"]
        number_of_phrase = len(data_ltp)
        for j in range(1):
            # for j in range(3):
            # merge organization/person/location and collect the entities
            corenlp_entities_info = nerstat.find_entity_info(
                data_corenlp[j], "corenlp")
            ltp_entities_info = nerstat.find_entity_info(data_ltp[j], "ltp")
            hanlp_entities_info = nerstat.find_entity_info(
                data_hanlp[j], "hanlp")
            fnlp_entities_info = nerstat.find_entity_info(
                data_fnlp[j], "fnlp")
            fool_entities_info = nerstat.find_foolnltk_entity(data_fool[j])
            # ltp: 1, corenlp: 2, hanlp: 4; the sum tells which pair agrees
            # compare entity text, entity length and entity start position:
            # 1. first check whether the tools report the same entity count
            # 2. then compare in detail:
            #    2.1 PERSON count, and how many match
            #    2.2 LOCATION count, and how many match
            #    2.3 ORGANIZATION count, and how many match
            # handled with set operations: &, in, not in
            phrase_id = corenlp_entities_info["phrase_id"]
            # ----------------------------------------------------------
            doc_id, sentence = myDocDic[phrase_id]
            boson_entities_info = nerstat.find_boson_entity(
                doc_id, phrase_id)
            # ----------------------------------------------------------
            # update space info: shift entity offsets to account for
            # whitespace in the original sentence
            spaceinfo = self.getspaceinfo(sentence)
            # ltp_entities_info = self.updateentityoffset(spaceinfo, ltp_entities_info)
            # fnlp_entities_info = self.updateentityoffset(spaceinfo, fnlp_entities_info)
            # boson_entities_info = self.updateentityoffset(spaceinfo, boson_entities_info)
            self.updateentityoffset(spaceinfo, ltp_entities_info)
            self.updateentityoffset(spaceinfo, fnlp_entities_info)
            self.updateentityoffset(spaceinfo, boson_entities_info)
            print "test verification 0"
            self.printentityinfo("corenlp", corenlp_entities_info)
            self.printentityinfo("ltp", ltp_entities_info)
            self.printentityinfo("hanlp", hanlp_entities_info)
            self.printentityinfo("fnlp", fnlp_entities_info)
            self.printentityinfo("fool", fool_entities_info)
            self.printentityinfo("boson", boson_entities_info)
            np_entity_corenlp = np.array(corenlp_entities_info["entity"])
            np_entity_ltp = np.array(ltp_entities_info["entity"])
            np_entity_hanlp = np.array(hanlp_entities_info["entity"])
            np_entity_fnlp = np.array(fnlp_entities_info["entity"])
            np_entity_fool = np.array(fool_entities_info["entity"])
            np_entity_boson = np.array(boson_entities_info["entity"])
            np_type_corenlp = np.array(corenlp_entities_info["entity_type"])
            np_type_ltp = np.array(ltp_entities_info["entity_type"])
            np_type_hanlp = np.array(hanlp_entities_info["entity_type"])
            np_type_fnlp = np.array(fnlp_entities_info["entity_type"])
            np_type_fool = np.array(fool_entities_info["entity_type"])
            np_type_boson = np.array(boson_entities_info["entity_type"])
            # for the overlap of entities between corenlp, ltp and hanlp
            np_startpos_corenlp = np.array(corenlp_entities_info["startpos"])
            np_startpos_ltp = np.array(ltp_entities_info["startpos"])
            np_startpos_hanlp = np.array(hanlp_entities_info["startpos"])
            np_startpos_fnlp = np.array(fnlp_entities_info["startpos"])
            np_startpos_fool = np.array(fool_entities_info["startpos"])
            np_startpos_boson = np.array(boson_entities_info["startpos"])
            np_endpos_corenlp = np.array(corenlp_entities_info["endpos"])
            np_endpos_ltp = np.array(ltp_entities_info["endpos"])
            np_endpos_hanlp = np.array(hanlp_entities_info["endpos"])
            np_endpos_fnlp = np.array(fnlp_entities_info["endpos"])
            np_endpos_fool = np.array(fool_entities_info["endpos"])
            np_endpos_boson = np.array(boson_entities_info["endpos"])
            np_entitylen_corenlp = np.array(
                corenlp_entities_info["entity_unicode_len"])
            np_entitylen_ltp = np.array(
                ltp_entities_info["entity_unicode_len"])
            np_entitylen_hanlp = np.array(
                hanlp_entities_info["entity_unicode_len"])
            np_entitylen_fnlp = np.array(
                fnlp_entities_info["entity_unicode_len"])
            np_entitylen_fool = np.array(
                fool_entities_info["entity_unicode_len"])
            np_entitylen_boson = np.array(
                boson_entities_info["entity_unicode_len"])
            print "first verification"
            print "-------------corenlp----------------"
            self.printnp("   np_entity_corenlp: ", np_entity_corenlp)
            self.printnp("     np_type_corenlp: ", np_type_corenlp)
            self.printnp(" np_startpos_corenlp: ", np_startpos_corenlp)
            self.printnp("   np_endpos_corenlp: ", np_endpos_corenlp)
            self.printnp("np_entitylen_corenlp: ", np_entitylen_corenlp)
            print "-------------ltp----------------"
            self.printnp("   np_entity_ltp: ", np_entity_ltp)
            self.printnp("     np_type_ltp: ", np_type_ltp)
            self.printnp(" np_startpos_ltp: ", np_startpos_ltp)
            self.printnp("   np_endpos_ltp: ", np_endpos_ltp)
            self.printnp("np_entitylen_ltp: ", np_entitylen_ltp)
            print "-------------hanlp----------------"
            self.printnp("   np_entity_hanlp: ", np_entity_hanlp)
            self.printnp("     np_type_hanlp: ", np_type_hanlp)
            self.printnp(" np_startpos_hanlp: ", np_startpos_hanlp)
            self.printnp("   np_endpos_hanlp: ", np_endpos_hanlp)
            self.printnp("np_entitylen_hanlp: ", np_entitylen_hanlp)
            print "-------------fnlp----------------"
            self.printnp("   np_entity_fnlp: ", np_entity_fnlp)
            self.printnp("     np_type_fnlp: ", np_type_fnlp)
            self.printnp(" np_startpos_fnlp: ", np_startpos_fnlp)
            self.printnp("   np_endpos_fnlp: ", np_endpos_fnlp)
            self.printnp("np_entitylen_fnlp: ", np_entitylen_fnlp)
            print "-------------fool----------------"
            self.printnp("   np_entity_fool: ", np_entity_fool)
            self.printnp("     np_type_fool: ", np_type_fool)
            self.printnp(" np_startpos_fool: ", np_startpos_fool)
            self.printnp("   np_endpos_fool: ", np_endpos_fool)
            self.printnp("np_entitylen_fool: ", np_entitylen_fool)
            print "-------------boson----------------"
            self.printnp("   np_entity_boson: ", np_entity_boson)
            self.printnp("     np_type_boson: ", np_type_boson)
            self.printnp(" np_startpos_boson: ", np_startpos_boson)
            self.printnp("   np_endpos_boson: ", np_endpos_boson)
            self.printnp("np_entitylen_boson: ", np_entitylen_boson)
            subJsonObject = {}
            subJsonObject["phrase_id"] = phrase_id
            # count PERSON, LOCATION and ORGANIZATION entities per NER tool
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            c_person_indexes = nerstat.find_all_index(
                np_type_corenlp, "PERSON")
            c_location_indexes = nerstat.find_all_index(
                np_type_corenlp, "LOCATION")
            c_organization_indexes = nerstat.find_all_index(
                np_type_corenlp, "ORGANIZATION")
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            l_person_indexes = nerstat.find_all_index(
                np_type_ltp, "PERSON")
            l_location_indexes = nerstat.find_all_index(
                np_type_ltp, "LOCATION")
            l_organization_indexes = nerstat.find_all_index(
                np_type_ltp, "ORGANIZATION")
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            h_person_indexes = nerstat.find_all_index(
                np_type_hanlp, "PERSON")
            h_location_indexes = nerstat.find_all_index(
                np_type_hanlp, "LOCATION")
            h_organization_indexes = nerstat.find_all_index(
                np_type_hanlp, "ORGANIZATION")
            # FNLP
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            f_person_indexes = nerstat.find_all_index(
                np_type_fnlp, "PERSON")
            f_location_indexes = nerstat.find_all_index(
                np_type_fnlp, "LOCATION")
            f_organization_indexes = nerstat.find_all_index(
                np_type_fnlp, "ORGANIZATION")
            # foolnltk
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            o_person_indexes = nerstat.find_all_index(
                np_type_fool, "PERSON")
            o_location_indexes = nerstat.find_all_index(
                np_type_fool, "LOCATION")
            o_organization_indexes = nerstat.find_all_index(
                np_type_fool, "ORGANIZATION")
            # boson_ner
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            b_person_indexes = nerstat.find_all_index(
                np_type_boson, "PERSON")
            b_location_indexes = nerstat.find_all_index(
                np_type_boson, "LOCATION")
            b_organization_indexes = nerstat.find_all_index(
                np_type_boson, "ORGANIZATION")
            print "second verification"
            print "c_person_indexes: ", c_person_indexes
            print "c_organization_indexes: ", c_organization_indexes
            print "l_person_indexes: ", l_person_indexes
            print "l_organization_indexes: ", l_organization_indexes
            print "h_person_indexes: ", h_person_indexes
            print "h_organization_indexes: ", h_organization_indexes
            print "f_person_indexes: ", f_person_indexes
            print "f_organization_indexes: ", f_organization_indexes
            print "o_person_indexes: ", o_person_indexes
            print "o_organization_indexes: ", o_organization_indexes
            print "b_person_indexes: ", b_person_indexes
            print "b_organization_indexes: ", b_organization_indexes
            # per-tool counts: [PERSON, LOCATION, ORGANIZATION]
            corenlp_amount_stat = [
                len(c_person_indexes),
                len(c_location_indexes),
                len(c_organization_indexes)
            ]
            ltp_amount_stat = [
                len(l_person_indexes),
                len(l_location_indexes),
                len(l_organization_indexes)
            ]
            hanlp_amount_stat = [
                len(h_person_indexes),
                len(h_location_indexes),
                len(h_organization_indexes)
            ]
            fnlp_amount_stat = [
                len(f_person_indexes),
                len(f_location_indexes),
                len(f_organization_indexes)
            ]
            fool_amount_stat = [
                len(o_person_indexes),
                len(o_location_indexes),
                len(o_organization_indexes)
            ]
            boson_amount_stat = [
                len(b_person_indexes),
                len(b_location_indexes),
                len(b_organization_indexes)
            ]
            # ------------------------------------------------
            subJsonObject["corenlp"] = corenlp_amount_stat
            subJsonObject["ltp"] = ltp_amount_stat
            subJsonObject["hanlp"] = hanlp_amount_stat
            subJsonObject["fnlp"] = fnlp_amount_stat
            subJsonObject["fool"] = fool_amount_stat
            subJsonObject["boson"] = boson_amount_stat
            # ------------------------------------------------
            # ## 2018-01-31 set operation deleted
            # for overlap information
            bc_overlapped, bc_matched = nerstat.find_overlaps(
                boson_entities_info, b_person_indexes, b_location_indexes,
                b_organization_indexes, corenlp_entities_info,
                c_person_indexes, c_location_indexes, c_organization_indexes)
            bl_overlapped, bl_matched = nerstat.find_overlaps(
                boson_entities_info, b_person_indexes, b_location_indexes,
                b_organization_indexes, ltp_entities_info,
                l_person_indexes, l_location_indexes, l_organization_indexes)
            bh_overlapped, bh_matched = nerstat.find_overlaps(
                boson_entities_info, b_person_indexes, b_location_indexes,
                b_organization_indexes, hanlp_entities_info,
                h_person_indexes, h_location_indexes, h_organization_indexes)
            bf_overlapped, bf_matched = nerstat.find_overlaps(
                boson_entities_info, b_person_indexes, b_location_indexes,
                b_organization_indexes, fnlp_entities_info,
                f_person_indexes, f_location_indexes, f_organization_indexes)
            bo_overlapped, bo_matched = nerstat.find_overlaps(
                boson_entities_info, b_person_indexes, b_location_indexes,
                b_organization_indexes, fool_entities_info,
                o_person_indexes, o_location_indexes, o_organization_indexes)
            # ------------------------------------------------
            subJsonObject["bc"] = bc_matched
            subJsonObject["bl"] = bl_matched
            subJsonObject["bh"] = bh_matched
            subJsonObject["bf"] = bf_matched
            subJsonObject["bo"] = bo_matched
            # ------------------------------------------------
            # ------------------------------------------------
            subJsonObject["bc_overlapped"] = bc_overlapped
            subJsonObject["bl_overlapped"] = bl_overlapped
            subJsonObject["bh_overlapped"] = bh_overlapped
            subJsonObject["bf_overlapped"] = bf_overlapped
            subJsonObject["bo_overlapped"] = bo_overlapped
            # subJsonObject["clh"] = clh
            # ------------------------------------------------
            print "verification 3"
            print "bc_matched", bc_matched
            print "bl_matched", bl_matched
            print "bh_matched", bh_matched
            print "bf_matched", bf_matched
            print "bo_matched", bo_matched
            print "bc_overlapped", bc_overlapped
            print "bl_overlapped", bl_overlapped
            print "bh_overlapped", bh_overlapped
            print "bf_overlapped", bf_overlapped
            print "bo_overlapped", bo_overlapped
            jsonObjArray.append(subJsonObject)
        jsonObject["stats"] = jsonObjArray
        # print "*" * 20
        print "writing to : test_o_" + str(i) + ".json"
        # demjson.encode_to_file("./" + str(i) + ".json", encoding="utf-8")
        with open("test_o_" + str(i) + ".json", "w") as fp:
            json.dump(jsonObject, fp)
    print "done."
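# Hedged sketch of the pairwise comparison nerstat.find_overlaps is assumed to
# perform above: given two entity-info dicts, count exact matches (same text
# and span) and mere span overlaps. The name, signature and return shape here
# are illustrative only; the real function takes per-type index lists and may
# count per entity type rather than across all entities as this sketch does.
def count_overlaps_sketch(info_a, info_b):
    exact = 0
    overlapped = 0
    for a in range(len(info_a["entity"])):
        for b in range(len(info_b["entity"])):
            same_span = (info_a["startpos"][a] == info_b["startpos"][b]
                         and info_a["endpos"][a] == info_b["endpos"][b])
            intersects = (info_a["startpos"][a] < info_b["endpos"][b]
                          and info_b["startpos"][b] < info_a["endpos"][a])
            if info_a["entity"][a] == info_b["entity"][b] and same_span:
                exact += 1
            elif intersects:
                # spans intersect but the entities are not identical
                overlapped += 1
    # same result order as the find_overlaps call sites above
    return overlapped, exact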
# this variant was also defined as test1; it carries a distinct name here so
# it does not shadow the method above
def test3(self):
    # jsonObject = {}
    # jsonObjArray = []
    # create a phrase_id <--> doc_id dictionary so doc_id can be looked up
    # by phrase_id for boson NER
    myDocDic, myTextDic = setupDic()
    # number of input files
    for i in range(1):  # there is only 1 file for boson
        # for i in range(22):
        print i
        json_corenlp = demjson.decode_file(
            "./" + "corenlp_" + str(i) + ".json")
        json_ltp = demjson.decode_file("./" + "ltp_" + str(i) + ".json")
        json_hanlp = demjson.decode_file("./" + "hanlp_" + str(i) + ".json")
        json_fnlp = demjson.decode_file("./" + "fnlp_" + str(i) + ".json")
        json_fool = demjson.decode_file(
            "./" + "foolnltk_" + str(i) + ".json")
        # data arrays
        data_corenlp = json_corenlp["data"]
        data_ltp = json_ltp["data"]
        data_hanlp = json_hanlp["data"]
        data_fnlp = json_fnlp["data"]
        data_fool = json_fool["data"]
        number_of_phrase = len(data_corenlp)
        for j in range(100):
            # for j in range(number_of_phrase):
            # for j in range(3):
            # merge organization/person/location and collect the entities
            print "No. " + str(j)
            corenlp_entities_info = nerstat.find_entity_info(
                data_corenlp[j], "corenlp")
            ltp_entities_info = nerstat.find_entity_info(data_ltp[j], "ltp")
            hanlp_entities_info = nerstat.find_entity_info(
                data_hanlp[j], "hanlp")
            fnlp_entities_info = nerstat.find_entity_info(
                data_fnlp[j], "fnlp")
            fool_entities_info = nerstat.find_foolnltk_entity(data_fool[j])
            # ltp: 1, corenlp: 2, hanlp: 4; the sum tells which pair agrees
            # compare entity text, entity length and entity start position:
            # 1. first check whether the tools report the same entity count
            # 2. then compare in detail:
            #    2.1 PERSON count, and how many match
            #    2.2 LOCATION count, and how many match
            #    2.3 ORGANIZATION count, and how many match
            # handled with set operations: &, in, not in
            phrase_id = corenlp_entities_info["phrase_id"]
            # ----------------------------------------------------------
            doc_id = myDocDic[phrase_id]
            phrase = myTextDic[phrase_id]
            boson_entities_info = nerstat.find_boson_entity(
                doc_id, phrase_id)
            # ----------------------------------------------------------
            np_entity_corenlp = np.array(corenlp_entities_info["entity"])
            np_entity_ltp = np.array(ltp_entities_info["entity"])
            np_entity_hanlp = np.array(hanlp_entities_info["entity"])
            np_entity_fnlp = np.array(fnlp_entities_info["entity"])
            np_entity_fool = np.array(fool_entities_info["entity"])
            np_entity_boson = np.array(boson_entities_info["entity"])
            np_type_corenlp = np.array(corenlp_entities_info["entity_type"])
            np_type_ltp = np.array(ltp_entities_info["entity_type"])
            np_type_hanlp = np.array(hanlp_entities_info["entity_type"])
            np_type_fnlp = np.array(fnlp_entities_info["entity_type"])
            np_type_fool = np.array(fool_entities_info["entity_type"])
            np_type_boson = np.array(boson_entities_info["entity_type"])
            # phrase = "".join(np_entity_corenlp)
            #
            # # for the overlap of entities between corenlp, ltp and hanlp
            # np_startpos_corenlp = np.array(corenlp_entities_info["startpos"])
            # np_startpos_ltp = np.array(ltp_entities_info["startpos"])
            # np_startpos_hanlp = np.array(hanlp_entities_info["startpos"])
            # np_startpos_fnlp = np.array(fnlp_entities_info["startpos"])
            # np_startpos_fool = np.array(fool_entities_info["startpos"])
            # np_startpos_boson = np.array(boson_entities_info["startpos"])
            #
            # np_endpos_corenlp = np.array(corenlp_entities_info["endpos"])
            # np_endpos_ltp = np.array(ltp_entities_info["endpos"])
            # np_endpos_hanlp = np.array(hanlp_entities_info["endpos"])
            # np_endpos_fnlp = np.array(fnlp_entities_info["endpos"])
            # np_endpos_fool = np.array(fool_entities_info["endpos"])
            # np_endpos_boson = np.array(boson_entities_info["endpos"])
            #
            # np_entitylen_corenlp = np.array(corenlp_entities_info["entity_unicode_len"])
            # np_entitylen_ltp = np.array(ltp_entities_info["entity_unicode_len"])
            # np_entitylen_hanlp = np.array(hanlp_entities_info["entity_unicode_len"])
            # np_entitylen_fnlp = np.array(fnlp_entities_info["entity_unicode_len"])
            # np_entitylen_fool = np.array(fool_entities_info["entity_unicode_len"])
            # np_entitylen_boson = np.array(boson_entities_info["entity_unicode_len"])
            # subJsonObject = {}
            # subJsonObject["phrase_id"] = phrase_id
            #
            # count PERSON, LOCATION and ORGANIZATION entities per NER tool
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            c_person_indexes = find_all_index(np_type_corenlp, "PERSON")
            c_location_indexes = find_all_index(np_type_corenlp, "LOCATION")
            c_organization_indexes = find_all_index(
                np_type_corenlp, "ORGANIZATION")
            # for detecting the overlap of entities between corenlp, ltp
            # and hanlp
            c_person_list = np_entity_corenlp[c_person_indexes]
            c_location_list = np_entity_corenlp[c_location_indexes]
            c_organization_list = np_entity_corenlp[c_organization_indexes]
            # set versions:
            # c_person_set = set(np_entity_corenlp[c_person_indexes])
            # c_location_set = set(np_entity_corenlp[c_location_indexes])
            # c_organization_set = set(np_entity_corenlp[c_organization_indexes])
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            l_person_indexes = find_all_index(np_type_ltp, "PERSON")
            l_location_indexes = find_all_index(np_type_ltp, "LOCATION")
            l_organization_indexes = find_all_index(
                np_type_ltp, "ORGANIZATION")
            l_person_list = np_entity_ltp[l_person_indexes]
            l_location_list = np_entity_ltp[l_location_indexes]
            l_organization_list = np_entity_ltp[l_organization_indexes]
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            h_person_indexes = find_all_index(np_type_hanlp, "PERSON")
            h_location_indexes = find_all_index(np_type_hanlp, "LOCATION")
            h_organization_indexes = find_all_index(
                np_type_hanlp, "ORGANIZATION")
            h_person_list = np_entity_hanlp[h_person_indexes]
            h_location_list = np_entity_hanlp[h_location_indexes]
            h_organization_list = np_entity_hanlp[h_organization_indexes]
            # FNLP
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            f_person_indexes = find_all_index(np_type_fnlp, "PERSON")
            f_location_indexes = find_all_index(np_type_fnlp, "LOCATION")
            f_organization_indexes = find_all_index(
                np_type_fnlp, "ORGANIZATION")
            f_person_list = np_entity_fnlp[f_person_indexes]
            f_location_list = np_entity_fnlp[f_location_indexes]
            f_organization_list = np_entity_fnlp[f_organization_indexes]
            # foolnltk
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            o_person_indexes = find_all_index(np_type_fool, "PERSON")
            o_location_indexes = find_all_index(np_type_fool, "LOCATION")
            o_organization_indexes = find_all_index(
                np_type_fool, "ORGANIZATION")
            o_person_list = np_entity_fool[o_person_indexes]
            o_location_list = np_entity_fool[o_location_indexes]
            o_organization_list = np_entity_fool[o_organization_indexes]
            # boson_ner
            # for i in ["PERSON", "LOCATION", "ORGANIZATION"]:
            b_person_indexes = find_all_index(np_type_boson, "PERSON")
            b_location_indexes = find_all_index(np_type_boson, "LOCATION")
            b_organization_indexes = find_all_index(
                np_type_boson, "ORGANIZATION")
            b_person_list = np_entity_boson[b_person_indexes]
            b_location_list = np_entity_boson[b_location_indexes]
            b_organization_list = np_entity_boson[b_organization_indexes]
            print phrase
            print "corenlp"
            print "    " + "person:      ",
            for e in c_person_list:
                print e,
            print ""
            print "    " + "organization:",
            for e in c_organization_list:
                print e,
            print ""
            print "ltp"
            print "    " + "person:      ",
            for e in l_person_list:
                print e,
            print ""
            print "    " + "organization:",
            for e in l_organization_list:
                print e,
            print ""
            print "HanLP"
            print "    " + "person:      ",
            for e in h_person_list:
                print e,
            print ""
            print "    " + "organization:",
+ "organization:", for e in h_organization_list: print e, print "" print "FNLP" print " " + "person: ", for e in f_person_list: print e, print "" print " " + "organization:", for e in f_organization_list: print e, print "" print "foolNLTK" print " " + "person: ", for e in o_person_list: print e, print "" print " " + "organization:", for e in o_organization_list: print e, print "" try: print "BosonNER" print " " + "person: ", for e in b_person_list: print e, print "" print " " + "organization:", for e in b_organization_list: print e, print "" except UnicodeDecodeError, e: print e.message()