class Extract:
    """Mine candidate answers for (subject, relation) pairs from NER-tagged text.

    The ``test*`` drivers read tab-separated records from stdin, run the text
    through the ``Merge`` tagger held in ``self.m``, keep the lines that can
    contain an answer (``_process_data``) and tally the candidate answers
    (``statistics``).

    Helpers accept both byte strings and text and only use syntax valid on
    Python 2 and Python 3, so the block can be exercised on either.
    """

    def __init__(self, jieba=False):
        """Build the lookup tables and the ``Merge`` tagger.

        Args:
            jieba: stored on the instance as ``self.jieba``; not read by any
                method of this class.
        """
        import re
        # Matches one character in the U+4E00..U+9FA5 range (CJK ideographs).
        self.de = re.compile(u"[\u4e00-\u9fa5]")
        self.jieba = jieba
        # relation name -> (type registered for subject/answer words,
        #                   NER label an answer token must carry)
        self.relation = {
            u'fuqin': ('PERSON', 'PERSON'),
            u'erzi': ('PERSON', 'PERSON'),
            u'nver': ('PERSON', 'PERSON'),
            u'nvyou': ('PERSON', 'PERSON'),
            u'nanyou': ('PERSON', 'PERSON'),
            u'muqin': ('PERSON', 'PERSON'),
            u'emma': ('PERSON', 'PERSON'),
            u'zhangfu': ('PERSON', 'PERSON'),
            u'qizi': ('PERSON', 'PERSON'),
            u'\u5973\u53cb': ('PERSON', 'PERSON'),
            u'\u5973\u513f': ('PERSON', 'PERSON'),
            u'\u59bb\u5b50': ('PERSON', 'PERSON'),
            u'\u4e08\u592b': ('PERSON', 'PERSON'),
            u'\u524d\u592b': ('PERSON', 'PERSON'),
            u'\u7236\u4eb2': ('PERSON', 'PERSON'),
            u'\u8eab\u9ad8': ('PERSON', 'HEIGHT'),
            u'\u751f\u65e5': ('PERSON', 'DATE'),
            u'\u64ad\u51fa\u65f6\u95f4': ('FILM', 'TIME'),
            u'\u4e3b\u9898\u66f2': ('FILM', 'MUSIC'),
        }
        self.pos_tagger = self._build_pos_tagger()
        self.m = Merge(True, False)

    @staticmethod
    def _build_pos_tagger():
        """Return the POS-tag -> integer id table.

        The original dict literal listed every tag four times (ids 0-63,
        64-127, 128-191 and 192-255).  In a dict literal the last duplicate
        wins, so the effective ids were 192-255; that mapping is reproduced
        here without the dead entries.
        """
        tags = (
            'a', 'ad', 'ag', 'an', 'b', 'bg', 'c', 'd', 'df', 'dg', 'e', 'en',
            'f', 'g', 'h', 'i', 'in', 'j', 'jn', 'k', 'l', 'ln', 'm', 'mg',
            'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p',
            'q', 'qe', 'qg', 'r', 'rg', 'rr', 'rz', 's', 't', 'tg', 'u', 'ud',
            'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq',
            'w', 'x', 'y', 'yg', 'z', 'zg',
        )
        table = dict((tag, 192 + offset) for offset, tag in enumerate(tags))
        table['eng'] = 256
        return table

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings; return text unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _to_bytes(value):
        """Encode text as UTF-8; return byte strings unchanged."""
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    @staticmethod
    def _emit(text):
        """Print one line to stdout.

        On Python 2 (``str is bytes``) unicode text is encoded to UTF-8
        first, which is what the original ``print x.encode('utf-8')`` did.
        """
        if str is bytes and not isinstance(text, str):
            text = text.encode('utf-8')
        print(text)

    def _relation_types(self, tag):
        """Return the ``self.relation`` entry for ``tag`` (bytes or text).

        Raises:
            KeyError: ``tag`` is not a known relation.
        """
        return self.relation[self._to_text(tag)]

    def _parse_record(self, raw, text_index):
        """Split one stdin record and validate it.

        Args:
            raw: the raw tab-separated line.
            text_index: index of the field holding the sentence text.

        Returns:
            ``(fields, text, subject_type)`` for a usable record, or ``None``
            after printing a ``read wrong:`` diagnostic when the record is
            too short, has empty text, or names an unknown relation.
        """
        fields = raw.split('\t')
        text = fields[text_index].strip() if len(fields) > text_index else ''
        subject_type = None
        if text:
            try:
                subject_type = self._relation_types(fields[1])[0]
            except (KeyError, ValueError):
                # Unknown relation or undecodable tag: reported below.
                subject_type = None
        if subject_type is None:
            self._emit('read wrong:' + '\t'.join(fields))
            return None
        return fields, text, subject_type

    def _read_records(self, text_index):
        """Read all stdin records into four index-aligned lists.

        A record contributes to every list or to none, so ``lines[i]``,
        ``tags[i]``, ``newwords[i]`` and ``newwords2[i]`` always describe the
        same input record (``_process_data`` relies on that alignment).

        Returns:
            ``(lines, tags, newwords, newwords2)`` where ``newwords`` holds
            ``(field 0, subject type)`` and ``newwords2`` holds
            ``(field 2, subject type)``.
        """
        lines, tags, newwords, newwords2 = [], [], [], []
        for raw in sys.stdin:
            record = self._parse_record(raw, text_index)
            if record is None:
                continue
            fields, text, subject_type = record
            lines.append(text)
            tags.append(fields[1])
            newwords.append((fields[0], subject_type))
            newwords2.append((fields[2], subject_type))
        return lines, tags, newwords, newwords2

    def _process_data(self, lines, newwords, n2, tags=None):
        """Tag every line and keep those that can contain an answer.

        ``newwords`` (and ``n2`` when given) are registered with the tagger
        via ``self.m.add_new_words``; each line is then passed to
        ``self.m.ner_using_nlpc``, which yields tab-separated segment, POS
        and NER strings.

        A line is dropped when it has no occurrence of the relation's object
        NER label, or when it has exactly one and the subject and object
        types are equal (the single hit is then presumably the subject
        itself).  The original compared the object type with itself, which
        dropped every single-hit line regardless of the types.

        Args:
            lines: sentence texts.
            newwords: ``(word, type)`` tuples aligned with ``lines``.
            n2: optional extra ``(word, type)`` tuples for the tagger.
            tags: relation names aligned with ``lines``.

        Returns:
            ``(subjects, tags, segs, ners)`` for the kept lines; ``subjects``
            are decoded text, ``segs`` / ``ners`` are token lists.  Four
            empty lists when ``tags`` is ``None`` (nothing can be matched
            without a relation).
        """
        subjects = []
        kept_tags = []
        seg_rows = []
        ner_rows = []
        self.m.add_new_words(newwords)
        if n2 is not None:
            self.m.add_new_words(n2)
        if tags is None:
            return (subjects, kept_tags, seg_rows, ner_rows)
        for i, line in enumerate(lines):
            (line_seg, line_pos, line_ner) = self.m.ner_using_nlpc(line)
            tag = tags[i]
            subject_type, object_type = self._relation_types(tag)
            hits = line_ner.count(object_type)
            if hits == 0:
                continue
            if hits == 1 and subject_type == object_type:
                continue
            subjects.append(self._to_text(newwords[i][0]))
            kept_tags.append(tag)
            seg_rows.append(line_seg.split('\t'))
            ner_rows.append(line_ner.split('\t'))
        return (subjects, kept_tags, seg_rows, ner_rows)

    def statistics(self, newwords, tags, segs, ners):
        """Tally candidate answers and report the most frequent per subject.

        A token is a candidate when its NER label equals the relation's
        object type, it differs from the subject, it was not already taken
        from the same line, it is not all digits, and its length is > 1
        (counted in CJK characters when it has any, otherwise in characters).

        Each segmented line and each candidate is also printed to stdout.

        Args:
            newwords: subject per line (bytes or text).
            tags: relation name per line (bytes or text).
            segs: token list per line.
            ners: NER label list per line, parallel to ``segs``.

        Returns:
            Text rows ``subject\\ttag\\tanswer\\tcount\\tsource`` where
            ``count`` is ``not sure`` for a single occurrence and ``source``
            is the first line (tokens joined) that produced the answer.  For
            each subject/tag pair only the answers with the highest count
            are returned, in first-seen order.
        """
        tallies = collections.OrderedDict()
        for i, tag in enumerate(tags):
            subject = self._to_text(newwords[i])
            tag_text = self._to_text(tag)
            object_type = self._relation_types(tag)[1]
            tokens = [self._to_text(token) for token in segs[i]]
            source = u''.join(tokens)
            seen = set()
            self._emit(u' '.join(tokens))
            # zip() stops at the shorter list if tokens and labels disagree.
            for token, label in zip(tokens, ners[i]):
                if self._to_text(label) != object_type:
                    continue
                cjk_chars = len(self.de.findall(token))
                length = cjk_chars if cjk_chars else len(token)
                if (token == subject or token in seen or length <= 1
                        or token.isdigit()):
                    continue
                self._emit(subject + u',' + tag_text + u',' + token)
                seen.add(token)
                # Tuple key: plain concatenation could merge distinct triples.
                key = (subject, tag_text, token)
                if key in tallies:
                    tallies[key][0] += 1
                else:
                    tallies[key] = [1, source]

        # subject\ttag -> [[answer, count, source], ...] with maximal count
        best = collections.OrderedDict()
        for (subject, tag_text, token), (count, source) in tallies.items():
            pair = subject + u'\t' + tag_text
            ranked = best.get(pair)
            if ranked is None or count > ranked[0][1]:
                best[pair] = [[token, count, source]]
            elif count == ranked[0][1]:
                ranked.append([token, count, source])

        report = []
        for pair, ranked in best.items():
            for token, count, source in ranked:
                shown = u'not sure' if count == 1 else str(count)
                report.append(u'\t'.join((pair, token, shown, source)))
        return report

    def test3(self):
        """Process stdin records whose text is field 4 and print the report."""
        lines, tags, newwords, _ = self._read_records(4)
        (s, p, segs, ners) = self._process_data(lines, newwords, None,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)

    def test2(self):
        """Process stdin records whose text is field 5 and print the report.

        Unlike ``test3`` the answer words (field 2) are also registered with
        the tagger.
        """
        lines, tags, newwords, newwords2 = self._read_records(5)
        (s, p, segs, ners) = self._process_data(lines, newwords, newwords2,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)

    def _score_group(self, out, group, current):
        """Process one subject group, append its rows to ``out`` and score it.

        Args:
            out: binary file object receiving the results.
            group: dict with ``subject``, ``predicate``, ``answer`` and the
                aligned ``lines`` / ``tags`` / ``newwords`` / ``newwords2``.
            current: number of correct answers counted so far.

        Returns:
            ``current`` plus the number of reported rows whose answer equals
            the group's expected answer.
        """
        (s, p, segs, ners) = self._process_data(
            group['lines'], group['newwords'], group['newwords2'],
            tags=group['tags'])
        header = [group['subject'], group['predicate'], group['answer']]
        out.write(b'\t'.join(self._to_bytes(part) for part in header) + b'\n')
        expected = self._to_text(group['answer'])
        for row in self.statistics(s, p, segs, ners):
            self._emit(u'result' + row)
            out.write(self._to_bytes(row) + b'\n\n')
            if row.split(u'\t')[2] == expected:
                current += 1
                self._emit(str(current))
        return current

    def test1(self):
        """Evaluate stdin records grouped by consecutive subject (field 0).

        Text is field 4 and the expected answer is field 2.  Results are
        appended to ``result_fanhua_1`` followed by the number of groups and
        of correct answers.  The final group is scored too (the original
        dropped it) and the file is closed even when processing fails.
        """
        group = None
        total = 0
        current = 0
        with open('result_fanhua_1', 'ab') as wf:
            for raw in sys.stdin:
                record = self._parse_record(raw, 4)
                if record is None:
                    continue
                fields, text, subject_type = record
                if group is not None and group['subject'] != fields[0]:
                    current = self._score_group(wf, group, current)
                    total += 1
                    group = None
                if group is None:
                    group = {'subject': fields[0], 'lines': [], 'tags': [],
                             'newwords': [], 'newwords2': []}
                # As in the original, the last record of a group decides the
                # predicate and expected answer written for it.
                group['predicate'] = fields[1]
                group['answer'] = fields[2]
                group['lines'].append(text)
                group['tags'].append(fields[1])
                group['newwords'].append((fields[0], subject_type))
                group['newwords2'].append((fields[2], subject_type))
            if group is not None:
                current = self._score_group(wf, group, current)
                total += 1
            wf.write(self._to_bytes('all' + str(total)))
            wf.write(self._to_bytes('current' + str(current)))

    def test(self):
        """Process ``subject\\ttag \\ttext`` records from stdin.

        The first malformed record is printed and the program exits.  The
        original unpacked three values from ``_process_data`` (which returns
        four) and passed three arguments to ``statistics`` (which takes
        four), so it could never complete.
        """
        lines = []
        tags = []
        newwords = []
        for raw in sys.stdin:
            try:
                parts = raw.split(' \t')
                head = parts[0].split('\t')
                tag = head[1]
                subject_type = self._relation_types(tag)[0]
                text = parts[1].strip()
            except (IndexError, KeyError, ValueError):
                self._emit(raw)
                # sys.exit() instead of the interactive-only quit() helper.
                sys.exit()
            tags.append(tag)
            newwords.append((head[0], subject_type))
            lines.append(text)
        (s, p, segs, ners) = self._process_data(lines, newwords, None,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)
class Extract:
    """Mine candidate answers for (subject, relation) pairs from NER-tagged text.

    The ``test*`` drivers read tab-separated records from stdin, run the text
    through the ``Merge`` tagger held in ``self.m``, keep the lines that can
    contain an answer (``_process_data``) and tally the candidate answers
    (``statistics``).

    Helpers accept both byte strings and text and only use syntax valid on
    Python 2 and Python 3, so the block can be exercised on either.
    """

    def __init__(self, jieba=False):
        """Build the lookup tables and the ``Merge`` tagger.

        Args:
            jieba: stored on the instance as ``self.jieba``; not read by any
                method of this class.
        """
        import re
        # Matches one character in the U+4E00..U+9FA5 range (CJK ideographs).
        self.de = re.compile(u"[\u4e00-\u9fa5]")
        self.jieba = jieba
        # relation name -> (type registered for subject/answer words,
        #                   NER label an answer token must carry)
        self.relation = {
            u"fuqin": ("PERSON", "PERSON"),
            u"erzi": ("PERSON", "PERSON"),
            u"nver": ("PERSON", "PERSON"),
            u"nvyou": ("PERSON", "PERSON"),
            u"nanyou": ("PERSON", "PERSON"),
            u"muqin": ("PERSON", "PERSON"),
            u"emma": ("PERSON", "PERSON"),
            u"zhangfu": ("PERSON", "PERSON"),
            u"qizi": ("PERSON", "PERSON"),
            u"\u5973\u53cb": ("PERSON", "PERSON"),
            u"\u5973\u513f": ("PERSON", "PERSON"),
            u"\u59bb\u5b50": ("PERSON", "PERSON"),
            u"\u4e08\u592b": ("PERSON", "PERSON"),
            u"\u524d\u592b": ("PERSON", "PERSON"),
            u"\u7236\u4eb2": ("PERSON", "PERSON"),
            u"\u8eab\u9ad8": ("PERSON", "HEIGHT"),
            u"\u751f\u65e5": ("PERSON", "DATE"),
            u"\u64ad\u51fa\u65f6\u95f4": ("FILM", "TIME"),
            u"\u4e3b\u9898\u66f2": ("FILM", "MUSIC"),
        }
        self.pos_tagger = self._build_pos_tagger()
        self.m = Merge(True, False)

    @staticmethod
    def _build_pos_tagger():
        """Return the POS-tag -> integer id table.

        The original dict literal listed every tag four times (ids 0-63,
        64-127, 128-191 and 192-255).  In a dict literal the last duplicate
        wins, so the effective ids were 192-255; that mapping is reproduced
        here without the dead entries.
        """
        tags = (
            "a", "ad", "ag", "an", "b", "bg", "c", "d", "df", "dg", "e", "en",
            "f", "g", "h", "i", "in", "j", "jn", "k", "l", "ln", "m", "mg",
            "mq", "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "o", "p",
            "q", "qe", "qg", "r", "rg", "rr", "rz", "s", "t", "tg", "u", "ud",
            "ug", "uj", "ul", "uv", "uz", "v", "vd", "vg", "vi", "vn", "vq",
            "w", "x", "y", "yg", "z", "zg",
        )
        table = dict((tag, 192 + offset) for offset, tag in enumerate(tags))
        table["eng"] = 256
        return table

    @staticmethod
    def _to_text(value):
        """Decode UTF-8 byte strings; return text unchanged."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    @staticmethod
    def _to_bytes(value):
        """Encode text as UTF-8; return byte strings unchanged."""
        if isinstance(value, bytes):
            return value
        return value.encode("utf-8")

    @staticmethod
    def _emit(text):
        """Print one line to stdout.

        On Python 2 (``str is bytes``) unicode text is encoded to UTF-8
        first, which is what the original ``print x.encode("utf-8")`` did.
        """
        if str is bytes and not isinstance(text, str):
            text = text.encode("utf-8")
        print(text)

    def _relation_types(self, tag):
        """Return the ``self.relation`` entry for ``tag`` (bytes or text).

        Raises:
            KeyError: ``tag`` is not a known relation.
        """
        return self.relation[self._to_text(tag)]

    def _parse_record(self, raw, text_index):
        """Split one stdin record and validate it.

        Args:
            raw: the raw tab-separated line.
            text_index: index of the field holding the sentence text.

        Returns:
            ``(fields, text, subject_type)`` for a usable record, or ``None``
            after printing a ``read wrong:`` diagnostic when the record is
            too short, has empty text, or names an unknown relation.
        """
        fields = raw.split("\t")
        text = fields[text_index].strip() if len(fields) > text_index else ""
        subject_type = None
        if text:
            try:
                subject_type = self._relation_types(fields[1])[0]
            except (KeyError, ValueError):
                # Unknown relation or undecodable tag: reported below.
                subject_type = None
        if subject_type is None:
            self._emit("read wrong:" + "\t".join(fields))
            return None
        return fields, text, subject_type

    def _read_records(self, text_index):
        """Read all stdin records into four index-aligned lists.

        A record contributes to every list or to none, so ``lines[i]``,
        ``tags[i]``, ``newwords[i]`` and ``newwords2[i]`` always describe the
        same input record (``_process_data`` relies on that alignment).

        Returns:
            ``(lines, tags, newwords, newwords2)`` where ``newwords`` holds
            ``(field 0, subject type)`` and ``newwords2`` holds
            ``(field 2, subject type)``.
        """
        lines, tags, newwords, newwords2 = [], [], [], []
        for raw in sys.stdin:
            record = self._parse_record(raw, text_index)
            if record is None:
                continue
            fields, text, subject_type = record
            lines.append(text)
            tags.append(fields[1])
            newwords.append((fields[0], subject_type))
            newwords2.append((fields[2], subject_type))
        return lines, tags, newwords, newwords2

    def _process_data(self, lines, newwords, n2, tags=None):
        """Tag every line and keep those that can contain an answer.

        ``newwords`` (and ``n2`` when given) are registered with the tagger
        via ``self.m.add_new_words``; each line is then passed to
        ``self.m.ner_using_nlpc``, which yields tab-separated segment, POS
        and NER strings.

        A line is dropped when it has no occurrence of the relation's object
        NER label, or when it has exactly one and the subject and object
        types are equal (the single hit is then presumably the subject
        itself).  The original compared the object type with itself, which
        dropped every single-hit line regardless of the types.

        Args:
            lines: sentence texts.
            newwords: ``(word, type)`` tuples aligned with ``lines``.
            n2: optional extra ``(word, type)`` tuples for the tagger.
            tags: relation names aligned with ``lines``.

        Returns:
            ``(subjects, tags, segs, ners)`` for the kept lines; ``subjects``
            are decoded text, ``segs`` / ``ners`` are token lists.  Four
            empty lists when ``tags`` is ``None`` (nothing can be matched
            without a relation).
        """
        subjects = []
        kept_tags = []
        seg_rows = []
        ner_rows = []
        self.m.add_new_words(newwords)
        if n2 is not None:
            self.m.add_new_words(n2)
        if tags is None:
            return (subjects, kept_tags, seg_rows, ner_rows)
        for i, line in enumerate(lines):
            (line_seg, line_pos, line_ner) = self.m.ner_using_nlpc(line)
            tag = tags[i]
            subject_type, object_type = self._relation_types(tag)
            hits = line_ner.count(object_type)
            if hits == 0:
                continue
            if hits == 1 and subject_type == object_type:
                continue
            subjects.append(self._to_text(newwords[i][0]))
            kept_tags.append(tag)
            seg_rows.append(line_seg.split("\t"))
            ner_rows.append(line_ner.split("\t"))
        return (subjects, kept_tags, seg_rows, ner_rows)

    def statistics(self, newwords, tags, segs, ners):
        """Tally candidate answers and report the most frequent per subject.

        A token is a candidate when its NER label equals the relation's
        object type, it differs from the subject, it was not already taken
        from the same line, it is not all digits, and its length is > 1
        (counted in CJK characters when it has any, otherwise in characters).

        Each segmented line and each candidate is also printed to stdout.

        Args:
            newwords: subject per line (bytes or text).
            tags: relation name per line (bytes or text).
            segs: token list per line.
            ners: NER label list per line, parallel to ``segs``.

        Returns:
            Text rows ``subject\\ttag\\tanswer\\tcount\\tsource`` where
            ``count`` is ``not sure`` for a single occurrence and ``source``
            is the first line (tokens joined) that produced the answer.  For
            each subject/tag pair only the answers with the highest count
            are returned, in first-seen order.
        """
        tallies = collections.OrderedDict()
        for i, tag in enumerate(tags):
            subject = self._to_text(newwords[i])
            tag_text = self._to_text(tag)
            object_type = self._relation_types(tag)[1]
            tokens = [self._to_text(token) for token in segs[i]]
            source = u"".join(tokens)
            seen = set()
            self._emit(u" ".join(tokens))
            # zip() stops at the shorter list if tokens and labels disagree.
            for token, label in zip(tokens, ners[i]):
                if self._to_text(label) != object_type:
                    continue
                cjk_chars = len(self.de.findall(token))
                length = cjk_chars if cjk_chars else len(token)
                if (token == subject or token in seen or length <= 1
                        or token.isdigit()):
                    continue
                self._emit(subject + u"," + tag_text + u"," + token)
                seen.add(token)
                # Tuple key: plain concatenation could merge distinct triples.
                key = (subject, tag_text, token)
                if key in tallies:
                    tallies[key][0] += 1
                else:
                    tallies[key] = [1, source]

        # subject\ttag -> [[answer, count, source], ...] with maximal count
        best = collections.OrderedDict()
        for (subject, tag_text, token), (count, source) in tallies.items():
            pair = subject + u"\t" + tag_text
            ranked = best.get(pair)
            if ranked is None or count > ranked[0][1]:
                best[pair] = [[token, count, source]]
            elif count == ranked[0][1]:
                ranked.append([token, count, source])

        report = []
        for pair, ranked in best.items():
            for token, count, source in ranked:
                shown = u"not sure" if count == 1 else str(count)
                report.append(u"\t".join((pair, token, shown, source)))
        return report

    def test3(self):
        """Process stdin records whose text is field 4 and print the report."""
        lines, tags, newwords, _ = self._read_records(4)
        (s, p, segs, ners) = self._process_data(lines, newwords, None,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)

    def test2(self):
        """Process stdin records whose text is field 5 and print the report.

        Unlike ``test3`` the answer words (field 2) are also registered with
        the tagger.
        """
        lines, tags, newwords, newwords2 = self._read_records(5)
        (s, p, segs, ners) = self._process_data(lines, newwords, newwords2,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)

    def _score_group(self, out, group, current):
        """Process one subject group, append its rows to ``out`` and score it.

        Args:
            out: binary file object receiving the results.
            group: dict with ``subject``, ``predicate``, ``answer`` and the
                aligned ``lines`` / ``tags`` / ``newwords`` / ``newwords2``.
            current: number of correct answers counted so far.

        Returns:
            ``current`` plus the number of reported rows whose answer equals
            the group's expected answer.
        """
        (s, p, segs, ners) = self._process_data(
            group["lines"], group["newwords"], group["newwords2"],
            tags=group["tags"])
        header = [group["subject"], group["predicate"], group["answer"]]
        out.write(b"\t".join(self._to_bytes(part) for part in header) + b"\n")
        expected = self._to_text(group["answer"])
        for row in self.statistics(s, p, segs, ners):
            self._emit(u"result" + row)
            out.write(self._to_bytes(row) + b"\n\n")
            if row.split(u"\t")[2] == expected:
                current += 1
                self._emit(str(current))
        return current

    def test1(self):
        """Evaluate stdin records grouped by consecutive subject (field 0).

        Text is field 4 and the expected answer is field 2.  Results are
        appended to ``result_fanhua_1`` followed by the number of groups and
        of correct answers.  The final group is scored too (the original
        dropped it) and the file is closed even when processing fails.
        """
        group = None
        total = 0
        current = 0
        with open("result_fanhua_1", "ab") as wf:
            for raw in sys.stdin:
                record = self._parse_record(raw, 4)
                if record is None:
                    continue
                fields, text, subject_type = record
                if group is not None and group["subject"] != fields[0]:
                    current = self._score_group(wf, group, current)
                    total += 1
                    group = None
                if group is None:
                    group = {"subject": fields[0], "lines": [], "tags": [],
                             "newwords": [], "newwords2": []}
                # As in the original, the last record of a group decides the
                # predicate and expected answer written for it.
                group["predicate"] = fields[1]
                group["answer"] = fields[2]
                group["lines"].append(text)
                group["tags"].append(fields[1])
                group["newwords"].append((fields[0], subject_type))
                group["newwords2"].append((fields[2], subject_type))
            if group is not None:
                current = self._score_group(wf, group, current)
                total += 1
            wf.write(self._to_bytes("all" + str(total)))
            wf.write(self._to_bytes("current" + str(current)))

    def test(self):
        """Process ``subject\\ttag \\ttext`` records from stdin.

        The first malformed record is printed and the program exits.  The
        original unpacked three values from ``_process_data`` (which returns
        four) and passed three arguments to ``statistics`` (which takes
        four), so it could never complete.
        """
        lines = []
        tags = []
        newwords = []
        for raw in sys.stdin:
            try:
                parts = raw.split(" \t")
                head = parts[0].split("\t")
                tag = head[1]
                subject_type = self._relation_types(tag)[0]
                text = parts[1].strip()
            except (IndexError, KeyError, ValueError):
                self._emit(raw)
                # sys.exit() instead of the interactive-only quit() helper.
                sys.exit()
            tags.append(tag)
            newwords.append((head[0], subject_type))
            lines.append(text)
        (s, p, segs, ners) = self._process_data(lines, newwords, None,
                                                tags=tags)
        for row in self.statistics(s, p, segs, ners):
            self._emit(row)