def handle(school):
    """Resolve a raw school record to a school id and a department id.

    ``school`` is an indexable record: ``school[0]`` is passed through
    unchanged, ``school[1]`` is the school name and ``school[2]`` is the
    department name.

    The school is first looked up by exact name in ``SCHOOL_UNIVERSITY``.
    If no entry matches, the ten entries with the highest ``find_lcs_len``
    score (restricted to ids not above a ceiling) are rendered as a numbered
    menu and handed to ``getIndex``; the returned number selects a candidate
    (``getIndex`` is defined elsewhere; presumably it prompts the operator).

    The department is matched by ``find_lcs_len`` score against the
    departments of the resolved school when that school has an entry in
    ``SCHOOL_UNIVERSITY_DEPARTMENT_ID``, otherwise against every known
    department. Only scores above 3 qualify and the highest score wins.

    Returns:
        ``[school[0], school_id, dep_id]``; an id is ``0`` when nothing was
        matched or selected.
    """
    school_name = school[1]

    # Names that are purely alphabetic once spaces are removed search ids up
    # to 2900110; every other name only searches ids up to 34050.
    if school_name.replace(' ', '').isalpha():
        top = 2900110
    else:
        top = 34050

    exact = [k for k, v in SCHOOL_UNIVERSITY.iteritems() if v == school_name]
    if exact:
        school_id = exact[0]
    else:
        scored = [
            (key, find_lcs_len(value.encode('utf-8'),
                               school_name.encode('utf-8')))
            for key, value in SCHOOL_UNIVERSITY.iteritems()
            if key <= top
        ]
        candidates = sorted(scored, key=lambda pair: pair[1],
                            reverse=True)[:10]
        print('\n--------%s--------\n' % school_name)
        choice = getIndex('\n'.join(
            '选择:\t' + str(pos) + ' ' + SCHOOL_UNIVERSITY[key]
            for pos, (key, _) in enumerate(candidates)))
        # Fewer than ten candidates may exist, so validate the answer against
        # the actual list instead of a fixed upper bound of 10.
        if 0 <= choice < len(candidates):
            school_id = candidates[choice][0]
        else:
            school_id = 0
        if school_id:
            print('\n\n++++++%s++++++++' % SCHOOL_UNIVERSITY[school_id])

    dep_name = school[2]
    dep_id = 0
    if dep_name.replace(' ', ''):
        if (school_id and isinstance(school_id, int)
                and school_id in SCHOOL_UNIVERSITY_DEPARTMENT_ID):
            # Restrict the search to the departments of the resolved school.
            dep_names = dict(
                (dep, SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME[dep])
                for dep in SCHOOL_UNIVERSITY_DEPARTMENT_ID[school_id])
        else:
            dep_names = SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME

        scored_deps = []
        for key, value in dep_names.iteritems():
            score = find_lcs_len(dep_name.encode('utf-8'),
                                 value.encode('utf-8'))
            if score > 3:
                scored_deps.append((key, score))

        if scored_deps:
            # Take the strongest match, mirroring the descending ranking used
            # for schools above.
            dep_id = max(scored_deps, key=lambda pair: pair[1])[0]
            print('%s %s' % (dep_name,
                             SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME[dep_id]))

    return [school[0], school_id, dep_id]
def handleSchool(school):
    """Map a record to ids using exact name matches only.

    ``school[1]`` is compared against the values of ``SCHOOL_UNIVERSITY``
    and ``school[2]`` against the values of
    ``SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME``.

    Returns:
        ``(school[0], school_id, dep_id, school[3], school[4])`` when both
        names have an exact match. Otherwise the record is appended to
        ``MANUAL_LIST`` and ``None`` is returned.
    """
    school_matches = []
    for key, value in SCHOOL_UNIVERSITY.iteritems():
        if value == school[1]:
            school_matches.append(key)

    dep_matches = []
    for key, value in SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME.iteritems():
        if value == school[2]:
            dep_matches.append(key)

    # Anything that cannot be resolved on both sides is queued for manual
    # handling.
    if not (school_matches and dep_matches):
        MANUAL_LIST.append(school)
        return None

    return (school[0], school_matches[0], dep_matches[0],
            school[3], school[4])
def handle(school):
    """Resolve a raw school record to a school id and a department id.

    ``school`` is an indexable record: ``school[0]`` is passed through
    unchanged, ``school[1]`` is the school name and ``school[2]`` is the
    department name.

    The school is first looked up by exact name in ``SCHOOL_UNIVERSITY``.
    If no entry matches, the ten entries with the highest ``find_lcs_len``
    score (restricted to ids not above a ceiling) are rendered as a numbered
    menu and handed to ``getIndex``; the returned number selects a candidate
    (``getIndex`` is defined elsewhere; presumably it prompts the operator).

    The department is matched by ``find_lcs_len`` score against the
    departments of the resolved school when that school has an entry in
    ``SCHOOL_UNIVERSITY_DEPARTMENT_ID``, otherwise against every known
    department. Only scores above 3 qualify and the highest score wins.

    Returns:
        ``[school[0], school_id, dep_id]``; an id is ``0`` when nothing was
        matched or selected.
    """
    school_name = school[1]

    # Names that are purely alphabetic once spaces are removed search ids up
    # to 2900110; every other name only searches ids up to 34050.
    if school_name.replace(' ', '').isalpha():
        top = 2900110
    else:
        top = 34050

    exact = [k for k, v in SCHOOL_UNIVERSITY.iteritems() if v == school_name]
    if exact:
        school_id = exact[0]
    else:
        scored = [
            (key, find_lcs_len(value.encode('utf-8'),
                               school_name.encode('utf-8')))
            for key, value in SCHOOL_UNIVERSITY.iteritems()
            if key <= top
        ]
        candidates = sorted(scored, key=lambda pair: pair[1],
                            reverse=True)[:10]
        print('\n--------%s--------\n' % school_name)
        choice = getIndex('\n'.join(
            '选择:\t' + str(pos) + ' ' + SCHOOL_UNIVERSITY[key]
            for pos, (key, _) in enumerate(candidates)))
        # Fewer than ten candidates may exist, so validate the answer against
        # the actual list instead of a fixed upper bound of 10.
        if 0 <= choice < len(candidates):
            school_id = candidates[choice][0]
        else:
            school_id = 0
        if school_id:
            print('\n\n++++++%s++++++++' % SCHOOL_UNIVERSITY[school_id])

    dep_name = school[2]
    dep_id = 0
    if dep_name.replace(' ', ''):
        if (school_id and isinstance(school_id, int)
                and school_id in SCHOOL_UNIVERSITY_DEPARTMENT_ID):
            # Restrict the search to the departments of the resolved school.
            dep_names = dict(
                (dep, SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME[dep])
                for dep in SCHOOL_UNIVERSITY_DEPARTMENT_ID[school_id])
        else:
            dep_names = SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME

        scored_deps = []
        for key, value in dep_names.iteritems():
            score = find_lcs_len(dep_name.encode('utf-8'),
                                 value.encode('utf-8'))
            if score > 3:
                scored_deps.append((key, score))

        if scored_deps:
            # Take the strongest match, mirroring the descending ranking used
            # for schools above.
            dep_id = max(scored_deps, key=lambda pair: pair[1])[0]
            print('%s %s' % (dep_name,
                             SCHOOL_UNIVERSITY_DEPARTMENT_ID2NAME[dep_id]))

    return [school[0], school_id, dep_id]
name = name.replace('大学', '大').replace('科学技术', '科').replace( '中国', '中').replace('师范', '师').replace('科技', '科').replace('交通', '交').replace( '财经', '财').replace('工业', '工').replace('北京', '北').replace( '科学', '科').replace('农业', '农').decode('utf-8') if name.endswith(u"大") and len(name) > 2: name = name[:-1] return name with open('to_be_verified', 'r') as to_be_verifyed: data = loads(to_be_verifyed.read()) #shuffle(data) _SCHOOL_UNIVERSITY = dict( (replace_name(v), k) for k, v in SCHOOL_UNIVERSITY.iteritems()) f = open('out2.txt', 'w') err = open('logging2', 'w') fcount = 0 for pos, i in enumerate(data): _name = i[1] name = replace_name(_name) print name if not name: continue c = [] maxlen = 0 for j, id in _SCHOOL_UNIVERSITY.iteritems(): if len(set(name) & set(j)) >= 2:
from random import shuffle def replace_name(name): name = ftoj(name.decode('utf-8')) if type(name) is unicode: name = name.encode('utf-8') name = name.replace('大学', '大').replace('科学技术', '科').replace('中国', '中').replace('师范', '师').replace('科技', '科').replace('交通', '交').replace('财经', '财').replace('工业', '工').replace('北京', '北').replace('科学', '科').replace('农业', '农').decode('utf-8') if name.endswith(u"大") and len(name) > 2: name = name[:-1] return name with open('to_be_verified', 'r') as to_be_verifyed: data = loads(to_be_verifyed.read()) #shuffle(data) _SCHOOL_UNIVERSITY = dict((replace_name(v), k) for k, v in SCHOOL_UNIVERSITY.iteritems()) f = open('out2.txt', 'w') err = open('logging2','w') fcount = 0 for pos, i in enumerate(data): _name = i[1] name = replace_name(_name) print name if not name: continue c = [] maxlen = 0 for j, id in _SCHOOL_UNIVERSITY.iteritems():