def analyse_data(entrys, threshold=0.8):
    """Print, for each named entry, the ids of later entries with a similar name.

    One line is written to stdout per entry whose company name is usable,
    in the form ``[name:id] id, other_id, other_id,``. The entry's own id
    comes first, followed by the id of every *later* entry whose name is
    considered similar.

    Two names are considered similar when either one is a substring of the
    other, or when ``StrSim.get_sim`` scores the pair above ``threshold``.

    Entries whose name is blank (empty or whitespace only) or stringifies to
    ``"None"`` are skipped, both as the reference entry and as a candidate.

    Args:
        entrys: Indexable sequence of records; each record is indexed with
            ``COMPANY_NAME`` and ``INFO_ID``.
        threshold: Score that ``StrSim.get_sim`` must exceed for two names
            to count as similar. Defaults to 0.8, the previously hard-coded
            value.

    Returns:
        None. Results are written to stdout.
    """
    # Local import keeps this block self-contained; sys.stdout.write behaves
    # the same on Python 2 and Python 3, unlike the print statement.
    import sys

    # Stringify every record once up front instead of repeating the lookup
    # and conversion on each pass of the quadratic comparison below.
    records = []
    for entry in entrys:
        name = str(entry[COMPANY_NAME])
        usable = name.strip() != "" and name != "None"
        records.append((name, str(entry[INFO_ID]), usable))

    for index, (name_i, id_i, usable_i) in enumerate(records):
        if not usable_i:
            continue
        # Progress marker; only emitted for usable entries at multiples of 10.
        if index % 10 == 0:
            LOG.info("[%s] has been dealed!" % index)

        parts = ["[%s:%s]" % (name_i, id_i), id_i + ","]
        # Only look at later entries so each pair is compared once.
        for name_j, id_j, usable_j in records[index + 1:]:
            if not usable_j:
                continue
            # Cheap substring checks first; the similarity score is only
            # computed when neither name contains the other.
            if (name_j in name_i
                    or name_i in name_j
                    or StrSim.get_sim(name_i, name_j) > threshold):
                parts.append(id_j + ",")

        sys.stdout.write(" ".join(parts) + "\n")
def is_similar(self, columns, new, old):
    """Return True when ``new`` and ``old`` match on every listed column.

    For each key in ``columns`` the two values are accepted as matching when
    any of the following holds, checked in this order:

    1. they compare equal;
    2. one is contained in the other (``in`` test in either direction);
    3. ``StrSim.get_sim`` on their ``str()`` forms is not below
       ``THRESHOLD``.

    If the values differ and exactly one of them is ``None`` the records are
    not similar. The first column that fails ends the comparison.

    Args:
        columns: Iterable of keys to compare.
        new: Mapping (or other key-indexable object) holding the new values.
        old: Mapping (or other key-indexable object) holding the old values.

    Returns:
        bool: True if every column matches, or if ``columns`` is empty;
        False otherwise.
    """
    for key in columns:
        new_val = new[key]
        old_val = old[key]
        LOG.debug("Comparing[%s],new is [%s],old is [%s]"
                  % (key, new_val, old_val))

        if new_val == old_val:
            continue
        if new_val is None or old_val is None:
            return False

        # Values that do not support ``in`` (e.g. two unequal numbers) used
        # to raise TypeError here; treat that as "no containment" and let
        # the string-similarity check below decide instead.
        try:
            contained = new_val in old_val or old_val in new_val
        except TypeError:
            contained = False
        if contained:
            continue

        if StrSim.get_sim(str(new_val), str(old_val)) < THRESHOLD:
            return False
    return True