Ejemplo n.º 1
0
def analyse_data(entrys):
    """Print, for each named entry, the ids of later entries with a similar name.

    Each entry is read through ``entry[COMPANY_NAME]`` and ``entry[INFO_ID]``
    and both values are converted with ``str``.  An entry is ignored when its
    name is empty after stripping or is exactly the text ``"None"``.

    For every remaining entry one line is written to standard output:
    ``[name:id] id,`` followed by `` other_id,`` for each later, non-ignored
    entry whose name contains this name, is contained in it, or for which
    ``StrSim.get_sim`` returns more than 0.8.  A progress message is logged
    for processed entries whose index is a multiple of 10.

    Args:
        entrys: Sequence of records indexable by ``COMPANY_NAME`` and
            ``INFO_ID``.

    Returns:
        None. The groups are only written to standard output.
    """
    import sys

    # Convert every record once up front instead of once per compared pair.
    records = [(str(entry[COMPANY_NAME]), str(entry[INFO_ID]))
               for entry in entrys]
    usable = [name.strip() != "" and name != "None" for name, _ in records]
    total = len(records)

    # Writing through sys.stdout keeps the exact output of the former
    # trailing-comma print statements while staying valid on Python 2 and 3.
    emit = sys.stdout.write

    for i, (name_i, id_i) in enumerate(records):
        if not usable[i]:
            continue
        if i % 10 == 0:
            LOG.info("[%s] has been dealed!" % i)

        emit("[%s:%s] %s," % (name_i, id_i, id_i))
        # Only later entries are compared, so each pair is examined once.
        for j in range(i + 1, total):
            if not usable[j]:
                continue
            name_j, id_j = records[j]
            # Cheap substring tests run first; the similarity score is only
            # computed when neither name contains the other.
            if (name_j in name_i
                    or name_i in name_j
                    or StrSim.get_sim(name_i, name_j) > 0.8):
                emit(" %s," % id_j)
        emit("\n")
Ejemplo n.º 2
0
    def is_similar(self, columns, new, old):
        """Report whether ``new`` and ``old`` agree closely enough on every column.

        Columns are checked in order and the first mismatch ends the check.
        A column matches when the two values are equal, when one contains the
        other (``in`` test in either direction), or when ``StrSim.get_sim`` on
        their ``str`` forms is not below ``THRESHOLD``.  Unequal values where
        either side is ``None`` count as a mismatch.

        Args:
            columns: Iterable of keys looked up in both ``new`` and ``old``.
            new: Object indexable by those keys, holding the new values.
            old: Object indexable by those keys, holding the old values.

        Returns:
            True when no column mismatches (including when ``columns`` is
            empty), otherwise False.
        """
        for column in columns:
            fresh = new[column]
            stale = old[column]
            LOG.debug("Comparing[%s],new is [%s],old is [%s]" % (column, fresh, stale))

            if fresh == stale:
                continue
            # A missing value on one side only can never be similar.
            if fresh is None or stale is None:
                return False
            if fresh in stale or stale in fresh:
                continue
            # Fall back to the similarity score when neither contains the other.
            if StrSim.get_sim(str(fresh), str(stale)) < THRESHOLD:
                return False
        return True