def find_components(con, cur, thresh=0.85, store=True):
    """Group ids from the ``pair`` table into components of similar names.

    Every row of ``pair`` is read as ``(o1, o2, n1, n2)``.  Both names are
    passed through ``name_standardize_strong`` (cached per id), and the two
    ids are linked whenever the normalized Levenshtein similarity of the
    standardized names is strictly greater than ``thresh``.  The connected
    components of the resulting graph are ordered from largest to smallest.
    Ids that take part in no linked pair do not appear in any component.

    Args:
        con: Database connection; only used to commit when ``store`` is true.
        cur: Cursor used to read ``pair`` and, when storing, to write
            ``component``.
        thresh: Similarity cutoff; a pair is linked only above this value.
        store: When true, drop and recreate the ``component`` table, fill it
            with ``(compid, ownerid)`` rows and commit.  When false, nothing
            is written.

    Returns:
        ``None`` when ``store`` is true; otherwise a list with one entry per
        component, each entry being the list of standardized names of the
        ids in that component.
    """
    print('finding firm components')

    def similarity(name1, name2):
        """Return 1 - (edit distance / longer length), or 0.0 if unusable."""
        longest = max(len(name1), len(name2))
        # The search bound is derived from `thresh`, so the distance helper
        # is never asked to look further than the cutoff requires.
        max_dist = int(ceil(longest * (1.0 - thresh)))
        ldist = levenshtein(name1, name2, max_dist=max_dist)
        # -1 is treated as "no usable distance" (presumably max_dist was
        # exceeded -- confirm against levenshtein); two empty names also
        # score 0.0 instead of dividing by zero.
        if ldist == -1 or longest == 0:
            return 0.0
        return 1.0 - float(ldist) / longest

    # Standardized name per id, filled the first time an id is seen.
    name_std = {}

    def standardized(owner, raw_name):
        """Return the cached standardized name for `owner`, computing it once."""
        try:
            return name_std[owner]
        except KeyError:
            std = name_std[owner] = name_standardize_strong(raw_name)
            return std

    # Only pairs above the cutoff are kept; per-pair distances are not
    # retained because nothing downstream reads them.
    close = []
    for (o1, o2, n1, n2) in cur.execute('select * from pair'):
        if similarity(standardized(o1, n1), standardized(o2, n2)) > thresh:
            close.append((o1, o2))

    graph = nx.Graph()
    graph.add_edges_from(close)
    comps = sorted(nx.connected_components(graph), key=len, reverse=True)

    if not store:
        return [[name_std[owner] for owner in comp] for comp in comps]

    cur.execute('drop table if exists component')
    cur.execute('create table component (compid int, ownerid int)')
    # Stream (compid, ownerid) rows lazily instead of building them all first.
    cur.executemany(
        'insert into component values (?,?)',
        ((cid, owner) for cid, comp in enumerate(comps) for owner in comp))
    con.commit()
    return None
def find_components(con, cur, thresh=0.85, store=False):
    """Group ids from the ``pair`` table into components of similar names.

    Every row of ``pair`` is read as ``(o1, o2, n1, n2)``.  Both names are
    passed through ``name_standardize_strong`` (cached per id), and the two
    ids are linked whenever the normalized Levenshtein similarity of the
    standardized names is strictly greater than ``thresh``.  The connected
    components of the resulting graph are ordered from largest to smallest.
    Ids that take part in no linked pair do not appear in any component.

    Args:
        con: Database connection; only used to commit when ``store`` is true.
        cur: Cursor used to read ``pair`` and, when storing, to write
            ``component``.
        thresh: Similarity cutoff; a pair is linked only above this value.
        store: When true, drop and recreate the ``component`` table, fill it
            with ``(compid, ownerid)`` rows and commit.  When false (the
            default), nothing is written.

    Returns:
        ``None`` when ``store`` is true; otherwise a list with one entry per
        component, each entry being the list of standardized names of the
        ids in that component.
    """

    def similarity(name1, name2):
        """Return 1 - (edit distance / longer length), or 0.0 if unusable."""
        longest = max(len(name1), len(name2))
        # The search bound is derived from `thresh`, so the distance helper
        # is never asked to look further than the cutoff requires.
        max_dist = int(ceil(longest * (1.0 - thresh)))
        ldist = levenshtein(name1, name2, max_dist=max_dist)
        # -1 is treated as "no usable distance" (presumably max_dist was
        # exceeded -- confirm against levenshtein); two empty names also
        # score 0.0 instead of dividing by zero.
        if ldist == -1 or longest == 0:
            return 0.0
        return 1.0 - float(ldist) / longest

    # Standardized name per id, filled the first time an id is seen.
    name_std = {}

    def standardized(owner, raw_name):
        """Return the cached standardized name for `owner`, computing it once."""
        try:
            return name_std[owner]
        except KeyError:
            std = name_std[owner] = name_standardize_strong(raw_name)
            return std

    # Only pairs above the cutoff are kept; per-pair distances are not
    # retained because nothing downstream reads them.
    close = []
    for (o1, o2, n1, n2) in cur.execute('select * from pair'):
        if similarity(standardized(o1, n1), standardized(o2, n2)) > thresh:
            close.append((o1, o2))

    graph = nx.Graph()
    graph.add_edges_from(close)
    comps = sorted(nx.connected_components(graph), key=len, reverse=True)

    if not store:
        return [[name_std[owner] for owner in comp] for comp in comps]

    cur.execute('drop table if exists component')
    cur.execute('create table component (compid int, ownerid int)')
    # Stream (compid, ownerid) rows lazily instead of building them all first.
    cur.executemany(
        'insert into component values (?,?)',
        ((cid, owner) for cid, comp in enumerate(comps) for owner in comp))
    con.commit()
    return None
# Open the database named by the command-line arguments.
con = sqlite3.connect(args.db)
cur = con.cursor()

# create table (rebuilt from scratch on every run)
cur.execute('drop table if exists assign_use')
cur.execute(
    'create table assign_use (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)'
)
chunker = ChunkInserter(con, table='assign_use')

# Running totals: rows flagged as assignor/assignee matches, and rows seen.
match_num = rnum = 0

for row in cur.execute('select * from assign'):
    # NOTE(review): if `assign` shares the column order of `assign_use`,
    # index 5 is the assignor and index 6 the assignee, i.e. these two names
    # are swapped -- confirm against the `assign` schema.
    assignee, assignor = row[5], row[6]
    assignor_toks = name_standardize_strong(assignor)
    assignee_toks = name_standardize_strong(assignee)

    # Share of assignor tokens also found in the assignee, relative to the
    # mean token count of the two names (floored at 1 to avoid dividing by 0).
    shared = sum(1 for tok in assignor_toks if tok in assignee_toks)
    mean_len = 0.5 * (len(assignor_toks) + len(assignee_toks))
    word_match = shared / max(1.0, mean_len)
    match = word_match > 0.5

    # NOTE(review): every row is copied regardless of `match`; the flag only
    # feeds the `match_num` counter here -- confirm that is intended.
    chunker.insert(*row)
    match_num += match
    rnum += 1
# create table cur_ins.execute('create table assignment_use (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)') cmd_ins = 'insert into assignment_use values (?,?,?,?,?,?,?,?,?)' # batch insertion batch_size = 10000 assignments = [] rlim = sys.maxsize match_num = 0 rnum = 0 for row in cur.execute('select rowid,* from assignment'): (assignee,assignor) = (row[5],row[6]) assignor_toks = name_standardize_strong(assignor) assignee_toks = name_standardize_strong(assignee) word_match = 0 for tok in assignor_toks: if tok in assignee_toks: word_match += 1 word_match /= max(1.0,0.5*(len(assignor_toks)+len(assignee_toks))) match = word_match > 0.5 # if match: # print('{:7}-{:7}, {:4.2}-{}: {:40.40} -> {:40.40}'.format(rowid,patnum,word_match,int(match),assignor,assignee)) if store: assignments.append(row)