def getSample(cur, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given MySQL table.

    Args:
        cur: Database cursor whose rows are dict-like (they are indexed by
            column name below).
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table whose largest id bounds the sample.

    Returns:
        A tuple of ``((id_1, record_1), (id_2, record_2))`` pairs, where
        each record is a ``dedupe.core.frozendict`` built from a row of
        ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled id has no matching row in the
            ``DONOR_SELECT`` result.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # dict.values() is a non-subscriptable view on Python 3, so materialise
    # it before indexing; the result is the same on Python 2.
    num_records = list(cur.fetchone().values())[0]

    # Dedupe expects the id column to contain unique, sequential
    # integers starting at 0 or 1
    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size,
                                      zero_indexed=False)

    # Index every selected row by its integer id for the lookups below.
    temp_d = {}
    cur.execute(DONOR_SELECT)
    for row in cur.fetchall():
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple(((record_id_1, temp_d[record_id_1]),
                  (record_id_2, temp_d[record_id_2]))
                 for record_id_1, record_id_2 in random_pairs)
def getSample(cur, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given MySQL table.

    Args:
        cur: Database cursor whose rows are dict-like (they are indexed by
            column name below).
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table whose largest id bounds the sample.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs, where each record is a
        ``dedupe.core.frozendict`` built from a row of ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled id has no matching row in the
            ``DONOR_SELECT`` result.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # Indexing dict.values() directly fails on Python 3 (it is a view);
    # going through list() works on both Python 2 and 3.
    max_id_row = cur.fetchone()
    num_records = list(max_id_row.values())[0]

    # Dedupe expects the id column to contain unique, sequential
    # integers starting at 0 or 1
    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size,
                                      zero_indexed=False)

    # Map each row's integer id to an immutable copy of the row.
    temp_d = {}
    cur.execute(DONOR_SELECT)
    for row in cur.fetchall():
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs)
def getSample(cur, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given MySQL table.

    Args:
        cur: Database cursor whose rows are dict-like and which can be
            iterated directly after ``execute``.
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table whose largest id bounds the sample.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs, where each record is a
        ``dedupe.core.frozendict`` built from a row of ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled id has no matching row in the
            ``DONOR_SELECT`` result.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # dict.values() is a non-subscriptable view on Python 3, so materialise
    # it before indexing; the result is the same on Python 2.
    num_records = list(cur.fetchone().values())[0]
    # Result is discarded; presumably this drains the result set so the
    # cursor can be reused for the next query -- confirm.
    cur.fetchall()

    random_pairs = dedupe.randomPairs(num_records, sample_size)
    # In-place shift of every sampled index by one; presumably converts
    # zero-based indices to one-based ids, and relies on randomPairs
    # returning an object that supports ``+=`` (e.g. an array) -- confirm.
    random_pairs += 1

    # Index every selected row by its integer id for the lookups below.
    temp_d = {}
    cur.execute(DONOR_SELECT)
    for row in cur:
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs)
def getSample(cur, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given MySQL table.

    Args:
        cur: Database cursor whose rows are dict-like and which can be
            iterated directly after ``execute``.
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table whose largest id bounds the sample.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs, where each record is a
        ``dedupe.core.frozendict`` built from a row of ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled id has no matching row in the
            ``DONOR_SELECT`` result.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # Indexing dict.values() directly fails on Python 3 (it is a view);
    # going through list() works on both Python 2 and 3.
    max_id_row = cur.fetchone()
    num_records = list(max_id_row.values())[0]
    # Return value unused; presumably consumes any remaining rows so the
    # cursor is free for the next execute() -- confirm.
    cur.fetchall()

    random_pairs = dedupe.randomPairs(num_records, sample_size)
    # Adds one to every sampled index in place; presumably maps zero-based
    # indices onto one-based ids, and needs randomPairs to return an object
    # supporting ``+=`` (e.g. an array) -- confirm.
    random_pairs += 1

    # Map each row's integer id to an immutable copy of the row.
    temp_d = {}
    cur.execute(DONOR_SELECT)
    for row in cur:
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs)
def getSample(con, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given PostgreSQL table.

    Args:
        con: Database connection; ``con.cursor()`` must yield cursors whose
            rows are dict-like.
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the id column whose maximum bounds the sample.
        table: Name of the table to query for the maximum id.

    Returns:
        A list of ``(record_1, record_2)`` pairs, where each record is a
        ``dedupe.frozendict`` built from a row of ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled index is not a valid row position in the
            ``DONOR_SELECT`` result.
    '''
    cur = con.cursor()
    try:
        cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
        # dict.values() is a non-subscriptable view on Python 3, so
        # materialise it before indexing; same result on Python 2.
        num_records = list(cur.fetchall()[0].values())[0]
    finally:
        # Close the cursor even if the query fails.
        cur.close()

    random_pairs = dedupe.randomPairs(num_records, sample_size)

    # Records are keyed by their position in the DONOR_SELECT result,
    # not by the value of the id column.
    temp_d = {}

    # Named cursor runs server side with psycopg2
    cur = con.cursor('donor_select')
    try:
        cur.execute(DONOR_SELECT)
        for i, row in enumerate(cur):
            temp_d[i] = dedupe.frozendict(row)
    finally:
        # Close the named cursor even if execution or iteration raises.
        cur.close()

    pair_sample = [(temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs]
    return pair_sample
def getSample(con, size):
    """
    Return a random sample of pairs of donors of a given size.

    Args:
        con: Database connection whose ``execute`` returns an iterator of
            rows; rows must support both positional and column-name
            indexing.
        size: Size of the sample; passed to ``dedupe.randomPairs``.

    Returns:
        A tuple of ``((id_1, row_1), (id_2, row_2))`` pairs.

    Raises:
        KeyError: If a sampled id is missing from the rows returned by the
            donor query.
    """
    # Use the next() builtin rather than the Python 2-only ``.next()``
    # method so the call works on both Python 2.6+ and Python 3.
    max_id_rows = con.execute("SELECT MAX(donor_id) FROM donors")
    dim = next(max_id_rows)[0]

    random_pairs = dedupe.randomPairs(dim, size, zero_indexed=False)

    # Fetch only the sampled donors; ids are inlined into the IN (...) list.
    all_ids = ', '.join(str(record_id)
                        for pair in random_pairs
                        for record_id in pair)
    query = donor_select + " WHERE donor_id IN (%s)" % all_ids

    temp_d = {}
    for row in con.execute(query):
        temp_d[row['donor_id']] = row

    return tuple(((record_id_1, temp_d[record_id_1]),
                  (record_id_2, temp_d[record_id_2]))
                 for record_id_1, record_id_2 in random_pairs)
def getSample(cur, sample_size, id_column, table):
    '''
    Return a random sample of record pairs from a given MySQL table.

    Args:
        cur: Database cursor whose rows are dict-like and which can be
            iterated directly after ``execute``.
        sample_size: Size of the sample; passed to ``dedupe.randomPairs``.
        id_column: Name of the id column whose maximum bounds the sample.
        table: Name of the table to query for the maximum id.

    Returns:
        A list of ``(record_1, record_2)`` pairs, where each record is a
        ``dedupe.frozendict`` built from a row of ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled index is not a valid row position in the
            ``DONOR_SELECT`` result.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # Indexing dict.values() directly fails on Python 3 (it is a view);
    # going through list() works on both Python 2 and 3.
    first_row = cur.fetchall()[0]
    num_records = list(first_row.values())[0]

    random_pairs = dedupe.randomPairs(num_records, sample_size)

    # Records are keyed by their position in the DONOR_SELECT result,
    # not by the value of the id column.
    temp_d = {}
    cur.execute(DONOR_SELECT)
    for i, row in enumerate(cur):
        temp_d[i] = dedupe.frozendict(row)

    return [(temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs]