def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Clean the ``person_name`` entry of ``name_dict`` in place.

    The stored value is run through ``psCleanup.decoder``,
    ``psCleanup.rem_diacritics``, ``psCleanup.stdize_case`` and
    ``psCleanup.master_clean_regex`` (in that order), stripped of
    surrounding whitespace, passed through ``psCleanup.encoder`` and written
    back under the same key.

    Args:
        name_dict: Mapping holding a ``person_name`` entry. It is mutated.
        dict_list: Forwarded as the second argument of
            ``psCleanup.master_clean_regex``; defaults to the module-level
            ``regex_dicts`` (presumably regex replacement tables -- confirm
            against psCleanup).

    Returns:
        The same ``name_dict`` object that was passed in.
    """
    decoded = psCleanup.decoder(name_dict["person_name"])
    normalized = psCleanup.stdize_case(psCleanup.rem_diacritics(decoded))
    # master_clean_regex is handed a one-item list; only the first item of
    # its result is used.
    cleaned = psCleanup.master_clean_regex([normalized], dict_list)[0]
    name_dict["person_name"] = psCleanup.encoder(cleaned.strip())
    return name_dict
def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Replace ``name_dict['person_name']`` with its cleaned form.

    Processing order: ``psCleanup.decoder`` -> ``psCleanup.rem_diacritics``
    -> ``psCleanup.stdize_case`` -> ``psCleanup.master_clean_regex`` (called
    with a single-element list and ``dict_list``) -> ``strip()`` ->
    ``psCleanup.encoder``.

    Args:
        name_dict: Mapping with a ``person_name`` key; updated in place.
        dict_list: Second argument for ``psCleanup.master_clean_regex``.
            Defaults to the module-level ``regex_dicts``.

    Returns:
        ``name_dict`` itself (not a copy), after the update.
    """
    key = 'person_name'
    text = psCleanup.decoder(name_dict[key])

    # Single-argument normalisation helpers, applied one after the other.
    for normalise in (psCleanup.rem_diacritics, psCleanup.stdize_case):
        text = normalise(text)

    # The regex cleaner works on a list, so wrap the name and unwrap the
    # first result afterwards.
    results = psCleanup.master_clean_regex([text], dict_list)
    stripped = results[0].strip()

    name_dict[key] = psCleanup.encoder(stripped)
    return name_dict
this_query = subsequent_query + str(person_id) print 're-establishing connection' print this_query conn_cursor = conn.cursor() conn_cursor.execute(this_query) continue t0 = time.time() person_id, person_name = row ## Check the name validity and make sure that ## we haven't seen the person_id before if len(person_name) > 1 and person_id not in pid_range: person_name = unicode(person_name) pid_range.append(person_id) if len(pid_range) > max_ids: pid_range.pop(0) else: continue clean_person_name = clean_wrapper(person_name, all_regex) clean_person_name = psCleanup.encoder(clean_person_name) writer.writerow([person_id, clean_person_name]) t1 = time.time() time_diff = t1 - t0 total_time += time_diff N += 1 if N % 1000 == 0: print N, total_time, total_time / N conn_cursor.close() conn.close()