コード例 #1
0
def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Clean the ``"person_name"`` entry of *name_dict* in place.

    The stored name is passed through ``psCleanup.decoder``, then
    ``rem_diacritics`` and ``stdize_case``. It is then wrapped in a
    one-element list for ``master_clean_regex`` (called with *dict_list*);
    the first element of that result is stripped of surrounding whitespace,
    run through ``psCleanup.encoder`` and written back under the same key.

    Returns the same dict object that was passed in.
    """
    decoded = psCleanup.decoder(name_dict["person_name"])
    normalized = psCleanup.stdize_case(psCleanup.rem_diacritics(decoded))
    # master_clean_regex is given a list, so wrap the single name and
    # take the first element of what comes back.
    cleaned = psCleanup.master_clean_regex([normalized], dict_list)[0].strip()
    name_dict["person_name"] = psCleanup.encoder(cleaned)
    return name_dict
コード例 #2
0
def clean_wrapper(name_dict, dict_list=regex_dicts):
    """Run the psCleanup pipeline over ``name_dict['person_name']``.

    Steps, in order: ``decoder``, ``rem_diacritics``, ``stdize_case``,
    ``master_clean_regex`` (on a single-item list, with *dict_list*),
    ``strip`` on the first result, and finally ``encoder``. The cleaned
    value replaces the original entry; the mutated *name_dict* is returned.
    """
    raw_name = psCleanup.decoder(name_dict['person_name'])
    without_marks = psCleanup.rem_diacritics(raw_name)
    cased = psCleanup.stdize_case(without_marks)
    regex_results = psCleanup.master_clean_regex([cased], dict_list)
    # Only one name went in, so only the first result is used.
    first_result = regex_results[0]
    name_dict['person_name'] = psCleanup.encoder(first_result.strip())
    return name_dict
コード例 #3
0
            # Build a new query by appending the current person_id to
            # subsequent_query, then open a fresh cursor on conn and run it
            # before moving on to the next loop iteration.
            # NOTE(review): the condition guarding this branch is outside this
            # view; the printed message suggests it handles a lost
            # connection -- confirm against the enclosing code.
            this_query = subsequent_query + str(person_id)
            print 're-establishing connection'
            print this_query
            conn_cursor = conn.cursor()
            conn_cursor.execute(this_query)
            continue
        # Start timing the processing of this row.
        t0 = time.time()
        person_id, person_name = row
        ## Check the name validity and make sure that
        ## we haven't seen the person_id before
        if len(person_name) > 1 and person_id not in pid_range:
            person_name = unicode(person_name)
            pid_range.append(person_id)
            if len(pid_range) > max_ids:
                # Drop the oldest remembered id so pid_range never holds
                # more than max_ids entries.
                pid_range.pop(0)
        else:
            # Names of length <= 1 and ids still present in pid_range are
            # skipped without being written.
            continue
        # NOTE(review): clean_wrapper is defined outside this fragment and is
        # called here with the name string and all_regex -- confirm the
        # version in scope accepts a string rather than a dict.
        clean_person_name = clean_wrapper(person_name, all_regex)
        clean_person_name = psCleanup.encoder(clean_person_name)
        writer.writerow([person_id, clean_person_name])
        # Accumulate the elapsed time for this row and count it.
        t1 = time.time()
        time_diff = t1 - t0
        total_time += time_diff
        N += 1
        # Every 1000 written rows, print the row count, the cumulative
        # processing time, and total_time / N.
        if N % 1000 == 0:
            print N, total_time, total_time / N


# Close the cursor first, then the connection it belongs to.
conn_cursor.close()
conn.close()