def name_clean_wrapper(name_list,
                       clean_regex=name_address_regex,
                       legal_regex=legal_regex):
    """
    Decodes, strips diacritics, standardizes case, and regex-cleans
    the input names, then splits out the legal identifiers.

    Args:
        name_list: raw name data to be cleaned
        clean_regex: regexes for name/address cleanup
        legal_regex: regexes for legal-form identifiers

    Returns:
        The cleaned names paired with their extracted legal ids,
        as returned by psCleanup.get_legal_ids.
    """
    name_string = psCleanup.decoder(name_list)
    name_string = psCleanup.remove_diacritics(name_string)
    name_string = psCleanup.stdize_case(name_string)
    name_string = psCleanup.master_clean_regex(name_string, clean_regex)
    names_ids = psCleanup.get_legal_ids(name_string, legal_regex)
    return names_ids
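## Illustrative usage sketch, kept disabled like the other example
## blocks in this script. Assumes psCleanup.decoder accepts a single
## raw string and that name_address_regex and legal_regex are defined
## at module level; the sample input is hypothetical.
# raw_name = 'Siemens Aktiengesellschaft'
# clean_name, legal_ids = name_clean_wrapper(raw_name)
# # clean_name: the standardized name string
# # legal_ids: legal-form identifiers split out by get_legal_ids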
def tuple_clean(query_output):
    """
    Cleans, formats, and outputs the query data.

    Collects summary statistics per country: number of records,
    number of nonblank address lines, and average number of
    coauthors and IPC codes per country.

    Args:
        query_output: tuple of unformatted person_appln tuples

    Returns:
        Files of cleaned person_appln rows, written out by country.
        File of summary statistics, written out one row per country.
    """
    auth_patent_n = len(query_output)
    addresses_n = 0
    coauths = list()
    ipc = list()

    name_clean_time = time.time()

    ## Batch-clean the names, addresses, and coauthor strings up front,
    ## then split the legal identifiers out of the cleaned names
    names = [q[2] for q in query_output]
    names = psCleanup.name_clean(names, psCleanup.name_address_dict_list)
    names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex)
                 for n in names]

    addresses = [q[3] for q in query_output]
    addresses = psCleanup.name_clean(addresses,
                                     psCleanup.name_address_dict_list)

    coauthors = [q[5] for q in query_output]
    coauthors = psCleanup.name_clean(coauthors, psCleanup.coauth_dict_list)

    name_clean_finish = time.time()
    print 'Cleaning time per name + address'
    print (name_clean_finish - name_clean_time) / float(len(names))

    for idx, record in enumerate(query_output):
        clean_time_start = time.time()

        ## Unpack the tuple; the raw coauth field is unused below in
        ## favor of the batch-cleaned coauthors[idx]
        appln_id, person_id, person_name, person_address, \
            person_ctry_code, coauth, ipc_codes = record

        ## Separate out the coauthors and IPC codes for cleaning
        coauthors_split = coauthors[idx].split('**')
        ipc_split = ipc_codes.split('**')

        ## Drop the co-author that is this author
        clean_coauthors = [name for name in coauthors_split
                           if name != person_name]

        ## Generate some summary statistics
        addresses_n += len(person_address) > 0
        coauths.append(len(clean_coauthors))
        ipc.append(len(ipc_split))

        appln_id = str(appln_id)
        person_id = str(person_id)

        ## Look up the cleaned name and its legal identifiers, clean
        ## the IPC codes, then apply get_max to each multi-valued field
        clean_name, firm_legal_ids = names_ids[idx]
        clean_ipcs = psCleanup.ipc_clean(ipc_split)

        coauthors_final = psCleanup.get_max(clean_coauthors)
        ipc_codes_final = psCleanup.get_max(clean_ipcs)
        legal_ids_final = psCleanup.get_max([firm_legal_ids])

        clean_time_end = time.time()
        print appln_id, person_id, clean_name, legal_ids_final, \
            addresses[idx], person_ctry_code, coauthors_final, \
            ipc_codes_final
        print 'Record clean time:'
        print clean_time_end - clean_time_start

        ## Per-country file output, currently disabled; requires
        ## outpathname and year to be in scope if re-enabled
        # filename = outpathname + record[4] + '_out'
        # with open(filename, 'a') as tabfile:
        #     cleanwriter = csv.writer(tabfile, delimiter='\t')
        #     cleanwriter.writerow([appln_id,
        #                           person_id,
        #                           clean_name,
        #                           addresses[idx],
        #                           legal_ids_final,
        #                           person_ctry_code,
        #                           coauthors_final,
        #                           ipc_codes_final,
        #                           year
        #                           ])

    ## Summary statistics output, currently disabled
    # coauth_mean = numpy.mean(coauths)
    # ipc_mean = numpy.mean(ipc)
    # with open(outpathname + 'summary_stats', 'a') as csvfile:
    #     statswriter = csv.writer(csvfile)
    #     statswriter.writerow([year, auth_patent_n, addresses_n,
    #                           coauth_mean, ipc_mean])

    return None
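## Hedged usage sketch, disabled: tuple_clean expects an iterable of
## plain tuples shaped (appln_id, person_id, person_name,
## person_address, person_ctry_code, coauth, ipc_codes), with the
## coauthor and IPC fields '**'-delimited. The row below is
## hypothetical, for illustration only.
# example_row = (12345, 678, 'ACME CORPORATION LTD', '1 MAIN ST', 'GB',
#                'ACME CORPORATION LTD**JOHN SMITH', 'A61K**C07D')
# tuple_clean([example_row])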
name_output = myquery(name_extract, db, name_colnames)
ipc_output = myquery(ipc_extract, db, ipc_colnames)
ipc_output = ipc_output.set_index('appln_id')

print time.strftime('%c', time.localtime())
print 'Query complete, starting cleaning'
print 'Number of records returned:'
print len(name_output), len(ipc_output)

name_clean_time = time.time()

## Clean names, separate legal ids, and re-insert
names = psCleanup.name_clean(name_output['person_name'],
                             psCleanup.name_address_dict_list)
names_ids = [psCleanup.get_legal_ids(n, psCleanup.legal_regex)
             for n in names]
name_output['person_name'], name_output['firm_legal_id'] = zip(*names_ids)
name_output['person_address'] = psCleanup.name_clean(
    name_output['person_address'],
    psCleanup.name_address_dict_list
    )

print time.strftime('%c', time.localtime())
print 'Names clean'

## ID the coauthors and join
coauthor_list = []
for appln_id, person_id in zip(name_output['appln_id'],
                               name_output['person_id']):
    ## All names on the same application, excluding this person
    coauthors = name_output['person_name'][
        (name_output['appln_id'] == appln_id) &
        (name_output['person_id'] != person_id)
        ]
    coauthor_list.append(psCleanup.get_max(coauthors))
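## Design note: the boolean mask above rescans the full table once per
## row, so the coauthor pass is quadratic in the number of records. A
## hedged alternative sketch, assuming name_output is a pandas
## DataFrame (consistent with the set_index and column masking used
## above): build an appln_id -> (person_id, name) map once, then look
## up each row's group.
# pairs_by_appln = name_output.groupby('appln_id').apply(
#     lambda g: list(zip(g['person_id'], g['person_name'])))
# coauthor_list = [
#     psCleanup.get_max([n for pid, n in pairs_by_appln[a] if pid != p])
#     for a, p in zip(name_output['appln_id'], name_output['person_id'])
# ]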