## Clean the ipc codes to match those in the PATSTAT output
ipc_clean = psCleanup.ipc_clean(green_ipcs['ipc'])
green_ipcs['ipc'] = ipc_clean
del ipc_clean

## Categorize the IPC codes at the top level
green_energy_cats = {}
for idx, d in enumerate(green_ipcs.l1):
    if d in green_energy_cats:
        green_energy_cats[d].append(green_ipcs.ipc[idx])
    else:
        green_energy_cats[d] = [green_ipcs.ipc[idx]]

## Translate the ipc codes into regexes for searching
cat_regex = psCleanup.make_regex(green_energy_cats)

## Clean the data
country_files = os.listdir('./data')
country_files = [f for f in country_files if 'tsv' in f]
## Drop the no-country file; it is very large (32m rows)
country_files = [f for f in country_files if ' ' not in f]

for idx, f in enumerate(country_files):
    filename = './data/cleaned_data/' + f
    print 'Counting patents in ' + filename
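    ## The counting body is elided above. A minimal sketch of one way to
    ## tally matches per top-level category; it assumes (hypothetically)
    ## that cat_regex maps each category to a compiled pattern and that
    ## the IPC codes sit in the last tab-separated field:
    cat_counts = dict((cat, 0) for cat in cat_regex)
    with open(filename, 'rt') as infile:
        for line in infile:
            ipc_field = line.rstrip('\n').split('\t')[-1]
            for cat, regex in cat_regex.items():
                if regex.search(ipc_field):
                    cat_counts[cat] += 1
    print filename, cat_counts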
with dview.sync_imports():
    import psCleanup
    import psDisambig

## Define the cleaning dicts
all_dicts = [psCleanup.convert_html,
             psCleanup.convert_sgml,
             psCleanup.clean_symbols,
             psCleanup.concatenators,
             psCleanup.single_space,
             psCleanup.ampersand,
             psCleanup.us_uk,
             psCleanup.abbreviations
             ]
regex_dicts = [psCleanup.make_regex(d) for d in all_dicts]

dview.push({'all_dicts': all_dicts})
dview.push({'regex_dicts': regex_dicts})

## Wrap the clean sequence in a useful function
@dview.parallel(block=True)
def clean_wrapper(name_string, dict_list=regex_dicts):
    print name_string
    out = psCleanup.rem_diacritics(name_string)
    out = psCleanup.stdize_case(out)
    out = psCleanup.master_clean_regex([out], dict_list)
    out = out[0].strip()
    return(out)

## List version of the same sequence; note master_clean_regex expects the
## compiled regex_dicts, not the raw all_dicts
def clean_list_wrapper(name_list, dict_list=regex_dicts):
    out = [psCleanup.rem_diacritics(n) for n in name_list]
    out = [psCleanup.stdize_case(n) for n in out]
    out = psCleanup.master_clean_regex(out, dict_list)
    out = [o.strip() for o in out]
    return(out)
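## Usage sketch: with the @dview.parallel decorator, element-wise
## application goes through .map, which scatters the list across the
## engines. raw_names is a hypothetical example list:
raw_names = ['ACME WIDGET GMBH &AMP; CO. KG', 'JOE   BLOGGS']
cleaned_names = clean_wrapper.map(raw_names)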
par_client = Client()
dview = par_client[:]
dview.block = True

# Sync the necessary imports and path settings
dview.execute('import sys')
dview.execute('sys.path.append("/home/markhuberty/Documents/psClean/code")')
with dview.sync_imports():
    import psCleanup

dview.push({'name_address_dict_list': name_address_dict_list})
dview.push({'coauth_dict_list': coauth_dict_list})
dview.push({'legal_regex': legal_regex})

name_address_regex = [psCleanup.make_regex(d) for d in name_address_dict_list]
coauth_regex = [psCleanup.make_regex(d) for d in coauth_dict_list]

dview.push({'name_address_regex': name_address_regex})
dview.push({'coauth_regex': coauth_regex})

# Set up parallel cleaning wrappers
@dview.parallel(block=True)
def name_clean_wrapper(name_list,
                       clean_regex=name_address_regex,
                       legal_regex=legal_regex):
    name_string = psCleanup.decoder(name_list)
    name_string = psCleanup.remove_diacritics(name_string)
    name_string = psCleanup.stdize_case(name_string)
    name_string = psCleanup.master_clean_regex(name_string, clean_regex)
    names_ids = psCleanup.get_legal_ids(name_string, legal_regex)
    return names_ids
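# Usage sketch: element-wise application again goes through .map on the
# decorated function. person_names is a hypothetical list of raw PATSTAT
# name strings; each element comes back as whatever
# psCleanup.get_legal_ids returns (assumed: cleaned name plus legal-form ids).
person_names = ['SIEMENS AKTIENGESELLSCHAFT', 'MUELLER, HANS DR.']
names_and_ids = name_clean_wrapper.map(person_names)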
import numpy as np
import scipy.sparse as sp
import csv
import time

import psCleanup

## Define the cleaning dicts
all_dicts = [psCleanup.convert_html,
             psCleanup.convert_sgml,
             psCleanup.clean_symbols,
             psCleanup.concatenators,
             psCleanup.single_space,
             psCleanup.ampersand,
             psCleanup.us_uk,
             psCleanup.abbreviations
             ]
all_regex = [psCleanup.make_regex(d) for d in all_dicts]

## Wrap the clean sequence in a useful function
def clean_wrapper(name_string, dict_list):
    out = psCleanup.rem_diacritics(name_string)
    out = psCleanup.stdize_case(out)
    out = psCleanup.master_clean_regex([out], dict_list)
    out = out[0].strip()
    return(out)

initial_query = """
SELECT person_id, person_name
FROM (tls201_appln
      INNER JOIN tls207_pers_appln USING(appln_id))
     INNER JOIN tls206_person USING(person_id)
WHERE YEAR(appln_filing_date) >= 1980
"""
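## A sketch of one way to run the query and clean the returned names.
## MySQLdb and the connection parameters are placeholders / assumptions
## (PATSTAT is assumed here to sit in a MySQL instance):
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='user',
                       passwd='passwd', db='patstat')
cursor = conn.cursor()
cursor.execute(initial_query)
rows = cursor.fetchall()
cursor.close()

## Pair each person_id with its cleaned name
cleaned_names = [(person_id, clean_wrapper(person_name, all_regex))
                 for person_id, person_name in rows]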
## Clean the ipc codes to match those in the PATSTAT output
ipc_clean = psCleanup.ipc_clean(green_ipcs['ipc'])
green_ipcs['ipc'] = ipc_clean
del ipc_clean

## Categorize the IPC codes at the top level
green_energy_cats = {}
for idx, d in enumerate(green_ipcs.l1):
    if d in green_energy_cats:
        green_energy_cats[d].append(green_ipcs.ipc[idx])
    else:
        green_energy_cats[d] = [green_ipcs.ipc[idx]]

## Translate the ipc codes into regexes for searching
cat_regex = psCleanup.make_regex(green_energy_cats)

## Clean the data
country_files = os.listdir('./code')
country_files = [f for f in country_files if 'tsv' in f]
## Drop the no-country file; it is very large (32m rows)
country_files = [f for f in country_files if ' ' not in f]

for idx, f in enumerate(country_files):
    input_filename = './code/' + f
    output_filename = './data/cleaned_data/' + f
    print 'Operating on ' + input_filename
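    ## The per-file cleaning body is elided above. A minimal sketch of a
    ## clean-and-write pass, reusing clean_wrapper and all_regex from the
    ## previous script and assuming (hypothetically) that the name field
    ## is the second tab-separated column:
    with open(input_filename, 'rt') as infile:
        with open(output_filename, 'wt') as outfile:
            for line in infile:
                fields = line.rstrip('\n').split('\t')
                fields[1] = clean_wrapper(fields[1], all_regex)
                outfile.write('\t'.join(fields) + '\n')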