## Normalize the ipc codes so they match those in the PATSTAT output
green_ipcs['ipc'] = psCleanup.ipc_clean(green_ipcs['ipc'])

## Group the ipc codes under their top-level (l1) category.
## The ipc column is indexed with the enumerate counter, exactly as before.
green_energy_cats = {}
for row, category in enumerate(green_ipcs.l1):
    green_energy_cats.setdefault(category, []).append(green_ipcs.ipc[row])

## Turn the grouped ipc codes into regex for searching
cat_regex = psCleanup.make_regex(green_energy_cats)

## Clean the data

## Keep only the tsv files, and skip any name containing a space:
## that is the -no country- file, which is very very large (32m rows)
country_files = [
    f for f in os.listdir('./data')
    if 'tsv' in f and ' ' not in f
]


for idx, f in enumerate(country_files):
    filename = './data/cleaned_data/' + f
    print('Counting patents in ' + filename)
## Import the project modules locally and through the view's sync_imports
with dview.sync_imports():
    import psCleanup
    import psDisambig


## The cleaning dicts, collected in one list
all_dicts = [
    psCleanup.convert_html,
    psCleanup.convert_sgml,
    psCleanup.clean_symbols,
    psCleanup.concatenators,
    psCleanup.single_space,
    psCleanup.ampersand,
    psCleanup.us_uk,
    psCleanup.abbreviations,
]
## One make_regex result per cleaning dict, in the same order
regex_dicts = list(map(psCleanup.make_regex, all_dicts))

## Push both lists through the view under their own names
dview.push(dict(all_dicts=all_dicts))
dview.push(dict(regex_dicts=regex_dicts))
## Wrap the clean sequence in a useful function
@dview.parallel(block=True)
def clean_wrapper(name_string, dict_list=regex_dicts):
    """Run the full cleaning sequence on name_string.

    The input is echoed with print, passed through
    psCleanup.rem_diacritics and psCleanup.stdize_case, and then handed to
    psCleanup.master_clean_regex as a one-element list together with
    dict_list. The first element of that result is returned with
    surrounding whitespace stripped.
    """
    print(name_string)
    normalized = psCleanup.stdize_case(psCleanup.rem_diacritics(name_string))
    cleaned = psCleanup.master_clean_regex([normalized], dict_list)
    return cleaned[0].strip()

def clean_list_wrapper(name_list, dict_list=all_dicts):
    """Apply the cleaning steps to every entry of name_list.

    Each entry goes through psCleanup.rem_diacritics and then
    psCleanup.stdize_case.

    NOTE(review): the body appears to be cut off after these two steps;
    in the visible lines dict_list is never used and nothing is
    returned. Confirm against the complete source.
    """
    out = [psCleanup.rem_diacritics(n) for n in name_list]
    out = [psCleanup.stdize_case(n) for n in out]
     # Create the client and slice it with [:] to get the view used for
     # every execute/push below; block is switched on for that view.
     par_client = Client()
     dview = par_client[:]
     dview.block = True

     # Sync the necessary imports and path settings
     dview.execute('import sys')
     # NOTE(review): hard-coded absolute path; this only works where that
     # directory exists.
     dview.execute('sys.path.append("/home/markhuberty/Documents/psClean/code")')

     with dview.sync_imports():
          import psCleanup

     # Push the raw dict lists and the legal regex under their own names
     dview.push({'name_address_dict_list': name_address_dict_list})
     dview.push({'coauth_dict_list': coauth_dict_list})
     dview.push({'legal_regex': legal_regex})

     # Build one make_regex result per dict in each list
     name_address_regex = [psCleanup.make_regex(d) for d in name_address_dict_list]
     coauth_regex = [psCleanup.make_regex(d) for d in coauth_dict_list]

     dview.push({'name_address_regex': name_address_regex})
     # NOTE(review): this pushes coauth_regex under the key
     # 'coauth_dict_list', replacing the dict list pushed above under the
     # same key. Possibly 'coauth_regex' was intended -- confirm.
     dview.push({'coauth_dict_list': coauth_regex})

     # Set up parallel cleaning wrappers
     @dview.parallel(block=True)
     def name_clean_wrapper(name_list, clean_regex=name_address_regex, legal_regex=legal_regex):
          """Clean name_list and return the result of psCleanup.get_legal_ids.

          The input goes through psCleanup.decoder, remove_diacritics and
          stdize_case, then psCleanup.master_clean_regex with clean_regex;
          the cleaned value and legal_regex are passed to get_legal_ids.
          """
          decoded = psCleanup.decoder(name_list)
          # NOTE(review): other call sites in this file use
          # psCleanup.rem_diacritics; confirm remove_diacritics exists.
          folded = psCleanup.stdize_case(psCleanup.remove_diacritics(decoded))
          cleaned = psCleanup.master_clean_regex(folded, clean_regex)
          return psCleanup.get_legal_ids(cleaned, legal_regex)
     
import numpy as np
import scipy.sparse as sp
import csv
import time

## The cleaning dicts, collected in one list
all_dicts = [
    psCleanup.convert_html,
    psCleanup.convert_sgml,
    psCleanup.clean_symbols,
    psCleanup.concatenators,
    psCleanup.single_space,
    psCleanup.ampersand,
    psCleanup.us_uk,
    psCleanup.abbreviations,
]
## One make_regex result per cleaning dict, in the same order
all_regex = list(map(psCleanup.make_regex, all_dicts))
## Wrap the clean sequence in a useful function
def clean_wrapper(name_string, dict_list):
    """Run the full cleaning sequence on name_string.

    The input is passed through psCleanup.rem_diacritics and
    psCleanup.stdize_case, then handed to psCleanup.master_clean_regex as
    a one-element list together with dict_list. The first element of that
    result is returned with surrounding whitespace stripped.
    """
    normalized = psCleanup.stdize_case(psCleanup.rem_diacritics(name_string))
    cleaned = psCleanup.master_clean_regex([normalized], dict_list)
    return cleaned[0].strip()

## SQL text selecting person_id and person_name for applications whose
## appln_filing_date year is 1980 or later. tls201_appln is joined to
## tls207_pers_appln on appln_id, and that result to tls206_person on
## person_id. The query is only defined here, not executed in the
## visible lines.
initial_query = """
SELECT person_id, person_name from (tls201_appln INNER JOIN
tls207_pers_appln USING(appln_id)) INNER JOIN
tls206_person USING(person_id)
WHERE YEAR(appln_filing_date) >= 1980
"""
## Normalize the ipc codes so they match those in the PATSTAT output
green_ipcs['ipc'] = psCleanup.ipc_clean(green_ipcs['ipc'])

## Group the ipc codes under their top-level (l1) category.
## The ipc column is indexed with the enumerate counter, exactly as before.
green_energy_cats = {}
for row, category in enumerate(green_ipcs.l1):
    green_energy_cats.setdefault(category, []).append(green_ipcs.ipc[row])

## Turn the grouped ipc codes into regex for searching
cat_regex = psCleanup.make_regex(green_energy_cats)

## Clean the data

## Keep only the tsv files, and skip any name containing a space:
## that is the -no country- file, which is very very large (32m rows)
country_files = [
    f for f in os.listdir('./code')
    if 'tsv' in f and ' ' not in f
]

for idx, f in enumerate(country_files):
    # Read from ./code, write the cleaned copy under the same file name
    input_filename = './code/' + f
    output_filename = './data/cleaned_data/' + f

    print('Operating on ' + input_filename)
## Esempio n. 6 ("Example no. 6") -- appears to be a snippet-boundary marker left over from extraction
## 0
    # Switch block on for the view used by every execute/push below
    dview.block = True

    # Sync the necessary imports and path settings
    dview.execute('import sys')
    # NOTE(review): hard-coded absolute path; this only works where that
    # directory exists.
    dview.execute(
        'sys.path.append("/home/markhuberty/Documents/psClean/code")')

    with dview.sync_imports():
        import psCleanup

    # Push the raw dict lists and the legal regex under their own names
    dview.push({'name_address_dict_list': name_address_dict_list})
    dview.push({'coauth_dict_list': coauth_dict_list})
    dview.push({'legal_regex': legal_regex})

    # Build one make_regex result per dict in each list
    name_address_regex = [
        psCleanup.make_regex(d) for d in name_address_dict_list
    ]
    coauth_regex = [psCleanup.make_regex(d) for d in coauth_dict_list]

    dview.push({'name_address_regex': name_address_regex})
    # NOTE(review): this pushes coauth_regex under the key
    # 'coauth_dict_list', replacing the dict list pushed above under the
    # same key. Possibly 'coauth_regex' was intended -- confirm.
    dview.push({'coauth_dict_list': coauth_regex})

    # Set up parallel cleaning wrappers
    @dview.parallel(block=True)
    def name_clean_wrapper(name_list,
                           clean_regex=name_address_regex,
                           legal_regex=legal_regex):
        name_string = psCleanup.decoder(name_list)
        name_string = psCleanup.remove_diacritics(name_string)
        name_string = psCleanup.stdize_case(name_string)
        name_string = psCleanup.master_clean_regex(name_string, clean_regex)