def firm_statistics(output):
    print('Finding firm statistics')

    # firm history statistics
    firmyear = read_csv(f'{output}/firmyear_info.csv', usecols=['firm_num', 'year', 'n_grant'])
    firm_groups = firmyear.groupby('firm_num')
    firm_life = pd.DataFrame({
        'year_min': firm_groups['year'].min(),
        'year_max': firm_groups['year'].max(),
        'tot_pats': firm_groups['n_grant'].sum(),
    })
    firm_life['tot_pats'] = firm_life['tot_pats'].fillna(0).astype(int)
    firm_life['life_span'] = firm_life['year_max'] - firm_life['year_min'] + 1

    # load in ipc info
    grant = read_csv(f'{output}/grant_info.csv', usecols=['firm_num', 'ipc'])
    grant = grant.dropna(subset=['firm_num'])
    grant['firm_num'] = grant['firm_num'].astype('Int64')
    grant['ipc4'] = grant['ipc'].str.slice(0, 4)

    # get modal ipc4 info
    count_ipc = grant.groupby(['firm_num', 'ipc4']).size().rename('count_ipc4')
    firm_ipc = count_ipc.reset_index(level='firm_num').groupby('firm_num')['count_ipc4']
    mode_ipc = firm_ipc.idxmax().rename('mode_ipc4')
    mode_ipc_count = firm_ipc.max().rename('mode_ipc4_count').astype('Int64')

    firm_life = firm_life.join(mode_ipc)
    firm_life = firm_life.join(mode_ipc_count)
    firm_life['mode_ipc4_frac'] = firm_life['mode_ipc4_count']/firm_life['tot_pats']
    firm_life = firm_life.drop('mode_ipc4_count', axis=1)

    firm_life.to_csv(f'{output}/firm_life.csv', float_format='%.3f')
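# Illustration of the groupby/idxmax idiom used for the modal IPC4 above:
# with 'ipc4' left in the index, idxmax() returns the ipc4 *label* of each
# firm's largest count, not an integer position. Toy data only; this helper
# is not part of the pipeline and is never called.
def _demo_modal_ipc4():
    counts = pd.Series(
        [3, 1, 2],
        index=pd.MultiIndex.from_tuples(
            [(1, 'A01B'), (1, 'B23K'), (2, 'H01L')],
            names=['firm_num', 'ipc4']),
        name='count_ipc4')
    by_firm = counts.reset_index(level='firm_num').groupby('firm_num')['count_ipc4']
    return by_firm.idxmax()  # firm 1 -> 'A01B', firm 2 -> 'H01L'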
def merge_firms(output, columns, base=1000000):
    print('merging firms')

    # names without a match keep their own id, offset by `base`, as a singleton firm
    names = read_csv(f'{output}/name.csv')
    match = read_csv(f'{output}/match.csv')
    firms = pd.merge(names, match, how='left', on='id')
    firms['firm_num'] = firms['firm_num'].fillna(firms['id'] + base).astype(int)
    firms[['firm_num', 'id']].to_csv(f'{output}/firm.csv', index=False)

    # propagate firm numbers to each source table
    for tag, (table, id_col, name_col) in columns.items():
        src = read_csv(f'{output}/{tag}_match.csv')
        src = pd.merge(src, firms, on='id')
        src[[id_col, 'firm_num']].to_csv(f'{output}/{tag}_firm.csv', index=False)
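# Sketch of the `columns` argument shared by merge_firms and generate_names
# (below): each tag maps to a (table, id_col, name_col) triple. The specific
# table and column names here are assumptions based on the file names used in
# this module, not a verbatim copy of the driver script.
#
#   columns = {
#       'grant': ('grant_grant', 'patnum', 'owner'),
#       'apply': ('apply_apply', 'appnum', 'owner'),
#   }
#   generate_names('tables', columns)
#   merge_firms('tables', columns)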
def find_groups(output, thresh=0.85):
    print('finding matches')

    def dmetr(name1, name2):
        # normalized levenshtein similarity in [0, 1]; the distance call is
        # capped at max_dist and returns -1 when the cap is exceeded
        max_len = max(len(name1), len(name2))
        max_dist = int(ceil(max_len*(1.0-thresh)))
        ldist = levenshtein(name1, name2, max_dist=max_dist)
        return (1.0 - float(ldist)/max_len) if (ldist != -1 and max_len != 0) else 0.0

    close = []
    name_std = {}

    pairs = read_csv(f'{output}/pair.csv', usecols=['id1', 'id2', 'name1', 'name2'])
    for i, id1, id2, name1, name2 in pairs.itertuples():
        if id1 not in name_std:
            name_std[id1] = standardize_strong(name1)
        if id2 not in name_std:
            name_std[id2] = standardize_strong(name2)

        n1std = name_std[id1]
        n2std = name_std[id2]

        if dmetr(n1std, n2std) > thresh:
            close.append((id1, id2))

        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(close)}')
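# Worked example of the cutoff in dmetr (illustration only, assuming the
# standardized names come through as written): for 'intl business machines'
# vs 'international business machines', max_len = 31, so
# max_dist = ceil(31*0.15) = 5; the true levenshtein distance is 9 > 5, the
# capped call returns -1, and the pair scores 0.0. For 'acme corp' vs
# 'acme corps' the distance is 1, giving 1 - 1/10 = 0.9 > 0.85, so the
# pair is kept.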
def patent_stocks(output):
    print('Constructing patent stocks')

    # load firm data
    firmyear_info = read_csv(f'{output}/firmyear_info.csv')
    firm_info = read_csv(f'{output}/firm_life.csv',
                         usecols=['firm_num', 'year_min', 'year_max', 'life_span'])

    # make (firm_num, year) index covering each firm's full life span
    fnum_set = firm_info['firm_num']
    year_min = firm_info['year_min']
    year_max = firm_info['year_max']
    life_span = firm_info['life_span']

    all_fnums = np.array(list(chain(*[
        [fnum]*life for fnum, life in zip(fnum_set, life_span)
    ])), dtype=int)
    all_years = np.array(list(chain(*[
        range(x, y + 1) for x, y in zip(year_min, year_max)
    ])), dtype=int)

    fy_all = pd.DataFrame({'firm_num': all_fnums, 'year': all_years})
    datf_idx = fy_all.merge(firmyear_info, how='left', on=['firm_num', 'year'])

    int_cols = ['n_apply', 'n_grant', 'n_citing', 'n_cited', 'n_self_cited', 'n_source', 'n_dest']
    datf_idx = datf_idx.fillna({c: 0 for c in int_cols})
    datf_idx[int_cols] = datf_idx[int_cols].astype(int)

    # merge in overall firm info
    datf_idx = datf_idx.merge(firm_info[['firm_num', 'year_min']], how='left', on='firm_num')
    datf_idx['age'] = datf_idx['year'] - datf_idx['year_min']
    datf_idx = datf_idx.drop('year_min', axis=1)

    # aggregate stocks (cumulative applications per firm)
    firm_groups = datf_idx.groupby('firm_num')
    datf_idx['stock'] = firm_groups['n_apply'].cumsum()

    # write new frame to disk
    datf_idx.to_csv(f'{output}/firmyear_index.csv', index=False, float_format='%.3f')
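# The chain-of-lists construction above materializes Python lists before the
# array conversion; a vectorized sketch producing the same arrays (assuming
# the same firm_info columns) would be:
#
#   all_fnums = np.repeat(firm_info['firm_num'].to_numpy(),
#                         firm_info['life_span'].to_numpy())
#   all_years = np.concatenate([
#       np.arange(x, y + 1) for x, y in zip(year_min, year_max)
#   ])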
def merge_grants(output):
    print('Merging all grant data')

    grant = read_csv(f'{output}/grant_grant.csv').set_index('patnum')
    firm = read_csv(f'{output}/grant_firm.csv').set_index('patnum')
    cite = read_csv(f'{output}/cite_stats.csv').set_index('patnum')
    assign = read_csv(f'{output}/assign_stats.csv').set_index('patnum')
    maint = read_csv(f'{output}/maint.csv').set_index('patnum')

    grant = grant.join(firm)
    grant = grant.join(cite)
    grant = grant.join(assign)
    grant = grant.join(maint)

    fill_cols = ['n_cited', 'n_citing', 'n_self_cited', 'n_trans', 'claims']
    grant[fill_cols] = grant[fill_cols].fillna(0).astype(int)

    int_cols = ['firm_num', 'last_maint']
    grant[int_cols] = grant[int_cols].astype('Int64')

    grant.drop('abstract', axis=1).to_csv(f'{output}/grant_info.csv')
    grant[['title', 'abstract']].to_csv(f'{output}/grant_text.csv')
def filter_pairs(output, nshingle=2, k=8, thresh=4):
    print('filtering pairs')

    c = Cluster(k=k, thresh=thresh)

    name_dict = {}
    names = read_csv(f'{output}/name.csv', usecols=['id', 'name'])
    for i, id, name in names.itertuples():
        words = name.split()
        shings = list(shingle(name, nshingle))

        features = shings + words
        weights = list(np.linspace(1.0, 0.0, len(shings))) + list(np.linspace(1.0, 0.0, len(words)))

        c.add(features, weights=weights, label=id)
        name_dict[id] = name

        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(c.unions)}')
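# `shingle` is imported from elsewhere in the package; a minimal sketch of
# what it is assumed to produce (overlapping character n-grams of length k):
#
#   def shingle(s, k=2):
#       for i in range(len(s) - k + 1):
#           yield s[i:i+k]
#
#   list(shingle('acme', 2))  # ['ac', 'cm', 'me']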
def generate_names(output, columns):
    print('generating names')

    sdict = {}
    for tag, (table, id_col, name_col) in columns.items():
        src = read_csv(f'{output}/{table}.csv', usecols=[id_col, name_col]).dropna()
        src['name'] = src[name_col].apply(standardize_weak)
        sdict[tag] = src

    names = pd.concat([src['name'] for src in sdict.values()], axis=0).drop_duplicates()
    names = names[names.str.len() > 0].reset_index(drop=True)
    names = names.rename('name').rename_axis('id').reset_index()
    names.to_csv(f'{output}/name.csv', index=False)

    for tag, (table, id_col, name_col) in columns.items():
        src = pd.merge(sdict[tag], names, how='left', on='name')
        src[[id_col, 'id']].to_csv(f'{output}/{tag}_match.csv', index=False)

    print(f'found {len(names)} names')
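# standardize_weak and standardize_strong are imported name cleaners; a
# plausible sketch of the split (an assumption, not the package's exact
# rules): the weak pass lowercases and strips punctuation for exact-name
# grouping, while the strong pass additionally drops legal suffixes before
# fuzzy matching in find_groups.
#
#   import re
#   def standardize_weak(name):
#       return re.sub(r'[^a-z0-9 ]', '', name.lower()).strip()
#   def standardize_strong(name):
#       name = standardize_weak(name)
#       return re.sub(r'\b(inc|corp|llc|ltd|co)\b', '', name).strip()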
import argparse

import pandas as pd

from tools.tables import read_csv

# parse input arguments
parser = argparse.ArgumentParser(description='Merge patent citation data.')
parser.add_argument('--output', type=str, default='tables', help='directory to operate on')
parser.add_argument('--chunk', type=int, default=10000000, help='chunk size for citations')
args = parser.parse_args()

# load in grant data
grants = read_csv(f'{args.output}/grant_firm.csv').set_index('patnum')

# match and aggregate cites
def aggregate_cites(cites):
    print(len(cites))

    # match citations to firms with patnum
    cites = cites.rename(columns={'src': 'citer_pnum', 'dst': 'citee_pnum'})
    cites = cites.join(grants.add_prefix('citer_'), on='citer_pnum')
    cites = cites.join(grants.add_prefix('citee_'), on='citee_pnum')
    cites['self_cite'] = (cites['citer_firm_num'] == cites['citee_firm_num'])

    # patent level statistics (assumed aggregations, chosen to match the
    # columns consumed downstream in merge_grants and generate_firmyear)
    stats = pd.DataFrame({
        'n_cited': cites.groupby('citee_pnum').size(),       # citations received
        'n_citing': cites.groupby('citer_pnum').size(),      # citations made
        'n_self_cited': cites.groupby('citee_pnum')['self_cite'].sum(),
    })
    return stats.rename_axis('patnum')
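# args.chunk is parsed above but the driver loop is not shown in this
# excerpt. A minimal sketch of how the chunked aggregation could be wired up
# (the 'cite_cite.csv' file name and the concat/sum combine step are
# assumptions):
#
#   chunks = pd.read_csv(f'{args.output}/cite_cite.csv', chunksize=args.chunk)
#   cite_stats = pd.concat([aggregate_cites(c) for c in chunks])
#   cite_stats = cite_stats.groupby('patnum').sum()
#   cite_stats.to_csv(f'{args.output}/cite_stats.csv')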
    'saint helena': 'sh',
    'bangladesh': 'bd',
    'french polynesia': 'pf',
    'chad': 'td',
    'yemen': 'ye',
    'syrian arab republic': 'sy',
    'nothern mariana islands': 'mp',  # sic: keys mirror spellings found in the raw data
    'uzbekistan': 'uz',
    'rwanda': 'rw',
    'guatemala': 'gt',
    'new brunswick': 'ca',
    'burundi': 'bi',
}

# eliminate assignments within entities
assn = read_csv(f'{args.output}/assign_assign.csv')
assn = assn.dropna(subset=['patnum', 'execdate'], axis=0)
assn['assignee_state'] = assn['assignee_state'].map(state_map)
assn['assignee_country'] = assn['assignee_country'].map(country_map)
assn['same'] = assn[['assignor', 'assignee']].apply(lambda x: same_entity(*x), raw=True, axis=1)

good = assn[~assn['same']].drop('same', axis=1)
good = good.reset_index(drop=True).rename_axis('assignid', axis=0).reset_index()
good.to_csv(f'{args.output}/assign_use.csv', index=False)

# aggregated assignment stats
pat_group = good.groupby('patnum')
assign_stats = pd.DataFrame({
    'first_trans': pat_group['execdate'].min(),
    'n_trans': pat_group.size(),  # assumed: merge_grants fills 'n_trans' from assign_stats.csv
})
assign_stats.to_csv(f'{args.output}/assign_stats.csv')
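# same_entity is imported from elsewhere; a rough sketch of the assumed
# behavior (flagging assignor/assignee pairs that are really one entity via
# token overlap on standardized names):
#
#   def same_entity(assignor, assignee):
#       n1, n2 = standardize_strong(assignor), standardize_strong(assignee)
#       w1, w2 = set(n1.split()), set(n2.split())
#       if not w1 or not w2:
#           return False
#       return len(w1 & w2)/min(len(w1), len(w2)) >= 0.5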
def generate_firmyear(output, compustat=False):
    print('Generating all firm-years')

    total = []

    # patent applications
    apply = read_csv(f'{output}/apply_apply.csv', usecols=['appnum', 'appdate'])
    apply_firm = read_csv(f'{output}/apply_firm.csv').set_index('appnum')
    apply = apply.join(apply_firm, on='appnum', how='inner')
    apply['appyear'] = apply['appdate'].str.slice(0, 4).astype(int)
    apply_fy = apply.groupby(['firm_num', 'appyear']).size().rename('n_apply')
    apply_fy = apply_fy.rename_axis(index={'appyear': 'year'})
    total.append(apply_fy)

    # patent grants
    grant = read_csv(f'{output}/grant_info.csv',
                     usecols=['patnum', 'pubdate', 'n_cited', 'n_citing', 'n_self_cited'])
    grant_firm = read_csv(f'{output}/grant_firm.csv').set_index('patnum')
    grant = grant.dropna(subset=['pubdate'], axis=0)
    grant['pubyear'] = grant['pubdate'].str.slice(0, 4).astype(int)
    grant = grant.join(grant_firm, on='patnum', how='inner')
    grant_groups = grant.groupby(['firm_num', 'pubyear'])
    grant_fy = grant_groups[['n_cited', 'n_citing', 'n_self_cited']].sum()
    grant_fy['n_grant'] = grant_groups.size()
    grant_fy = grant_fy.rename_axis(index={'pubyear': 'year'})
    total.append(grant_fy)

    # patent assignments
    assign = read_csv(f'{output}/assign_use.csv', usecols=['assignid', 'execdate'])
    assignor_firm = read_csv(f'{output}/assignor_firm.csv').set_index('assignid')
    assignee_firm = read_csv(f'{output}/assignee_firm.csv').set_index('assignid')
    assign = assign.join(assignor_firm.add_prefix('assignor_'), on='assignid', how='inner')
    assign = assign.join(assignee_firm.add_prefix('assignee_'), on='assignid', how='inner')
    assign['execyear'] = assign['execdate'].str.slice(0, 4).astype(int)

    assignor_fy = assign.groupby(['assignor_firm_num', 'execyear']).size().rename('n_source')
    assignor_fy = assignor_fy.rename_axis(index={'assignor_firm_num': 'firm_num', 'execyear': 'year'})
    total.append(assignor_fy)

    assignee_fy = assign.groupby(['assignee_firm_num', 'execyear']).size().rename('n_dest')
    assignee_fy = assignee_fy.rename_axis(index={'assignee_firm_num': 'firm_num', 'execyear': 'year'})
    total.append(assignee_fy)

    # compustat firms
    if compustat:
        compu = read_csv(f'{output}/compustat.csv')
        compu_firm = read_csv(f'{output}/compustat_firm.csv').set_index('compid')
        compu = compu.join(compu_firm, on='compid', how='inner')
        compu_fy = compu.groupby(['firm_num', 'year'])[[
            'assets', 'capx', 'cash', 'cogs', 'deprec', 'income', 'employ',
            'intan', 'debt', 'revenue', 'sales', 'rnd', 'fcost', 'mktval'
        ]].sum()
        ind_info = compu.groupby(['firm_num', 'year'])[['naics', 'sic']].first()
        compu_fy = compu_fy.join(ind_info)
        total.append(compu_fy)

    # comprehensive
    total = pd.concat(total, axis=1).reset_index()
    int_cols = ['n_apply', 'n_grant', 'n_cited', 'n_citing', 'n_self_cited', 'n_source', 'n_dest']
    total[int_cols] = total[int_cols].astype('Int64')
    total.to_csv(f'{output}/firmyear_info.csv', index=False, float_format='%.3f')
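# The firm-year functions in this module are order-dependent; a plausible
# driver (function names are from this file, the call order is inferred from
# the files each step reads and writes, and the 'tables' directory is the
# argparse default above):
#
#   generate_firmyear('tables', compustat=True)   # writes firmyear_info.csv
#   firm_statistics('tables')                     # writes firm_life.csv
#   patent_stocks('tables')                       # writes firmyear_index.csv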