Example #1
0
def firm_statistics(output):
    print('Finding firm statistics')

    # firm history statistics
    firmyear = read_csv(f'{output}/firmyear_info.csv',
                        usecols=['firm_num', 'year', 'n_grant'])
    firm_groups = firmyear.groupby('firm_num')
    firm_life = pd.DataFrame({
        'year_min': firm_groups['year'].min(),
        'year_max': firm_groups['year'].max(),
        'tot_pats': firm_groups['n_grant'].sum()
    })
    firm_life['tot_pats'] = firm_life['tot_pats'].fillna(0).astype(np.int)
    firm_life['life_span'] = firm_life['year_max'] - firm_life['year_min'] + 1

    # load in ipc info
    grant = read_csv(f'{output}/grant_info.csv', usecols=['firm_num', 'ipc'])
    grant = grant.dropna(subset=['firm_num'])
    grant['firm_num'] = grant['firm_num'].astype('Int64')
    grant['ipc4'] = grant['ipc'].str.slice(0, 4)

    # get modal ipc4 info
    count_ipc = grant.groupby(['firm_num', 'ipc4']).size().rename('count_ipc4')
    firm_ipc = count_ipc.reset_index(
        level='firm_num').groupby('firm_num')['count_ipc4']
    mode_ipc = firm_ipc.idxmax().rename('mode_ipc4')
    mode_ipc_count = firm_ipc.max().rename('mode_ipc4_count').astype('Int64')
    firm_life = firm_life.join(mode_ipc)
    firm_life = firm_life.join(mode_ipc_count)
    firm_life['mode_ipc4_frac'] = firm_life['mode_ipc4_count'] / firm_life[
        'tot_pats']
    firm_life = firm_life.drop('mode_ipc4_count', axis=1)

    firm_life.to_csv(f'{output}/firm_life.csv', float_format='%.3f')
Example #2
0
def merge_firms(output, columns, base=1000000):
    print('merging firms')

    names = read_csv(f'{output}/name.csv')
    match = read_csv(f'{output}/match.csv')
    firms = pd.merge(names, match, how='left', on='id')
    firms['firm_num'] = firms['firm_num'].fillna(firms['id']+base).astype(np.int)
    firms[['firm_num', 'id']].to_csv(f'{output}/firm.csv', index=False)

    for tag, (table, id_col, name_col) in columns.items():
        src = read_csv(f'{output}/{tag}_match.csv')
        src = pd.merge(src, firms, on='id')
        src[[id_col, 'firm_num']].to_csv(f'{output}/{tag}_firm.csv', index=False)
Example #3
0
def find_groups(output, thresh=0.85):
    print('finding matches')

    def dmetr(name1, name2):
        max_len = max(len(name1), len(name2))
        max_dist = int(ceil(max_len * (1.0 - thresh)))
        ldist = levenshtein(name1, name2)
        return (1.0 - float(ldist) / max_len) if (ldist != -1
                                                  and max_len != 0) else 0.0

    close = []
    name_std = {}

    pairs = read_csv(f'{output}/pair.csv',
                     usecols=['id1', 'id2', 'name1', 'name2'])
    for i, id1, id2, name1, name2 in pairs.itertuples():
        if id1 not in name_std:
            name_std[id1] = standardize_strong(name1)
        if id2 not in name_std:
            name_std[id2] = standardize_strong(name2)

        n1std = name_std[id1]
        n2std = name_std[id2]

        if dmetr(n1std, n2std) > thresh:
            close.append((id1, id2))

        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(close)}')
Example #4
0
def patent_stocks(output):
    print('Constructing patent stocks')

    # load firm data
    firmyear_info = read_csv(f'{output}/firmyear_info.csv')
    firm_info = read_csv(
        f'{output}/firm_life.csv',
        usecols=['firm_num', 'year_min', 'year_max', 'life_span'])

    # make (firm_num, year) index
    fnum_set = firm_info['firm_num']
    year_min = firm_info['year_min']
    year_max = firm_info['year_max']
    life_span = firm_info['life_span']
    all_fnums = np.array(list(
        chain(*[[fnum] * life for fnum, life in zip(fnum_set, life_span)])),
                         dtype=np.int)
    all_years = np.array(list(
        chain(*[range(x, y + 1) for x, y in zip(year_min, year_max)])),
                         dtype=np.int)
    fy_all = pd.DataFrame({'firm_num': all_fnums, 'year': all_years})

    datf_idx = fy_all.merge(firmyear_info, how='left', on=['firm_num', 'year'])
    int_cols = [
        'n_apply', 'n_grant', 'n_citing', 'n_cited', 'n_self_cited',
        'n_source', 'n_dest'
    ]
    datf_idx = datf_idx.fillna({c: 0 for c in int_cols})
    datf_idx[int_cols] = datf_idx[int_cols].astype(np.int)

    # merge in overall firm info
    datf_idx = datf_idx.merge(firm_info[['firm_num', 'year_min']],
                              how='left',
                              on='firm_num')
    datf_idx['age'] = datf_idx['year'] - datf_idx['year_min']
    datf_idx = datf_idx.drop('year_min', axis=1)

    # aggregate stocks
    firm_groups = datf_idx.groupby('firm_num')
    datf_idx['stock'] = firm_groups['n_apply'].cumsum()

    # write new frame to disk
    datf_idx.to_csv(f'{output}/firmyear_index.csv',
                    index=False,
                    float_format='%.3f')
Example #5
0
def merge_grants(output):
    print('Merging all grant data')

    grant = read_csv(f'{output}/grant_grant.csv').set_index('patnum')
    firm = read_csv(f'{output}/grant_firm.csv').set_index('patnum')
    cite = read_csv(f'{output}/cite_stats.csv').set_index('patnum')
    assign = read_csv(f'{output}/assign_stats.csv').set_index('patnum')
    maint = read_csv(f'{output}/maint.csv').set_index('patnum')

    grant = grant.join(firm)
    grant = grant.join(cite)
    grant = grant.join(assign)
    grant = grant.join(maint)

    fill_cols = ['n_cited', 'n_citing', 'n_self_cited', 'n_trans', 'claims']
    grant[fill_cols] = grant[fill_cols].fillna(0).astype(np.int)

    int_cols = ['firm_num', 'last_maint']
    grant[int_cols] = grant[int_cols].astype('Int64')

    grant.drop('abstract', axis=1).to_csv(f'{output}/grant_info.csv')
    grant[['title', 'abstract']].to_csv(f'{output}/grant_text.csv')
Example #6
0
def filter_pairs(output, nshingle=2, k=8, thresh=4):
    print('filtering pairs')

    c = Cluster(k=k, thresh=thresh)
    name_dict = {}

    names = read_csv(f'{output}/name.csv', usecols=['id', 'name'])
    for i, id, name in names.itertuples():
        words = name.split()
        shings = list(shingle(name, nshingle))

        features = shings + words
        weights = list(np.linspace(1.0, 0.0, len(shings))) + list(np.linspace(1.0, 0.0, len(words)))

        c.add(features, weights=weights, label=id)
        name_dict[id] = name

        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(c.unions)}')
Example #7
0
def generate_names(output, columns):
    print('generating names')

    sdict = {}
    for tag, (table, id_col, name_col) in columns.items():
        src = read_csv(f'{output}/{table}.csv', usecols=[id_col, name_col]).dropna()
        src['name'] = src[name_col].apply(standardize_weak)
        sdict[tag] = src

    names = pd.concat([src['name'] for src in sdict.values()], axis=0).drop_duplicates()
    names = names[names.str.len()>0].reset_index(drop=True)
    names = names.rename('name').rename_axis('id').reset_index()
    names.to_csv(f'{output}/name.csv', index=False)

    for tag, (table, id_col, name_col) in columns.items():
        src = pd.merge(sdict[tag], names, how='left', on='name')
        src[[id_col, 'id']].to_csv(f'{output}/{tag}_match.csv', index=False)

    print(f'found {len(names)} names')
Example #8
0
from tools.tables import read_csv

# parse input arguments
parser = argparse.ArgumentParser(description='Merge patent citation data.')
parser.add_argument('--output',
                    type=str,
                    default='tables',
                    help='directory to operate on')
parser.add_argument('--chunk',
                    type=int,
                    default=10000000,
                    help='chunk size for citations')
args = parser.parse_args()

# load in grant data
grants = read_csv(f'{args.output}/grant_firm.csv').set_index('patnum')


# match and aggregates cites
def aggregate_cites(cites):
    print(len(cites))

    # match citations to firms with patnum
    cites = cites.rename(columns={'src': 'citer_pnum', 'dst': 'citee_pnum'})
    cites = cites.join(grants.add_prefix('citer_'), on='citer_pnum')
    cites = cites.join(grants.add_prefix('citee_'), on='citee_pnum')
    cites['self_cite'] = (cites['citer_firm_num'] == cites['citee_firm_num'])

    # patent level statistics
    stats = pd.DataFrame({
        'n_cited':
Example #9
0
    'saint helena': 'sh',
    'bangladesh': 'bd',
    'french polynesia': 'pf',
    'chad': 'td',
    'yemen': 'ye',
    'syrian arab republic': 'sy',
    'nothern mariana islands': 'mp',
    'uzbekistan': 'uz',
    'rwanda': 'rw',
    'guatemala': 'gt',
    'new brunswick': 'ca',
    'burundi': 'bi'
}

# eliminate assignments within entities
assn = read_csv(f'{args.output}/assign_assign.csv')
assn = assn.dropna(subset=['patnum', 'execdate'], axis=0)
assn['assignee_state'] = assn['assignee_state'].map(state_map)
assn['assignee_country'] = assn['assignee_country'].map(country_map)
assn['same'] = assn[['assignor', 'assignee']].apply(lambda x: same_entity(*x),
                                                    raw=True,
                                                    axis=1)
good = assn[~assn['same']].drop('same', axis=1)
good = good.reset_index(drop=True).rename_axis('assignid',
                                               axis=0).reset_index()
good.to_csv(f'{args.output}/assign_use.csv', index=False)

# aggregated assignment stats
pat_group = good.groupby('patnum')
assign_stats = pd.DataFrame({
    'first_trans': pat_group['execdate'].min(),
Example #10
0
def generate_firmyear(output, compustat=False):
    print('Generating all firm-years')

    total = []

    # patent applications
    apply = read_csv(f'{output}/apply_apply.csv',
                     usecols=['appnum', 'appdate'])
    apply_firm = read_csv(f'{output}/apply_firm.csv').set_index('appnum')
    apply = apply.join(apply_firm, on='appnum', how='inner')
    apply['appyear'] = apply['appdate'].str.slice(0, 4).astype(np.int)

    apply_fy = apply.groupby(['firm_num', 'appyear']).size().rename('n_apply')
    apply_fy = apply_fy.rename_axis(index={'appyear': 'year'})
    total.append(apply_fy)

    # patent grants
    grant = read_csv(
        f'{output}/grant_info.csv',
        usecols=['patnum', 'pubdate', 'n_cited', 'n_citing', 'n_self_cited'])
    grant_firm = read_csv(f'{output}/grant_firm.csv').set_index('patnum')
    grant = grant.dropna(subset=['pubdate'], axis=0)
    grant['pubyear'] = grant['pubdate'].str.slice(0, 4).astype(np.int)
    grant = grant.join(grant_firm, on='patnum', how='inner')

    grant_groups = grant.groupby(['firm_num', 'pubyear'])
    grant_fy = grant_groups[['n_cited', 'n_citing', 'n_self_cited']].sum()
    grant_fy['n_grant'] = grant_groups.size()
    grant_fy = grant_fy.rename_axis(index={'pubyear': 'year'})
    total.append(grant_fy)

    # patent assignments
    assign = read_csv(f'{output}/assign_use.csv',
                      usecols=['assignid', 'execdate'])
    assignor_firm = read_csv(f'{output}/assignor_firm.csv').set_index(
        'assignid')
    assignee_firm = read_csv(f'{output}/assignee_firm.csv').set_index(
        'assignid')
    assign = assign.join(assignor_firm.add_prefix('assignor_'),
                         on='assignid',
                         how='inner')
    assign = assign.join(assignee_firm.add_prefix('assignee_'),
                         on='assignid',
                         how='inner')
    assign['execyear'] = assign['execdate'].str.slice(0, 4).astype(np.int)

    assignor_fy = assign.groupby(['assignor_firm_num',
                                  'execyear']).size().rename('n_source')
    assignor_fy = assignor_fy.rename_axis(index={
        'assignor_firm_num': 'firm_num',
        'execyear': 'year'
    })
    total.append(assignor_fy)

    assignee_fy = assign.groupby(['assignee_firm_num',
                                  'execyear']).size().rename('n_dest')
    assignee_fy = assignee_fy.rename_axis(index={
        'assignee_firm_num': 'firm_num',
        'execyear': 'year'
    })
    total.append(assignee_fy)

    # compustat firms
    if compustat:
        compu = read_csv(f'{output}/compustat.csv')
        compu_firm = read_csv(f'{output}/compustat_firm.csv').set_index(
            'compid')
        compu = compu.join(compu_firm, on='compid', how='inner')

        compu_fy = compu.groupby(['firm_num', 'year'])[[
            'assets', 'capx', 'cash', 'cogs', 'deprec', 'income', 'employ',
            'intan', 'debt', 'revenue', 'sales', 'rnd', 'fcost', 'mktval'
        ]].sum()
        ind_info = compu.groupby(['firm_num', 'year'])[['naics',
                                                        'sic']].first()
        compu_fy = compu_fy.join(ind_info)
        total.append(compu_fy)

    # comprehensive
    total = pd.concat(total, axis=1).reset_index()
    int_cols = [
        'n_apply', 'n_grant', 'n_cited', 'n_citing', 'n_self_cited',
        'n_source', 'n_dest'
    ]
    total[int_cols] = total[int_cols].astype('Int64')

    total.to_csv(f'{output}/firmyear_info.csv',
                 index=False,
                 float_format='%.3f')