# Aggregate inventor shares at the (patent, CBSA) level.
import os

import pandas as pd


def main():
    args = parse_io()
    df_patent = pd.read_table(
        args.input,
        usecols=[
            'patent_id',
            'inventor_id',
            'inventor_share',
            'cbsa_id']) \
        .drop_duplicates() \
        .drop(columns='inventor_id') \
        .groupby(['patent_id', 'cbsa_id'], as_index=False) \
        .agg({'inventor_share': 'sum'}) \
        .rename(columns={'inventor_share': 'cbsa_share'})
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
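# All of the scripts in this section read their I/O paths from parse_io(),
# a shared helper whose definition is not shown here. A minimal sketch,
# assuming an argparse-based CLI; the flag names --input, --input_list and
# --output are inferred from how the returned namespace is used:
import argparse


def parse_io():
    # Single-input scripts use args.input; multi-input scripts use
    # args.input_list; every script writes to args.output.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str)
    parser.add_argument('--input_list', type=str, nargs='*')
    parser.add_argument('--output', type=str)
    return parser.parse_args()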
# Count CPC classes per patent, restricted to patents that appear in the
# MSA tables (as either cited or citing patents).
import os

import pandas as pd


def main():
    args = parse_io()
    patent_ids = set(
        pd.read_table(
            args.input_list[0],  # msa_patent.tsv.zip
            usecols=['patent_id'],
            dtype=int) \
            .patent_id) \
        .union(
            pd.read_table(
                args.input_list[1],  # msa_citation.tsv.zip
                usecols=['forward_citation_id'],
                dtype=int) \
                .forward_citation_id)
    df_cpc = pd.read_table(
        args.input_list[2],  # cpc_current.tsv.zip
        usecols=[
            'patent_id',
            'group_id',
            'subgroup_id'],
        dtype={
            'patent_id': int,
            'group_id': str,
            'subgroup_id': str}) \
        .rename(columns={'group_id': 'cpc_class'})
    df_cpc = df_cpc[df_cpc.patent_id.isin(patent_ids)]
    # Keep only the part of the subgroup code before the slash.
    df_cpc['subgroup_id'] = df_cpc \
        .subgroup_id \
        .apply(lambda row: row.split('/')[0])
    df_cpc.drop_duplicates(inplace=True)
    # Drop overly specific subgroups and the cross-sectional Y section.
    df_cpc = df_cpc[
        (df_cpc.subgroup_id.str.len() < 8)
        & (~df_cpc.subgroup_id.str.startswith('Y'))]
    # Count classes per patent and shuffle the rows (fixed seed).
    df_cpc = df_cpc \
        .value_counts([
            'patent_id',
            'cpc_class']) \
        .reset_index(name='cpc_class_count') \
        .sample(frac=1, random_state=1)
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_cpc.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Download a source file, then pause briefly to avoid hammering the server.
import os
import random
import time


def main():
    args = parse_io()
    source_url = args.input
    output_file = args.output
    output_dir, file_name = os.path.split(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    download_url(source_url, output_dir, file_name)
    time.sleep(random.random() * 5)
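# download_url() is likewise a shared helper that is not shown here. A
# minimal sketch with the same call signature, assuming a plain HTTP fetch
# (the real helper may stream, retry, or report progress):
import os
import urllib.request


def download_url(source_url, output_dir, file_name):
    # Fetch source_url and save it as output_dir/file_name.
    urllib.request.urlretrieve(
        source_url, os.path.join(output_dir, file_name))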
# Attach grant and application dates to the MSA patents (cited and citing).
import os

import numpy as np
import pandas as pd


def main():
    args = parse_io()
    df_msa_patent = pd.read_table(
        args.input_list[0],  # msa_patent.tsv.zip
        usecols=['patent_id'],
        dtype=np.uint32)
    df_msa_citation = pd.read_table(
        args.input_list[1],  # msa_citation.tsv.zip
        dtype=np.uint32)
    patent_ids = set(df_msa_patent.patent_id) \
        .union(df_msa_citation.forward_citation_id)
    del df_msa_patent
    df_msa_patent = pd.read_table(
        args.input_list[2],  # patent_info.tsv.zip
        usecols=[
            'patent_id',
            'grant_date',
            'appln_date'],
        dtype={
            'patent_id': np.uint32,
            'grant_date': str,
            'appln_date': str}) \
        .drop_duplicates() \
        .query('patent_id in @patent_ids')
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_msa_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Keep one row per (patent, inventor) pair with the inventor's share.
import os

import pandas as pd


def main():
    args = parse_io()
    df_patent = pd.read_table(
        args.input,
        usecols=[
            'patent_id',
            'inventor_id',
            'inventor_share']) \
        .drop_duplicates()
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Build the data README: a five-row preview table plus notes for each output.
import os

import pandas as pd


def main():
    args = parse_io()
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(args.output, 'w') as f_out:
        for f_in in args.input_list:
            _, file = os.path.split(f_in)
            df = pd.read_table(f_in, dtype=str, nrows=5)
            readme_table = df \
                .to_markdown(
                    index=False,
                    tablefmt='github')
            f_out.write(f'### {file.split(".")[0]}\n')
            f_out.write(readme_table)
            if file in [
                    'msa_patent_dates.tsv.zip',
                    'msa_patent_uspc.tsv.zip',
                    'msa_patent_quality.tsv.zip']:
                f_out.write(
                    ('\n\nNotes:\n'
                     '* Rename *patent_id* as *forward_citation_id* '
                     'to merge this table with the *msa_citation* table.'))
            if file == 'msa_patent_uspc.tsv.zip':
                df = pd.read_table(
                    f_in,
                    usecols=['patent_id', 'uspc_class'],
                    dtype={
                        'patent_id': int,
                        'uspc_class': str})
                frac_no_uspc = df[df.uspc_class.isna()] \
                    .patent_id.nunique() / df.patent_id.nunique()
                f_out.write(
                    (f'\n* {frac_no_uspc:.1%} '
                     'of the *patent_id*s have no *uspc_class* '
                     '(most of which are very old or very recent patents).'))
            f_out.write('\n\n\n')
# Build the MSA citation table from the raw US patent citation data.
import os

import pandas as pd


def main():
    args = parse_io()
    msa_patents = pd.read_table(
        args.input_list[0],  # msa_patents.tsv.zip
        usecols=['patent_id'],
        dtype=str) \
        .patent_id.unique()
    df_patent_citation = pd.read_table(
        args.input_list[1],  # uspatentcitation.tsv.zip
        usecols=['patent_id', 'citation_id'],
        dtype=str)
    # Keep only the citations received by utility patents located in an MSA
    # and made by a utility patent. Rename the columns so that the dataframe
    # can be merged with the other dataframes of the MSA-patents project:
    # - patent_id is the cited patent
    # - forward_citation_id is the citing patent
    df_patent_citation = df_patent_citation[
        (df_patent_citation.citation_id.isin(msa_patents))
        & (df_patent_citation.patent_id.str.isnumeric())] \
        .rename(columns={
            'patent_id': 'forward_citation_id',
            'citation_id': 'patent_id'})
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_patent_citation.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Geolocate inventors and assign patents to metropolitan areas (CBSAs).
import os

import geopandas as gpd
import pandas as pd


def main():
    args = parse_io()
    df_patent = pd.read_table(
        args.input_list[0],  # patent.tsv.zip
        usecols=['id'],
        dtype=str) \
        .rename(columns={'id': 'patent_id'})
    # Utility patents only (numeric ids).
    df_patent = df_patent[df_patent.patent_id.str.isnumeric()]
    df_patent_inventor = pd.read_table(
        args.input_list[1],  # patent_inventor.tsv.zip
        dtype=str) \
        .dropna()
    df_patent = pd.merge(df_patent, df_patent_inventor)
    del df_patent_inventor
    # Each inventor gets an equal share: 1 / number of inventors.
    df_patent = pd.merge(
        df_patent,
        1 / df_patent \
            .groupby('patent_id') \
            .agg({'inventor_id': 'nunique'}) \
            .rename(columns={'inventor_id': 'inventor_share'}),
        left_on='patent_id',
        right_index=True,
        how='left')
    df_location = pd.read_table(
        args.input_list[2],  # location.tsv.zip
        usecols=[
            'id',
            'latitude',
            'longitude'],
        dtype={
            'id': str,
            'latitude': float,
            'longitude': float}) \
        .rename(columns={'id': 'location_id'})
    df_patent = pd.merge(df_patent, df_location)
    del df_location
    # EPSG:4269 (NAD83) matches the CRS of the Census cartographic boundaries.
    geometry = gpd.points_from_xy(
        df_patent.longitude, df_patent.latitude)
    df_patent = gpd.GeoDataFrame(
        df_patent, geometry=geometry, crs='EPSG:4269')
    df_patent.drop(
        columns=[
            'location_id',
            'latitude',
            'longitude'],
        inplace=True)
    # M1 = Metropolitan areas
    df_cbsa = gpd.read_file(
        f'zip://{args.input_list[3]}') \
        .query('LSAD=="M1"') \
        .drop(columns=['LSAD', 'ALAND', 'AWATER']) \
        .rename(columns={
            'CSAFP': 'csa_id',
            'CBSAFP': 'cbsa_id',
            'NAME': 'cbsa_label'})  # cb_2019_us_cbsa_20m.zip
    # Spatial join: keep inventor locations that fall within a CBSA polygon.
    # (The 'op' keyword was renamed 'predicate' in GeoPandas >= 0.10.)
    df_patent = gpd.sjoin(
        df_patent, df_cbsa, op='within') \
        .drop(columns='index_right')
    df_patent = pd.DataFrame(df_patent)
    del df_cbsa
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Build patent quality measures for the MSA patents: claim counts, 5/10-year
# forward citation counts, and class-by-year benchmark averages.
import os

import numpy as np
import pandas as pd


def main():
    args = parse_io()
    df_msa_patent_dates = pd.read_table(
        args.input_list[0],  # msa_patent_dates.tsv.zip
        dtype={
            'patent_id': np.uint32,
            'grant_date': str,
            'appln_date': str},
        parse_dates=['grant_date', 'appln_date'])
    df_msa_patent_uspc = pd.read_table(
        args.input_list[1],  # msa_patent_uspc.tsv.zip
        dtype={
            'patent_id': np.uint32,
            'uspc_class': 'category'})
    df_msa_patent = pd.merge(
        df_msa_patent_dates, df_msa_patent_uspc, how='outer')
    del df_msa_patent_dates, df_msa_patent_uspc
    df_msa_patent['grant_year'] = df_msa_patent.grant_date.dt.year
    df_msa_patent['appln_year'] = df_msa_patent.appln_date.dt.year
    df_patent = pd.read_table(
        args.input_list[2],  # patent_info.tsv.zip
        usecols=[
            'patent_id',
            'grant_date',
            'appln_date',
            'uspc_class',
            'num_claims'],
        dtype={
            'patent_id': np.uint32,
            'grant_date': str,
            'appln_date': str,
            'uspc_class': 'category',
            'num_claims': float}) \
        .drop_duplicates()
    df_patent['grant_date'] = pd.to_datetime(
        df_patent.grant_date, errors='coerce')
    df_patent['appln_date'] = pd.to_datetime(
        df_patent.appln_date, errors='coerce')
    df_patent = df_patent[
        (~df_patent.grant_date.isna())
        & (~df_patent.appln_date.isna())]
    df_patent['grant_year'] = df_patent.grant_date.dt.year
    df_patent['appln_year'] = df_patent.appln_date.dt.year
    grant_date_last = df_patent.grant_date.max()

    ##########################
    # CLAIMS
    # Benchmark: average number of claims by grant year and USPC class.
    df_avg_num_claims_gy = df_patent \
        .groupby([
            'grant_year',
            'uspc_class']) \
        .agg({'num_claims': 'mean'}) \
        .rename(columns={'num_claims': 'avg_num_claims_gy'})
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_avg_num_claims_gy,
        left_on=['grant_year', 'uspc_class'],
        right_index=True,
        how='left')
    # Benchmark: average number of claims by application year and USPC class.
    df_avg_num_claims_ay = df_patent \
        .groupby([
            'appln_year',
            'uspc_class']) \
        .agg({'num_claims': 'mean'}) \
        .rename(columns={'num_claims': 'avg_num_claims_ay'})
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_avg_num_claims_ay,
        left_on=['appln_year', 'uspc_class'],
        right_index=True,
        how='left')
    # Patents without a USPC class fall back to year-only benchmarks.
    subset = df_msa_patent.uspc_class.isna()
    df_avg_num_claims_gy = df_patent \
        .groupby(['grant_year']) \
        .agg({'num_claims': 'mean'}) \
        .rename(columns={'num_claims': 'avg_num_claims_gy'})
    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns='avg_num_claims_gy'),
            df_avg_num_claims_gy,
            left_on=['grant_year'],
            right_index=True,
            how='left')],
        sort=True)
    subset = df_msa_patent.uspc_class.isna()
    df_avg_num_claims_ay = df_patent \
        .groupby(['appln_year']) \
        .agg({'num_claims': 'mean'}) \
        .rename(columns={'num_claims': 'avg_num_claims_ay'})
    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns='avg_num_claims_ay'),
            df_avg_num_claims_ay,
            left_on=['appln_year'],
            right_index=True,
            how='left')],
        sort=True)
    del df_avg_num_claims_gy, df_avg_num_claims_ay, subset
    # Attach the patent-level claim count, which the final column selection
    # keeps. (Assumed step: the source never attaches num_claims to
    # df_msa_patent before selecting it.)
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_patent[['patent_id', 'num_claims']],
        how='left')
    df_patent.drop(columns='num_claims', inplace=True)

    ##########################
    # Load the MSA citation table. (Assumed input: df_msa_citation is never
    # loaded in the source; msa_citation.tsv.zip as args.input_list[3].)
    df_msa_citation = pd.read_table(
        args.input_list[3],  # msa_citation.tsv.zip (assumed)
        dtype=np.uint32)
    # Attach the cited patent's grant date ...
    df_msa_citation = pd.merge(df_msa_citation, df_patent)
    # ... and the citing patent's grant date.
    df_msa_citation = pd.merge(
        df_msa_citation,
        df_patent \
            [['patent_id', 'grant_date']] \
            .rename(columns={
                'patent_id': 'forward_citation_id',
                'grant_date': 'forward_citation_grant_date'}))
    # del df_patent
    # Time elapsed between the cited and the citing grant.
    df_msa_citation['time_length'] = df_msa_citation \
        .forward_citation_grant_date \
        .sub(df_msa_citation.grant_date)
    # Forward citations received within 10 years ...
    df_msa_citation = df_msa_citation[
        df_msa_citation.time_length.dt.days <= 10 * 365]
    df_msa_citation_10y = df_msa_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id': 'nunique'}) \
        .rename(columns={'forward_citation_id': 'num_citations_10y'})
    # ... and within 5 years.
    df_msa_citation = df_msa_citation[
        df_msa_citation.time_length.dt.days <= 5 * 365]
    df_msa_citation_5y = df_msa_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id': 'nunique'}) \
        .rename(columns={'forward_citation_id': 'num_citations_5y'})
    df_msa_citation = pd.merge(
        df_msa_citation_5y,
        df_msa_citation_10y,
        left_index=True,
        right_index=True,
        how='outer')
    del df_msa_citation_5y, df_msa_citation_10y
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_msa_citation,
        left_on='patent_id',
        right_index=True,
        how='left')
    del df_msa_citation
    # Zero citations for patents with no match; NaN for patents too recent
    # to have a full 5- or 10-year citation window.
    for years in [5, 10]:
        col = f'num_citations_{years}y'
        threshold = grant_date_last - pd.tseries.offsets.Day(years * 365)
        df_msa_patent[col] = df_msa_patent[col].fillna(0)
        df_msa_patent.loc[df_msa_patent.grant_date > threshold, col] = np.nan

    ##########################
    # CITATIONS
    # Rebuild the patent-level citation counts. (Assumed step:
    # df_patent_citation is never defined in the source; assuming
    # patent_info.tsv.zip also carries num_citations_5y/num_citations_10y.)
    df_patent_citation = pd.merge(
        pd.read_table(
            args.input_list[2],  # patent_info.tsv.zip
            usecols=[
                'patent_id',
                'num_citations_5y',
                'num_citations_10y'],
            dtype={
                'patent_id': np.uint32,
                'num_citations_5y': float,
                'num_citations_10y': float}) \
            .drop_duplicates(),
        df_patent)
    # Benchmark: average citation counts by grant year and USPC class.
    df_avg_num_citations_gy = df_patent_citation \
        .groupby([
            'grant_year',
            'uspc_class']) \
        .agg({
            'num_citations_5y': 'mean',
            'num_citations_10y': 'mean'}) \
        .rename(columns={
            'num_citations_5y': 'avg_num_citations_5y_gy',
            'num_citations_10y': 'avg_num_citations_10y_gy'})
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_avg_num_citations_gy,
        left_on=['grant_year', 'uspc_class'],
        right_index=True,
        how='left')
    # Benchmark: average citation counts by application year and USPC class.
    df_avg_num_citations_ay = df_patent_citation \
        .groupby([
            'appln_year',
            'uspc_class']) \
        .agg({
            'num_citations_5y': 'mean',
            'num_citations_10y': 'mean'}) \
        .rename(columns={
            'num_citations_5y': 'avg_num_citations_5y_ay',
            'num_citations_10y': 'avg_num_citations_10y_ay'})
    df_msa_patent = pd.merge(
        df_msa_patent,
        df_avg_num_citations_ay,
        left_on=['appln_year', 'uspc_class'],
        right_index=True,
        how='left')
    # Patents without a USPC class fall back to year-only benchmarks.
    subset = df_msa_patent.uspc_class.isna()
    df_avg_num_citations_gy = df_patent_citation \
        .groupby(['grant_year']) \
        .agg({
            'num_citations_5y': 'mean',
            'num_citations_10y': 'mean'}) \
        .rename(columns={
            'num_citations_5y': 'avg_num_citations_5y_gy',
            'num_citations_10y': 'avg_num_citations_10y_gy'})
    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns=[
                    'avg_num_citations_5y_gy',
                    'avg_num_citations_10y_gy']),
            df_avg_num_citations_gy,
            left_on=['grant_year'],
            right_index=True,
            how='left')],
        sort=True)
    subset = df_msa_patent.uspc_class.isna()
    df_avg_num_citations_ay = df_patent_citation \
        .groupby(['appln_year']) \
        .agg({
            'num_citations_5y': 'mean',
            'num_citations_10y': 'mean'}) \
        .rename(columns={
            'num_citations_5y': 'avg_num_citations_5y_ay',
            'num_citations_10y': 'avg_num_citations_10y_ay'})
    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns=[
                    'avg_num_citations_5y_ay',
                    'avg_num_citations_10y_ay']),
            df_avg_num_citations_ay,
            left_on=['appln_year'],
            right_index=True,
            how='left')],
        sort=True)

    ##########################
    df_msa_patent = df_msa_patent[[
        'patent_id',
        'num_claims',
        'num_citations_5y',
        'num_citations_10y',
        'avg_num_claims_gy',
        'avg_num_claims_ay',
        'avg_num_citations_5y_gy',
        'avg_num_citations_10y_gy',
        'avg_num_citations_5y_ay',
        'avg_num_citations_10y_ay']] \
        .drop_duplicates()

    ##########################
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_msa_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
# Build patent_info.tsv.zip: grant/application dates, USPC class, number of
# claims, and 5/10-year forward citation counts for every utility patent.
import os

import numpy as np
import pandas as pd


def main():
    args = parse_io()
    df_patent = pd.read_table(
        args.input_list[0],  # patent.tsv.zip
        usecols=[
            'id',
            'date'],
        dtype={'date': str},
        converters={'id': convert_patent_id}) \
        .rename(columns={
            'id': 'patent_id',
            'date': 'grant_date'}) \
        .drop_duplicates() \
        .query('patent_id!=0') \
        .astype({'patent_id': np.uint32})
    df_application = pd.read_table(
        args.input_list[1],  # application.tsv.zip
        usecols=[
            'patent_id',
            'date',
            'num_claims'],
        dtype={
            'date': str,
            'num_claims': float},
        converters={'patent_id': convert_patent_id}) \
        .rename(columns={'date': 'appln_date'}) \
        .drop_duplicates() \
        .query('patent_id!=0') \
        .astype({'patent_id': np.uint32})
    df_patent = pd.merge(df_patent, df_application, how='left')
    del df_application
    for date_column in ['grant_date', 'appln_date']:
        df_patent = fix_dates(df_patent, date_column)
        df_patent[date_column] = pd.to_datetime(df_patent[date_column])
    df_patent = df_patent[
        (~df_patent.grant_date.isna())
        & (~df_patent.appln_date.isna())]
    df_patent['grant_year'] = df_patent.grant_date.dt.year
    df_patent['appln_year'] = df_patent.appln_date.dt.year
    grant_date_last = df_patent.grant_date.max()
    df_patex = pd.read_csv(
        args.input_list[2],  # application_data.csv.zip
        usecols=[
            'uspc_class',
            'patent_number'],
        converters={
            'uspc_class': convert_uspc_class,
            'patent_number': convert_patent_id}) \
        .rename(columns={'patent_number': 'patent_id'}) \
        .drop_duplicates() \
        .dropna() \
        .query('patent_id!=0 & uspc_class!="XXX"') \
        .astype({
            'uspc_class': str,
            'patent_id': np.uint32})
    df_patex['uspc_class'] = pd.Categorical(df_patex.uspc_class)
    df_patent = pd.merge(df_patent, df_patex, how='left')
    del df_patex
    df_patent_citation = pd.read_table(
        args.input_list[3],  # uspatentcitation.tsv.zip
        usecols=[
            'patent_id',
            'citation_id'],
        converters={
            'patent_id': convert_patent_id,
            'citation_id': convert_patent_id}) \
        .rename(columns={
            'patent_id': 'forward_citation_id',
            'citation_id': 'patent_id'}) \
        .query('patent_id!=0 & forward_citation_id!=0') \
        .astype({
            'patent_id': np.uint32,
            'forward_citation_id': np.uint32})
    # Attach the cited patent's grant date ...
    df_patent_citation = pd.merge(df_patent_citation, df_patent)
    # ... and the citing patent's grant date.
    df_patent_citation = pd.merge(
        df_patent_citation,
        df_patent \
            [['patent_id', 'grant_date']] \
            .rename(columns={
                'patent_id': 'forward_citation_id',
                'grant_date': 'forward_citation_grant_date'}))
    # del df_patent
    # Time elapsed between the cited and the citing grant.
    df_patent_citation['time_length'] = df_patent_citation \
        .forward_citation_grant_date \
        .sub(df_patent_citation.grant_date)
    # Forward citations received within 10 years ...
    df_patent_citation = df_patent_citation[
        df_patent_citation.time_length.dt.days <= 10 * 365]
    df_patent_citation_10y = df_patent_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id': 'nunique'}) \
        .rename(columns={'forward_citation_id': 'num_citations_10y'})
    # ... and within 5 years.
    df_patent_citation = df_patent_citation[
        df_patent_citation.time_length.dt.days <= 5 * 365]
    df_patent_citation_5y = df_patent_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id': 'nunique'}) \
        .rename(columns={'forward_citation_id': 'num_citations_5y'})
    df_patent_citation = pd.merge(
        df_patent_citation_5y,
        df_patent_citation_10y,
        left_index=True,
        right_index=True,
        how='outer')
    del df_patent_citation_5y, df_patent_citation_10y
    # Merge the citation counts into df_patent (left join) so they survive in
    # the output table written below, mirroring the MSA quality script.
    df_patent = pd.merge(
        df_patent,
        df_patent_citation,
        left_on='patent_id',
        right_index=True,
        how='left')
    # Zero citations for patents with no match; NaN for patents too recent
    # to have a full 5- or 10-year citation window.
    for years in [5, 10]:
        col = f'num_citations_{years}y'
        threshold = grant_date_last - pd.tseries.offsets.Day(years * 365)
        df_patent[col] = df_patent[col].fillna(0)
        df_patent.loc[df_patent.grant_date > threshold, col] = np.nan
    out_dir, out_file = os.path.split(args.output)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    df_patent.to_csv(
        args.output,
        sep='\t',
        index=False,
        compression={
            'method': 'zip',
            'archive_name': out_file.replace('.zip', '')})
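# The converters and the date fixer used above (convert_patent_id,
# convert_uspc_class, fix_dates) are project helpers that are not shown in
# this section. Minimal sketches consistent with their call sites, assuming
# that non-utility ids map to the 0 sentinel and that missing or malformed
# USPC codes map to the 'XXX' sentinel; the real helpers may differ.
import pandas as pd


def convert_patent_id(patent_id):
    # Utility patents have purely numeric ids; anything else becomes the
    # 0 sentinel, which the callers drop with .query('patent_id!=0').
    patent_id = str(patent_id).strip()
    return int(patent_id) if patent_id.isnumeric() else 0


def convert_uspc_class(uspc_class):
    # Zero-pad USPC class codes to three characters; missing or malformed
    # codes become the 'XXX' sentinel dropped by the caller.
    uspc_class = str(uspc_class).strip()
    if uspc_class in ('', 'nan', 'None') or not uspc_class.isalnum():
        return 'XXX'
    return uspc_class.zfill(3)


def fix_dates(df, date_column):
    # Null out implausible years so the subsequent pd.to_datetime succeeds;
    # the caller then drops the rows left without a valid date.
    year = pd.to_numeric(df[date_column].str[:4], errors='coerce')
    df.loc[(year < 1790) | (year > 2100), date_column] = None
    return df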