Example #1
def main():
    args = parse_io()

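    # Collapse inventor-level shares into one cbsa_share per (patent, CBSA)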
    df_patent = pd.read_table(
        args.input,
        usecols=[
            'patent_id',
            'inventor_id',
            'inventor_share',
            'cbsa_id']) \
        .drop_duplicates() \
        .drop(columns='inventor_id') \
        .groupby(['patent_id','cbsa_id'], as_index=False) \
        .agg({
            'inventor_share':'sum'}) \
        .rename(columns={'inventor_share':'cbsa_share'})

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

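    # Write a zipped TSV; 'archive_name' strips '.zip' from the inner file name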
    df_patent.to_csv(args.output,
                     sep='\t',
                     index=False,
                     compression={
                         'method': 'zip',
                         'archive_name': file.replace('.zip', '')
                     })
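
All of these examples call a shared parse_io() helper that this page does not show. A minimal sketch, assuming it wraps argparse and exposes the .input, .output, and .input_list attributes used throughout (the flag names are guesses, not the project's actual interface):

import argparse

def parse_io():
    # Hypothetical reconstruction of the project's I/O argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', nargs='+', dest='input_list',
                        help='one or more input files')
    parser.add_argument('-o', '--output', required=True,
                        help='output file')
    args = parser.parse_args()
    # Convenience alias: .input is the first (or only) input file.
    args.input = args.input_list[0] if args.input_list else None
    return args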
Example #2
def main():
    args = parse_io()

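    # Universe of interest: MSA patents plus the patents that cite them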
    patent_ids = set(
            pd.read_table(
                args.input_list[0], # msa_patent.tsv.zip
                usecols=[
                    'patent_id'],
                dtype=int) \
                .patent_id) \
        .union(
            pd.read_table(
                args.input_list[1], # msa_citation.tsv.zip
                usecols=[
                    'forward_citation_id'],
                dtype=int) \
                .forward_citation_id)

    df_cpc = pd.read_table(
        args.input_list[2], # cpc_current.tsv.zip
        usecols=[
            'patent_id',
            'group_id',
            'subgroup_id'],
        dtype={
            'patent_id':int,
            'group_id':str,
            'subgroup_id':str}) \
        .rename(columns={
            'group_id':'cpc_class'})

    df_cpc = df_cpc[df_cpc.patent_id.isin(patent_ids)]

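    # Keep only the CPC main group: drop everything after the '/'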
    df_cpc['subgroup_id'] = df_cpc \
        .subgroup_id \
        .apply(lambda row: row.split('/')[0])
    df_cpc.drop_duplicates(inplace=True)

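    # Drop unusually long subgroup codes and CPC section Y
    # (cross-sectional tagging classes)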
    df_cpc = df_cpc[
        (df_cpc.subgroup_id.str.len()<8) & \
        (~df_cpc.subgroup_id.str.startswith('Y'))]

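    # Count distinct (patent_id, cpc_class) pairs, then shuffle the rows
    # deterministically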
    df_cpc = df_cpc \
        .value_counts([
            'patent_id',
            'cpc_class']) \
        .reset_index(name='cpc_class_count') \
        .sample(frac=1, random_state=1)

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_cpc.to_csv(args.output,
                  sep='\t',
                  index=False,
                  compression={
                      'method': 'zip',
                      'archive_name': file.replace('.zip', '')
                  })
Example #3
def main():
    args = parse_io()
    
    source_url = args.input
    output_file = args.output
    output_dir, file_name = os.path.split(output_file)
    os.makedirs(output_dir, exist_ok=True)
    download_url(source_url, output_dir, file_name)
    # Random pause (up to 5 s) between downloads, to go easy on the server
    time.sleep(random.random()*5)
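
download_url() is another project helper that the page omits. A plausible sketch built on requests (the real helper may differ):

import os
import requests

def download_url(url, output_dir, file_name, chunk_size=1 << 20):
    # Stream the response to disk so large archives never sit fully in memory.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    with open(os.path.join(output_dir, file_name), 'wb') as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            f.write(chunk)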
Example #4
def main():
    args = parse_io()

    df_msa_patent = pd.read_table(
        args.input_list[0], # msa_patent.tsv.zip
        usecols=['patent_id'],
        dtype=np.uint32)
    
    df_msa_citation = pd.read_table(
        args.input_list[1], # msa_citation.tsv.zip
        dtype=np.uint32)

    patent_ids = set(df_msa_patent.patent_id) \
        .union(df_msa_citation.forward_citation_id)
    
    del df_msa_patent

    df_msa_patent = pd.read_table(
        args.input_list[2], # patent_info.tsv.zip
        usecols=[
            'patent_id',
            'grant_date',
            'appln_date'],
        dtype={
            'patent_id':np.uint32,
            'grant_date':str,
            'appln_date':str}) \
        .drop_duplicates() \
        .query('patent_id in @patent_ids')

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_msa_patent.to_csv(
        args.output, 
        sep='\t', 
        index=False, 
        compression={
            'method':'zip',
            'archive_name':file.replace('.zip','')})
Example #5
def main():
    args = parse_io()

    df_patent = pd.read_table(
        args.input,
        usecols=[
            'patent_id', 
            'inventor_id', 
            'inventor_share']) \
        .drop_duplicates()

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_patent.to_csv(
        args.output, 
        sep='\t', 
        index=False, 
        compression={
            'method':'zip',
            'archive_name':file.replace('.zip','')})
Example #6
def main():
    args = parse_io()

    out_dir = os.path.dirname(args.output)
    os.makedirs(out_dir, exist_ok=True)

    with open(args.output, 'w') as f_out:
        for f_in in args.input_list:
            file = os.path.basename(f_in)
            df = pd.read_table(f_in, dtype=str, nrows=5)
            # DataFrame.to_markdown requires the optional 'tabulate' package
            readme_table = df \
                .to_markdown(
                    index=False,
                    tablefmt='github')
            f_out.write(f'### {file.split(".")[0]}\n')
            f_out.write(readme_table)
            if file in [
                    'msa_patent_dates.tsv.zip', 'msa_patent_uspc.tsv.zip',
                    'msa_patent_quality.tsv.zip'
            ]:
                f_out.write(
                    ('\n\nNotes:\n'
                     '* Rename *patent_id* as *forward_citation_id* '
                     'to merge this table with the *msa_citation* table.'))
            if file == 'msa_patent_uspc.tsv.zip':
                df = pd.read_table(f_in,
                                   usecols=['patent_id', 'uspc_class'],
                                   dtype={
                                       'patent_id': int,
                                       'uspc_class': str
                                   })
                frac_no_uspc = df[df.uspc_class.isna()] \
                    .patent_id.nunique()/df.patent_id.nunique()
                f_out.write(
                    (f'\n* {frac_no_uspc:.1%} '
                     'of the *patent_id*s have no *uspc_class* '
                     '(mostly very old or very recent patents).'))
            f_out.write('\n\n\n')
Example #7
def main():
    args = parse_io()

    msa_patents = pd.read_table(
        args.input_list[0], # msa_patents.tsv.zip
        usecols=[
            'patent_id'],
        dtype=str) \
        .patent_id.unique()

    df_patent_citation = pd.read_table(
        args.input_list[1],  # uspatentcitation.tsv.zip
        usecols=['patent_id', 'citation_id'],
        dtype=str)

    # Build the dataframe, keeping only the citations that go to utility
    #  patents located in an MSA and that come from a utility patent.
    # Rename the columns so that the dataframe can be merged with the other
    #  dataframes of the MSA-patents project:
    #   - patent_id is the cited patent
    #   - forward_citation_id is the citing patent
    df_patent_citation = df_patent_citation[
        (df_patent_citation.citation_id.isin(msa_patents)) &
        (df_patent_citation.patent_id.str.isnumeric())] \
        .rename(columns={
            'patent_id':'forward_citation_id',
            'citation_id':'patent_id'})

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_patent_citation.to_csv(args.output,
                              sep='\t',
                              index=False,
                              compression={
                                  'method': 'zip',
                                  'archive_name': file.replace('.zip', '')
                              })
Example #8
def main():
    args = parse_io()

    df_patent = pd.read_table(
        args.input_list[0], # patent.tsv.zip
        usecols=[
            'id'],
        dtype=str) \
        .rename(columns={
            'id':'patent_id'})
    df_patent = df_patent[df_patent.patent_id.str.isnumeric()]

    df_patent_inventor = pd.read_table(
        args.input_list[1], # patent_inventor.tsv.zip
        dtype=str) \
        .dropna()

    df_patent = pd.merge(df_patent, df_patent_inventor)
    del df_patent_inventor

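    # Equal split: each inventor's share is 1 / (number of distinct
    # inventors listed on the patent)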
    df_patent = pd.merge(
        df_patent, 
        1 / df_patent \
            .groupby('patent_id') \
            .agg({
                'inventor_id':'nunique'}) \
            .rename(columns={
                'inventor_id':'inventor_share'}),
        left_on='patent_id', right_index=True, 
        how='left')

    df_location = pd.read_table(
        args.input_list[2], # location.tsv.zip
        usecols=[
            'id',
            'latitude',
            'longitude'],
        dtype={
            'id':str,
            'latitude':float,
            'longitude':float}) \
        .rename(columns={
            'id':'location_id'})

    df_patent = pd.merge(df_patent, df_location)
    del df_location

    geometry = gpd.points_from_xy(
        df_patent.longitude, df_patent.latitude)
    df_patent = gpd.GeoDataFrame(
        df_patent, geometry=geometry, crs='EPSG:4269')

    df_patent.drop(
        columns=[
            'location_id',
            'latitude',
            'longitude'], 
        inplace=True)

    # M1 = Metropolitan areas
    df_cbsa = gpd.read_file( # cb_2019_us_cbsa_20m.zip
        f'zip://{args.input_list[3]}') \
        .query('LSAD=="M1"') \
        .drop(columns=['LSAD','ALAND','AWATER']) \
        .rename(columns={
            'CSAFP':'csa_id',
            'CBSAFP':'cbsa_id',
            'NAME':'cbsa_label'})

    # Note: sjoin's 'op' keyword was renamed 'predicate' in geopandas 0.10
    df_patent = gpd.sjoin(
        df_patent, df_cbsa,
        op='within') \
        .drop(columns='index_right')
    df_patent = pd.DataFrame(df_patent)
    del df_cbsa

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)
    df_patent.to_csv(
        args.output, 
        sep='\t', 
        index=False, 
        compression={
            'method':'zip',
            'archive_name':file.replace('.zip','')})
Example #9
def main():
    args = parse_io()

    df_msa_patent_dates = pd.read_table(
        args.input_list[0],  # msa_patent_dates.tsv.zip
        dtype={
            'patent_id': np.uint32,
            'grant_date': str,
            'appln_date': str
        },
        parse_dates=['grant_date', 'appln_date'])

    df_msa_patent_uspc = pd.read_table(
        args.input_list[1],  # msa_patent_uspc.tsv.zip
        dtype={
            'patent_id': np.uint32,
            'uspc_class': 'category'
        })

    df_msa_patent = pd.merge(df_msa_patent_dates,
                             df_msa_patent_uspc,
                             how='outer')
    del df_msa_patent_dates, df_msa_patent_uspc

    df_msa_patent['grant_year'] = df_msa_patent.grant_date.dt.year
    df_msa_patent['appln_year'] = df_msa_patent.appln_date.dt.year

    df_patent = pd.read_table(
        args.input_list[2], # patent_info.tsv.zip
        usecols=[
            'patent_id',
            'grant_date',
            'appln_date',
            'uspc_class',
            'num_claims'],
        dtype={
            'patent_id':np.uint32,
            'grant_date':str,
            'appln_date':str,
            'uspc_class':'category',
            'num_claims':float}) \
        .drop_duplicates()

    df_patent['grant_date'] = pd.to_datetime(df_patent.grant_date,
                                             errors='coerce')
    df_patent['appln_date'] = pd.to_datetime(df_patent.appln_date,
                                             errors='coerce')

    df_patent = df_patent[(~df_patent.grant_date.isna())
                          & (~df_patent.appln_date.isna())]

    df_patent['grant_year'] = df_patent.grant_date.dt.year
    df_patent['appln_year'] = df_patent.appln_date.dt.year

    grant_date_last = df_patent.grant_date.max()

    df_avg_num_claims_gy = df_patent \
        .groupby([
            'grant_year',
            'uspc_class']) \
        .agg({
            'num_claims':'mean'}) \
        .rename(columns={
            'num_claims':'avg_num_claims_gy'})

    df_msa_patent = pd.merge(df_msa_patent,
                             df_avg_num_claims_gy,
                             left_on=['grant_year', 'uspc_class'],
                             right_index=True,
                             how='left')

    df_avg_num_claims_ay = df_patent \
        .groupby([
            'appln_year',
            'uspc_class']) \
        .agg({
            'num_claims':'mean'}) \
        .rename(columns={
            'num_claims':'avg_num_claims_ay'})

    df_msa_patent = pd.merge(df_msa_patent,
                             df_avg_num_claims_ay,
                             left_on=['appln_year', 'uspc_class'],
                             right_index=True,
                             how='left')

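    # For patents with no uspc_class, fall back from class-level to
    # year-level averages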
    subset = df_msa_patent.uspc_class.isna()

    df_avg_num_claims_gy = df_patent \
        .groupby([
            'grant_year']) \
        .agg({
            'num_claims':'mean'}) \
        .rename(columns={
            'num_claims':'avg_num_claims_gy'})

    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns='avg_num_claims_gy'),
            df_avg_num_claims_gy,
            left_on=['grant_year'], right_index=True,
            how='left')],
        sort=True)

    subset = df_msa_patent.uspc_class.isna()

    df_avg_num_claims_ay = df_patent \
        .groupby([
            'appln_year']) \
        .agg({
            'num_claims':'mean'}) \
        .rename(columns={
            'num_claims':'avg_num_claims_ay'})

    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns='avg_num_claims_ay'),
            df_avg_num_claims_ay,
            left_on=['appln_year'], right_index=True,
            how='left')],
        sort=True)

    del df_avg_num_claims_gy, df_avg_num_claims_ay, subset

    df_patent.drop(columns='num_claims', inplace=True)

    ##########################

    # NOTE: df_msa_citation is used below but never defined in this snippet.
    # Assumption: it is the msa_citation table of (patent_id,
    # forward_citation_id) pairs, loaded from a further input:
    df_msa_citation = pd.read_table(
        args.input_list[3],  # msa_citation.tsv.zip (assumed index)
        dtype=np.uint32)

    df_msa_citation = pd.merge(df_msa_citation, df_patent)
    df_msa_citation = pd.merge(
        df_msa_citation, df_patent \
            [['patent_id','grant_date']] \
            .rename(columns={
                'patent_id':'forward_citation_id',
                'grant_date':'forward_citation_grant_date'}))
    # del df_patent

    df_msa_citation['time_length'] = df_msa_citation \
        .forward_citation_grant_date \
        .sub(df_msa_citation.grant_date)

    df_msa_citation = df_msa_citation[
        df_msa_citation.time_length.dt.days <= 10 * 365]

    df_msa_citation_10y = df_msa_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id':'nunique'}) \
        .rename(columns={'forward_citation_id':'num_citations_10y'})

    df_msa_citation = df_msa_citation[
        df_msa_citation.time_length.dt.days <= 5 * 365]

    df_msa_citation_5y = df_msa_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id':'nunique'}) \
        .rename(columns={'forward_citation_id':'num_citations_5y'})

    df_msa_citation = pd.merge(df_msa_citation_5y,
                               df_msa_citation_10y,
                               left_index=True,
                               right_index=True,
                               how='outer')
    del df_msa_citation_5y, df_msa_citation_10y

    df_msa_patent = pd.merge(df_msa_patent,
                             df_msa_citation,
                             left_on='patent_id',
                             right_index=True,
                             how='left')
    del df_msa_citation

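    # Zero-fill missing citation counts, then mask patents granted too
    # recently to have a full 5- or 10-year citation window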
    for years in [5, 10]:
        col = f'num_citations_{years}y'
        threshold = grant_date_last - pd.tseries.offsets.Day(years * 365)
        df_msa_patent[col] = df_msa_patent[col] \
            .fillna(0)
        df_msa_patent.loc[df_msa_patent.grant_date > threshold, col] = np.nan

    ##########################

    # CITATIONS
    # NOTE: df_patent_citation is also undefined in this snippet. Assumption:
    # it is the full-universe citation-count table built as in Example #10
    # (patent_id, grant_year, appln_year, uspc_class, num_citations_5y,
    # num_citations_10y).

    df_avg_num_citations_gy = df_patent_citation \
        .groupby([
            'grant_year',
            'uspc_class']) \
        .agg({
            'num_citations_5y':'mean',
            'num_citations_10y':'mean'}) \
        .rename(columns={
            'num_citations_5y':'avg_num_citations_5y_gy',
            'num_citations_10y':'avg_num_citations_10y_gy'})

    df_msa_patent = pd.merge(df_msa_patent,
                             df_avg_num_citations_gy,
                             left_on=['grant_year', 'uspc_class'],
                             right_index=True,
                             how='left')

    df_avg_num_citations_ay = df_patent_citation \
        .groupby([
            'appln_year',
            'uspc_class']) \
        .agg({
            'num_citations_5y':'mean',
            'num_citations_10y':'mean'}) \
        .rename(columns={
            'num_citations_5y':'avg_num_citations_5y_ay',
            'num_citations_10y':'avg_num_citations_10y_ay'})

    df_msa_patent = pd.merge(df_msa_patent,
                             df_avg_num_citations_ay,
                             left_on=['appln_year', 'uspc_class'],
                             right_index=True,
                             how='left')

    subset = df_msa_patent.uspc_class.isna()

    df_avg_num_citations_gy = df_patent_citation \
        .groupby([
            'grant_year']) \
        .agg({
            'num_citations_5y':'mean',
            'num_citations_10y':'mean'}) \
        .rename(columns={
            'num_citations_5y':'avg_num_citations_5y_gy',
            'num_citations_10y':'avg_num_citations_10y_gy'})

    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns=[
                    'avg_num_citations_5y_gy',
                    'avg_num_citations_10y_gy']),
            df_avg_num_citations_gy,
            left_on=['grant_year'], right_index=True,
            how='left')],
        sort=True)

    df_avg_num_citations_ay = df_patent_citation \
        .groupby([
            'appln_year']) \
        .agg({
            'num_citations_5y':'mean',
            'num_citations_10y':'mean'}) \
        .rename(columns={
            'num_citations_5y':'avg_num_citations_5y_ay',
            'num_citations_10y':'avg_num_citations_10y_ay'})

    df_msa_patent = pd.concat([
        df_msa_patent[~subset],
        pd.merge(
            df_msa_patent[subset] \
                .drop(columns=[
                    'avg_num_citations_5y_ay',
                    'avg_num_citations_10y_ay']),
            df_avg_num_citations_ay,
            left_on=['appln_year'], right_index=True,
            how='left')],
        sort=True)

    ##########################

    df_msa_patent = df_msa_patent[[
            'patent_id',
            'num_claims',
            'num_citations_5y',
            'num_citations_10y',
            'avg_num_claims_gy',
            'avg_num_claims_ay',
            'avg_num_citations_5y_gy',
            'avg_num_citations_10y_gy',
            'avg_num_citations_5y_ay',
            'avg_num_citations_10y_ay']] \
        .drop_duplicates()

    ##########################

    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_msa_patent.to_csv(args.output,
                         sep='\t',
                         index=False,
                         compression={
                             'method': 'zip',
                             'archive_name': file.replace('.zip', '')
                         })
Example #10
def main():
    args = parse_io()

    df_patent = pd.read_table(
        args.input_list[0], # patent.tsv.zip
        usecols=[
            'id',
            'date'],
        dtype={
            'date':str},
        converters={
            'id':convert_patent_id}) \
        .rename(columns={
            'id':'patent_id',
            'date':'grant_date'}) \
        .drop_duplicates() \
        .query('patent_id!=0') \
        .astype({
            'patent_id':np.uint32})

    df_application = pd.read_table(
        args.input_list[1], # application.tsv.zip
        usecols=[
            'patent_id',
            'date',
            'num_claims'],
        dtype={
            'date':str,
            'num_claims':float},
        converters={
            'patent_id':convert_patent_id}) \
        .rename(columns={
            'date':'appln_date'}) \
        .drop_duplicates() \
        .query('patent_id!=0') \
        .astype({
            'patent_id':np.uint32})

    df_patent = pd.merge(
        df_patent, df_application, 
        how='left')
    del df_application

    for date_column in ['grant_date', 'appln_date']:
        df_patent = fix_dates(df_patent, date_column)
        df_patent[date_column] = pd.to_datetime(
            df_patent[date_column])

    df_patent = df_patent[
        (~df_patent.grant_date.isna()) & 
        (~df_patent.appln_date.isna())]

    df_patent['grant_year'] = df_patent.grant_date.dt.year
    df_patent['appln_year'] = df_patent.appln_date.dt.year

    grant_date_last = df_patent.grant_date.max()

    df_patex = pd.read_csv(
        args.input_list[2], # application_data.csv.zip
        usecols=[
            'uspc_class', 
            'patent_number'],
        converters={
            'uspc_class':convert_uspc_class,
            'patent_number':convert_patent_id}) \
        .rename(columns={
            'patent_number':'patent_id'}) \
        .drop_duplicates() \
        .dropna() \
        .query('patent_id!=0 & uspc_class!="XXX"') \
        .astype({
            'uspc_class':str,
            'patent_id':np.uint32})

    df_patex['uspc_class'] = pd.Categorical(df_patex.uspc_class)

    df_patent = pd.merge(
        df_patent, df_patex, 
        how='left')
    del df_patex

    df_patent_citation = pd.read_table(
        args.input_list[3], # uspatentcitation.tsv.zip
        usecols=[
            'patent_id',
            'citation_id'], 
        converters={
            'patent_id':convert_patent_id,
            'citation_id':convert_patent_id}) \
        .rename(columns={
            'patent_id':'forward_citation_id',
            'citation_id':'patent_id'}) \
        .query('patent_id!=0 & forward_citation_id!=0') \
        .astype({
            'patent_id':np.uint32,
            'forward_citation_id':np.uint32})

    df_patent_citation = pd.merge(
        df_patent_citation, df_patent)
    df_patent_citation = pd.merge(
        df_patent_citation, df_patent \
            [['patent_id','grant_date']] \
            .rename(columns={
                'patent_id':'forward_citation_id',
                'grant_date':'forward_citation_grant_date'}))
    # del df_patent

    df_patent_citation['time_length'] = df_patent_citation \
        .forward_citation_grant_date \
        .sub(df_patent_citation.grant_date)

    df_patent_citation = df_patent_citation[
        df_patent_citation.time_length.dt.days <= 10 * 365]

    df_patent_citation_10y = df_patent_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id':'nunique'}) \
        .rename(columns={'forward_citation_id':'num_citations_10y'})

    df_patent_citation = df_patent_citation[
        df_patent_citation.time_length.dt.days <= 5 * 365]

    df_patent_citation_5y = df_patent_citation \
        .groupby('patent_id') \
        .agg({'forward_citation_id':'nunique'}) \
        .rename(columns={'forward_citation_id':'num_citations_5y'})

    df_patent_citation = pd.merge(
        df_patent_citation_5y, df_patent_citation_10y,
        left_index=True, right_index=True,
        how='outer')
    del df_patent_citation_5y, df_patent_citation_10y

    df_patent_citation = pd.merge(
        df_patent_citation, df_patent,
        left_index=True, right_on='patent_id')

    for years in [5,10]:
        col = f'num_citations_{years}y'
        threshold = grant_date_last - pd.tseries.offsets.Day(years*365)
        df_patent_citation[col] = df_patent_citation[col] \
            .fillna(0)
        df_patent_citation.loc[
            df_patent_citation.grant_date > threshold,
            col] = np.nan

    # NOTE: df_patent_citation is computed above but never written out; the
    # snippet may be truncated here, or df_patent below is the intended output.
    out_dir, file = os.path.split(args.output)
    os.makedirs(out_dir, exist_ok=True)

    df_patent.to_csv(
        args.output, 
        sep='\t', 
        index=False, 
        compression={
            'method':'zip',
            'archive_name':file.replace('.zip','')})
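
Example #10 also relies on three helpers the page omits: convert_patent_id, convert_uspc_class, and fix_dates. Hedged sketches, inferred only from how the snippets use them (the 0 and 'XXX' sentinels match the query() filters above; the date bounds are assumptions):

import pandas as pd

def convert_patent_id(value):
    # Utility patents have purely numeric ids; map design ('D...'),
    # plant ('PP...'), reissue ('RE...'), and malformed ids to the 0
    # sentinel filtered out by 'patent_id!=0' above.
    value = str(value).strip()
    return int(value) if value.isdigit() else 0

def convert_uspc_class(value):
    # Normalize a USPC class code; 'XXX' marks unusable values and is
    # dropped by the 'uspc_class!="XXX"' filter above.
    value = str(value).strip()
    return value if value and value.lower() != 'nan' else 'XXX'

def fix_dates(df, date_column):
    # Null out impossible grant/application dates so the later
    # pd.to_datetime call cannot fail; 1790 (first US patent) and
    # today are assumed bounds.
    dates = pd.to_datetime(df[date_column], errors='coerce')
    df[date_column] = dates.where(
        (dates >= '1790-01-01') & (dates <= pd.Timestamp.today()))
    return df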