type=str, default=None, help='database file to store to') parser.add_argument('--limit', type=int, help='only parse n patents') args = parser.parse_args() # connect to patent db con = sqlite3.connect(args.db) cur = con.cursor() cur.execute( 'create table if not exists assign (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)' ) cur.execute( 'create unique index if not exists idx_assign on assign (patnum,execdate,assignor,assignee)' ) chunker = ChunkInserter(con, table='assign') def gen_patnums(patents): for pat in patents: for doc in pat.findall('document-id'): kind = get_text(doc, 'kind') pnum = get_text(doc, 'doc-number') if not kind.startswith('B'): continue yield pnum # parseahol i = 0 o = 0
parser.add_argument('--db', type=str, default=None, help='database file to store to')
args = parser.parse_args()

# open database
con = sqlite3.connect(args.db)
cur = con.cursor()

# create table (dropped and rebuilt on every run)
cur.execute('drop table if exists assign_use')
cur.execute(
    'create table assign_use ('
    'assignid integer primary key, patnum int, execdate text, recdate text, '
    'conveyance text, assignor text, assignee text, '
    'assignee_state text, assignee_country text)'
)
chunker = ChunkInserter(con, table='assign_use')

match_num = 0
rnum = 0
for row in cur.execute('select * from assign'):
    # The layout declared above puts assignor at index 5 and assignee at
    # index 6; this assumes `assign` shares that column order (TODO confirm).
    # The previous code unpacked the two names the other way round.
    assignor, assignee = row[5], row[6]
    assignor_toks = name_standardize_strong(assignor)
    assignee_toks = name_standardize_strong(assignee)

    # share of assignor tokens that also occur in the assignee name,
    # normalised by the mean token count (floored at 1.0 to avoid dividing by 0)
    word_match = 0
    for tok in assignor_toks:
        if tok in assignee_toks:
            word_match += 1
    word_match /= max(1.0, 0.5 * (len(assignor_toks) + len(assignee_toks)))
# database setup con = sqlite3.connect(args.db) cur = con.cursor() cur.execute( 'create table if not exists patent (patnum int, filedate text, grantdate text, class text, ipc text, ipcver text, city text, state text, country text, owner text, claims int, title text, abstract text, gen int)' ) cur.execute('create unique index if not exists idx_patnum on patent (patnum)') cur.execute( 'create table if not exists ipc (patnum int, code text, version text)') cur.execute('create unique index if not exists ipc_pair on ipc (patnum,code)') cur.execute('create index if not exists ipc_patnum on ipc (patnum)') cur.execute('create index if not exists ipc_code on ipc (code)') cur.execute('create table if not exists cite (src int, dst int)') cur.execute('create unique index if not exists cite_pair on cite (src,dst)') pat_chunker = ChunkInserter(con, table='patent') ipc_chunker = ChunkInserter(con, table='ipc') cit_chunker = ChunkInserter(con, table='cite') # fields fields = [ 'patnum', # Patent number 'filedate', # Application date 'grantdate', # Publication date 'class', # US patent classification 'ipc', # IPC codes 'ipcver', # IPC version info 'city', # Assignee city 'state', # State code 'country', # Assignee country 'owner', # Assignee name
parser.add_argument('--db', type=str, default=None, help='database file to store to') parser.add_argument('--limit', type=int, help='only parse n patents') args = parser.parse_args() # database setup con = sqlite3.connect(args.db) cur = con.cursor() cur.execute('create table if not exists patent (patnum int, filedate text, grantdate text, ipc text, ipcver text, state text, country text, owner text, claims int, title text, abstract text, gen int)') cur.execute('create unique index if not exists idx_patnum on patent (patnum)') cur.execute('create table if not exists ipc (patnum int, code text, version text)') cur.execute('create unique index if not exists ipc_pair on ipc (patnum,code)') cur.execute('create index if not exists ipc_patnum on ipc (patnum)') cur.execute('create index if not exists ipc_code on ipc (code)') cur.execute('create table if not exists cite (src int, dst int)') cur.execute('create unique index if not exists cite_pair on cite (src,dst)') pat_chunker = ChunkInserter(con, table='patent') ipc_chunker = ChunkInserter(con, table='ipc') cit_chunker = ChunkInserter(con, table='cite') # fields fields = [ 'patnum', # Patent number 'filedate', # Application date 'grantdate', # Publication date 'ipc', # IPC codes 'ipcver', # IPC version info 'state', # Province code 'country', # Application Country 'owner', # Applicant name 'claims', # Independent claim 'title', # Title
# MAIN SECTION # parse input arguments parser = argparse.ArgumentParser(description='USPTO patent parser.') parser.add_argument('target', type=str, nargs='*', help='path or directory of file(s) to parse') parser.add_argument('--db', type=str, default=None, help='database file to store to') parser.add_argument('--limit', type=int, help='only parse n patents') args = parser.parse_args() # connect to patent db con = sqlite3.connect(args.db) cur = con.cursor() cur.execute('create table if not exists assign (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)') cur.execute('create unique index if not exists idx_assign on assign (patnum,execdate)') chunker = ChunkInserter(con, table='assign') def gen_patnums(patents): for pat in patents: for doc in pat.findall('document-id'): kind = get_text(doc, 'kind') pnum = get_text(doc, 'doc-number') if not kind.startswith('B'): continue yield pnum # parseahol i = 0 o = 0 p = 0 def parse_gen3(fname_in):
pp.feed('</root>\n') return parse_all() # parse input arguments parser = argparse.ArgumentParser(description='patent application parser') parser.add_argument('target', type=str, nargs='*', help='path of file to parse') parser.add_argument('--db', type=str, default=None, help='database file to store to') parser.add_argument('--output', type=str, default=100000, help='how often to output summary') args = parser.parse_args() # database setup con = sqlite3.connect(args.db) cur = con.cursor() cur.execute('create table if not exists apply (%s)' % sig) cur.execute('create unique index if not exists idx_appnum on apply (appnum)') chunker = ChunkInserter(con, table='apply') # fields fields = [ 'appnum', # Patent number 'filedate', # Application date 'grantdate', # Publication date 'class', # US patent classification 'ipc', # IPC codes 'ipcver', # IPC version info 'city', # Assignee city 'state', # State code 'country', # Assignee country 'owner', # Assignee name 'claims', # Independent claim 'title', # Title