def gen_patnums(patents):
    """Yield document numbers of the 'B'-kind documents under ``patents``.

    Iterates over each child of ``patents``, looks at every ``document-id``
    element inside it, and yields the ``doc-number`` text of those whose
    ``kind`` text starts with ``'B'``.
    """
    for prop in patents:
        for docid in prop.findall('document-id'):
            doc_kind = get_text(docid, 'kind')
            doc_number = get_text(docid, 'doc-number')
            # only documents whose kind code begins with 'B' are emitted
            if doc_kind.startswith('B'):
                yield doc_number
def parse_grants_gen2(elem):
    """Build a patent record from a generation-2 XML element and store it.

    Reads publication, filing, title, IPC, correspondence-address and
    abstract data from ``elem`` into a copy of the module-level ``default``
    record. Optional sections that are absent from the document leave the
    corresponding default values untouched.

    Args:
        elem: XML element expected to contain a
            ``subdoc-bibliographic-information`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = copy(default)

    # top-level section
    bib = elem.find('subdoc-bibliographic-information')

    # publication data
    pub = bib.find('document-id')
    if pub is not None:
        pat['pubnum'] = get_text(pub, 'doc-number')
        pat['pubdate'] = get_text(pub, 'document-date')

    # application data
    app = bib.find('domestic-filing-data')
    if app is not None:
        pat['appnum'] = get_text(app, 'application-number/doc-number')
        pat['appdate'] = get_text(app, 'filing-date')
    pat['appname'] = get_text(bib, 'assignee/organization-name')

    # title and ipc code both live under technical-information; guard it the
    # same way the other optional sections are guarded
    tech = bib.find('technical-information')
    if tech is not None:
        pat['title'] = get_text(tech, 'title-of-invention')

        ipcsec = tech.find('classification-ipc')
        if ipcsec is not None:
            # the edition is read only once the section is known to exist
            pat['ipcver'] = get_text(ipcsec, 'classification-ipc-edition')
            ipclist = list(gen2_ipc(ipcsec))
            if ipclist:
                pat['ipc1'] = ipclist[0]
            pat['ipc2'] = ';'.join(ipclist)

    # applicant info
    address = bib.find('correspondence-address/address')
    if address is not None:
        pat['city'] = get_text(address, 'city')
        pat['state'] = get_text(address, 'state')
        pat['country'] = get_text(address, 'country/country-code')

    # abstract
    abst = elem.find('subdoc-abstract')
    if abst is not None:
        pat['abstract'] = raw_text(abst, sep=' ')

    # roll it in
    return store_patent(pat)
def parse_gen3(fname_in):
    """Stream ``patent-assignment`` records from ``fname_in`` into ``chunker``.

    For every assignment record the first assignor and first assignee are
    read, together with the conveyance text, dates and location. Records
    with no 'B'-kind patent numbers are skipped silently. Records where
    either party is classified ``ORG_INDV`` or the conveyance is classified
    ``CONV_OTHER`` are skipped and counted in the global ``o``. One row per
    patent number is inserted for each remaining record.

    Updates the module-level counters ``i`` (records stored), ``o``
    (records dropped) and ``p`` (patent rows written).

    Args:
        fname_in: path or file object handed to ``iterparse``.

    Returns:
        False if ``args.limit`` is set and was reached, True otherwise.
    """
    global i, o, p

    for _, elem in iterparse(fname_in, tag='patent-assignment', events=['end'], recover=True):
        # top-level section
        record = elem.find('assignment-record')
        assignor = elem.find('patent-assignors')[0]
        assignee = elem.find('patent-assignees')[0]
        patents = elem.find('patent-properties')

        # conveyance
        convey = get_text(record, 'conveyance-text')

        # names
        assignor_name = get_text(assignor, 'name')
        assignee_name = get_text(assignee, 'name')

        # dates
        exec_sec = assignor.find('execution-date')
        recd_sec = record.find('recorded-date')
        exec_date = get_text(exec_sec, 'date') if exec_sec is not None else ''
        recd_date = get_text(recd_sec, 'date') if recd_sec is not None else ''

        # location
        assignee_country = get_text(assignee, 'country-name', default='UNITED STATES')
        assignee_state = get_text(assignee, 'state')

        # patent info
        patnums = list(gen_patnums(patents))
        npat = len(patnums)

        # free memory: everything needed has been copied out of the element,
        # so drop its children now (before any skip) to keep the streamed
        # tree from growing with the size of the input file
        elem.clear()

        if npat == 0:
            continue

        # code names
        src_type = org_type(assignor_name)
        dst_type = org_type(assignee_name)
        ctype = convey_type(convey)

        # throw out individuals
        if src_type == ORG_INDV or dst_type == ORG_INDV or ctype == CONV_OTHER:
            o += 1
            continue

        # output
        for pn in patnums:
            chunker.insert(None, pn, exec_date, recd_date, convey, assignor_name,
                           assignee_name, assignee_state, assignee_country)

        # stats
        i += 1
        p += npat
        if i % 100 == 0:
            print('%4d: %40.40s -> %30.30s (%20.20s, %20.20s)' % (
                npat, assignor_name, assignee_name, assignee_state, assignee_country))

        # break
        if args.limit and i >= args.limit:
            return False

    return True
def parse_grants_gen3(elem):
    """Build a patent record from a generation-3 XML element and store it.

    Reads publication, filing, title, IPC, applicant-address and abstract
    data from ``elem`` into a copy of the module-level ``default`` record.
    Optional sections that are absent, or an IPC section that yields no
    codes, leave the corresponding default values untouched.

    Args:
        elem: XML element expected to contain a
            ``us-bibliographic-data-application`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = copy(default)

    # top-level section
    bib = elem.find('us-bibliographic-data-application')
    pubref = bib.find('publication-reference')
    appref = bib.find('application-reference')

    # published patent
    pubinfo = pubref.find('document-id') if pubref is not None else None
    if pubinfo is not None:
        pat['pubnum'] = get_text(pubinfo, 'doc-number')
        pat['pubdate'] = get_text(pubinfo, 'date')

    # filing date
    if appref is not None:
        pat['appnum'] = get_text(appref, 'document-id/doc-number')
        pat['appdate'] = get_text(appref, 'document-id/date')
    pat['appname'] = get_text(bib, 'assignees/assignee/orgname')

    # title
    pat['title'] = get_text(bib, 'invention-title')

    # ipc code: two alternative layouts are handled in turn; when both are
    # present the later one ('classification-ipc') wins, as before
    ipc_layouts = (
        ('classifications-ipcr', gen3_ipcr),
        ('classification-ipc', gen3_ipc),
    )
    for tag, gen_codes in ipc_layouts:
        ipcsec = bib.find(tag)
        if ipcsec is None:
            continue
        ipclist = list(gen_codes(ipcsec))
        if not ipclist:
            # an empty container would otherwise raise IndexError below
            continue
        pat['ipc1'], pat['ipcver'] = ipclist[0]
        pat['ipc2'] = ';'.join([code for code, _ in ipclist])

    # applicant name and address
    address = bib.find('parties/applicants/applicant/addressbook/address')
    if address is not None:
        pat['city'] = get_text(address, 'city')
        pat['state'] = get_text(address, 'state')
        pat['country'] = get_text(address, 'country')

    # abstract
    abspar = elem.find('abstract')
    if abspar is not None:
        pat['abstract'] = raw_text(abspar, sep=' ')

    # roll it in
    return store_patent(pat)
def gen3_ipcr(ipcsec):
    """Yield ``(code, version)`` pairs for each ``classification-ipcr`` child.

    The code is the concatenation of the section, class, subclass,
    main-group and subgroup texts; the version is the text found at
    ``ipc-version-indicator/date``.
    """
    code_fields = ('section', 'class', 'subclass', 'main-group', 'subgroup')
    for node in ipcsec.findall('classification-ipcr'):
        pieces = tuple(get_text(node, field) for field in code_fields)
        code = '%s%s%s%s%s' % pieces
        version = get_text(node, 'ipc-version-indicator/date')
        yield (code, version)
def parse_gen3(fname_in):
    """Stream ``patent-assignment`` records from ``fname_in`` into ``chunker``.

    For every assignment record the first assignor and first assignee are
    read, together with the conveyance text, dates and location. Records
    with no 'B'-kind patent numbers are skipped silently. Records where
    either party is classified ``ORG_INDV`` or the conveyance is classified
    ``CONV_OTHER`` are skipped and counted in the global ``o``. One row per
    patent number is inserted for each remaining record.

    Updates the module-level counters ``i`` (records stored), ``o``
    (records dropped) and ``p`` (patent rows written).

    Args:
        fname_in: path or file object handed to ``iterparse``.

    Returns:
        False if ``args.limit`` is set and was reached, True otherwise.
    """
    global i, o, p

    for _, elem in iterparse(fname_in, tag='patent-assignment', events=['end'], recover=True):
        # top-level section
        record = elem.find('assignment-record')
        assignor = elem.find('patent-assignors')[0]
        assignee = elem.find('patent-assignees')[0]
        patents = elem.find('patent-properties')

        # conveyance
        convey = get_text(record, 'conveyance-text')

        # names
        assignor_name = get_text(assignor, 'name')
        assignee_name = get_text(assignee, 'name')

        # dates
        exec_sec = assignor.find('execution-date')
        recd_sec = record.find('recorded-date')
        exec_date = get_text(exec_sec, 'date') if exec_sec is not None else ''
        recd_date = get_text(recd_sec, 'date') if recd_sec is not None else ''

        # location
        assignee_country = get_text(assignee, 'country-name', default='UNITED STATES')
        assignee_state = get_text(assignee, 'state')

        # patent info
        patnums = list(gen_patnums(patents))
        npat = len(patnums)

        # free memory: all values needed below have been copied out of the
        # element, so release it here -- ahead of the skip branches -- so
        # that skipped records are released as well
        clear(elem)

        if npat == 0:
            continue

        # code names
        src_type = org_type(assignor_name)
        dst_type = org_type(assignee_name)
        ctype = convey_type(convey)

        # throw out individuals
        if src_type == ORG_INDV or dst_type == ORG_INDV or ctype == CONV_OTHER:
            o += 1
            continue

        # output
        for pn in patnums:
            chunker.insert(None, pn, exec_date, recd_date, convey, assignor_name,
                           assignee_name, assignee_state, assignee_country)

        # stats
        i += 1
        p += npat

        # logging
        if i % 1000 == 0:
            print('%4d: %40.40s -> %30.30s (%20.20s, %20.20s)' % (
                npat, assignor_name, assignee_name, assignee_state, assignee_country))

        # break
        if args.limit and i >= args.limit:
            return False

    return True
def handle_patent(elem):
    """Extract a generation-3 grant record from ``elem`` and store it.

    Collects patent number, grant and filing dates, title, IPC codes (from
    either the ``classifications-ipcr`` or the ``classification-ipc``
    layout), US class, claim count, cited patents, assignee name and
    address, and the abstract into a ``defaultdict(str)``.

    Args:
        elem: XML element expected to contain a
            ``us-bibliographic-data-grant`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = defaultdict(str)
    pat['gen'] = 3

    # top-level section
    bib = elem.find('us-bibliographic-data-grant')
    pubref = bib.find('publication-reference')
    appref = bib.find('application-reference')

    # published patent
    pubinfo = pubref.find('document-id')
    pat['patnum'] = get_text(pubinfo, 'doc-number')
    pat['grantdate'] = get_text(pubinfo, 'date')

    # filing date
    pat['filedate'] = get_text(appref, 'document-id/date')

    # title
    pat['title'] = get_text(bib, 'invention-title')

    # ipc code
    ipclist = []
    ipcsec = bib.find('classifications-ipcr')
    if ipcsec is not None:
        for ipc in ipcsec.findall('classification-ipcr'):
            code = '%s%s%s%3s%s' % (
                get_text(ipc, 'section'),
                get_text(ipc, 'class'),
                get_text(ipc, 'subclass'),
                get_text(ipc, 'main-group'),
                get_text(ipc, 'subgroup'),
            )
            ipclist.append((code, get_text(ipc, 'ipc-version-indicator/date')))

    ipcsec = bib.find('classification-ipc')
    if ipcsec is not None:
        ipcver = get_text(ipcsec, 'edition')
        ipc0 = ipcsec.find('main-classification')
        for ipc in chain([ipc0], ipcsec.findall('further-classification')):
            # a missing main classification or an empty node has no text to
            # slice; skip it instead of raising
            if ipc is None or ipc.text is None:
                continue
            itxt = ipc.text
            # blank out zero padding in the group digits and drop the slash
            itxt = itxt[:4] + itxt[4:7].replace('0', ' ') + itxt[7:].replace('/', '')
            ipclist.append((itxt, ipcver))
    pat['ipclist'] = ipclist

    # us class
    oclsec = bib.find('classification-national')
    if oclsec is not None:
        pat['class'] = get_text(oclsec, 'main-classification')

    # claims
    pat['claims'] = get_text(bib, 'number-of-claims')

    # citations: the section and its entries share an optional 'us-' prefix
    refs = bib.find('references-cited')
    prefix = ''
    if refs is None:
        refs = bib.find('us-references-cited')
        prefix = 'us-'

    cites = []
    if refs is not None:
        for cite in refs.findall(prefix + 'citation'):
            pcite = cite.find('patcit')
            if pcite is None:
                continue
            docid = pcite.find('document-id')
            if docid is None:
                continue
            pnum = get_text(docid, 'doc-number')
            kind = get_text(docid, 'kind')
            # keep only citations whose kind is 'A' or starts with 'B'
            if kind == 'A' or kind.startswith('B'):
                cites.append(pnum)
    pat['citlist'] = cites

    # applicant name and address
    assignee = bib.find('assignees/assignee/addressbook')
    if assignee is not None:
        pat['owner'] = get_text(assignee, 'orgname').upper()
        address = assignee.find('address')
        if address is not None:
            pat['city'] = get_text(address, 'city').upper()
            pat['state'] = get_text(address, 'state')
            pat['country'] = get_text(address, 'country')

    # abstract
    abspar = elem.find('abstract')
    if abspar is not None:
        pat['abstract'] = raw_text(abspar, sep=' ')

    # roll it in
    return store_patent(pat)
def handle_patent(elem):
    """Extract a generation-2 grant record from ``elem`` and store it.

    Collects patent number, grant and filing dates, IPC codes, US class,
    cited patents, title, claim count, owner name and address, and the
    abstract from the ``SDOBI``/``SDOAB`` sections into a
    ``defaultdict(str)``.

    Args:
        elem: XML element expected to contain an ``SDOBI`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = defaultdict(str)
    pat['gen'] = 2

    # top-level section
    bib = elem.find('SDOBI')

    # published patent
    pubref = bib.find('B100')
    pat['patnum'] = get_text(pubref, 'B110/DNUM/PDAT')
    pat['grantdate'] = get_text(pubref, 'B140/DATE/PDAT')

    # filing date
    appref = bib.find('B200')
    pat['filedate'] = get_text(appref, 'B220/DATE/PDAT')

    # ipc code
    patref = bib.find('B500')
    ipclist = []
    ipcsec = patref.find('B510')
    if ipcsec is not None:
        ipcver = get_text(ipcsec, 'B516/PDAT')
        ipc1 = get_text(ipcsec, 'B511/PDAT')
        # get_text results are used as strings elsewhere in this function
        # (e.g. .upper()), so a missing primary code presumably comes back
        # empty rather than None; test truthiness so neither case adds a
        # blank entry
        if ipc1:
            ipclist.append((ipc1, ipcver))
        for child in ipcsec.findall('B512'):
            ipclist.append((get_text(child, 'PDAT'), ipcver))
    pat['ipclist'] = ipclist

    # us class
    pat['class'] = get_text(patref, 'B520/B521/PDAT')

    # citations
    cites = []
    refs = patref.find('B560')
    if refs is not None:
        cites = [get_text(cite, 'PCIT/DOC/DNUM/PDAT') for cite in refs.findall('B561')]
    pat['citlist'] = cites

    # title
    pat['title'] = get_text(patref, 'B540/STEXT/PDAT')

    # claims
    pat['claims'] = get_text(patref, 'B570/B577/PDAT')

    # applicant name and address
    ownref = bib.find('B700/B730/B731/PARTY-US')
    if ownref is not None:
        pat['owner'] = get_text(ownref, 'NAM/ONM/STEXT/PDAT').upper()
        address = ownref.find('ADR')
        if address is not None:
            pat['city'] = get_text(address, 'CITY/PDAT').upper()
            pat['state'] = get_text(address, 'STATE/PDAT')
            pat['country'] = get_text(address, 'CTRY/PDAT', default='US')

    # abstract
    abspars = elem.findall('SDOAB/BTEXT/PARA')
    if abspars:
        pat['abstract'] = '\n'.join([raw_text(e) for e in abspars])

    # roll it in
    return store_patent(pat)
def handle_patent(elem):
    """Extract a generation-2 grant record from ``elem`` and store it.

    Collects patent number, grant and filing dates, IPC codes, cited
    patents, title, claim count, owner name, state and country, and the
    abstract from the ``SDOBI``/``SDOAB`` sections into a
    ``defaultdict(str)``.

    Args:
        elem: XML element expected to contain an ``SDOBI`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = defaultdict(str)
    pat['gen'] = 2

    # top-level section
    bib = elem.find('SDOBI')

    # published patent
    pubref = bib.find('B100')
    pat['patnum'] = get_text(pubref, 'B110/DNUM/PDAT')
    pat['grantdate'] = get_text(pubref, 'B140/DATE/PDAT')

    # filing date
    appref = bib.find('B200')
    pat['filedate'] = get_text(appref, 'B220/DATE/PDAT')

    # ipc code
    patref = bib.find('B500')
    ipclist = []
    ipcsec = patref.find('B510')
    if ipcsec is not None:
        ipcver = get_text(ipcsec, 'B516/PDAT')
        ipc1 = get_text(ipcsec, 'B511/PDAT')
        # get_text results are used as strings elsewhere in this function
        # (e.g. .upper()), so a missing primary code presumably comes back
        # empty rather than None; test truthiness so neither case adds a
        # blank entry
        if ipc1:
            ipclist.append((ipc1, ipcver))
        for child in ipcsec.findall('B512'):
            ipclist.append((get_text(child, 'PDAT'), ipcver))
    pat['ipclist'] = ipclist

    # citations
    cites = []
    refs = patref.find('B560')
    if refs is not None:
        cites = [get_text(cite, 'PCIT/DOC/DNUM/PDAT') for cite in refs.findall('B561')]
    pat['citlist'] = cites

    # title
    pat['title'] = get_text(patref, 'B540/STEXT/PDAT')

    # claims
    pat['claims'] = get_text(patref, 'B570/B577/PDAT')

    # applicant name and address
    ownref = bib.find('B700/B730/B731/PARTY-US')
    if ownref is not None:
        pat['owner'] = get_text(ownref, 'NAM/ONM/STEXT/PDAT').upper()
        address = ownref.find('ADR')
        if address is not None:
            pat['state'] = get_text(address, 'STATE/PDAT')
            pat['country'] = get_text(address, 'CTRY/PDAT', default='US')

    # abstract
    abspars = elem.findall('SDOAB/BTEXT/PARA')
    if abspars:
        pat['abstract'] = '\n'.join([raw_text(e) for e in abspars])

    # roll it in
    return store_patent(pat)
def handle_patent(elem):
    """Extract a generation-3 grant record from ``elem`` and store it.

    Collects patent number, grant and filing dates, title, IPC codes (from
    either the ``classifications-ipcr`` or the ``classification-ipc``
    layout), claim count, cited patents, assignee name, state and country,
    and the abstract into a ``defaultdict(str)``.

    Args:
        elem: XML element expected to contain a
            ``us-bibliographic-data-grant`` child.

    Returns:
        Whatever ``store_patent`` returns for the populated record.
    """
    pat = defaultdict(str)
    pat['gen'] = 3

    # top-level section
    bib = elem.find('us-bibliographic-data-grant')
    pubref = bib.find('publication-reference')
    appref = bib.find('application-reference')

    # published patent
    pubinfo = pubref.find('document-id')
    pat['patnum'] = get_text(pubinfo, 'doc-number')
    pat['grantdate'] = get_text(pubinfo, 'date')

    # filing date
    pat['filedate'] = get_text(appref, 'document-id/date')

    # title
    pat['title'] = get_text(bib, 'invention-title')

    # ipc code
    ipclist = []
    ipcsec = bib.find('classifications-ipcr')
    if ipcsec is not None:
        for ipc in ipcsec.findall('classification-ipcr'):
            code = '%s%s%s%3s%s' % (
                get_text(ipc, 'section'),
                get_text(ipc, 'class'),
                get_text(ipc, 'subclass'),
                get_text(ipc, 'main-group'),
                get_text(ipc, 'subgroup'),
            )
            ipclist.append((code, get_text(ipc, 'ipc-version-indicator/date')))

    ipcsec = bib.find('classification-ipc')
    if ipcsec is not None:
        ipcver = get_text(ipcsec, 'edition')
        ipc0 = ipcsec.find('main-classification')
        for ipc in chain([ipc0], ipcsec.findall('further-classification')):
            # a missing main classification or an empty node has no text to
            # slice; skip it instead of raising
            if ipc is None or ipc.text is None:
                continue
            itxt = ipc.text
            # blank out zero padding in the group digits and drop the slash
            itxt = itxt[:4] + itxt[4:7].replace('0', ' ') + itxt[7:].replace('/', '')
            ipclist.append((itxt, ipcver))
    pat['ipclist'] = ipclist

    # claims
    pat['claims'] = get_text(bib, 'number-of-claims')

    # citations: the section and its entries share an optional 'us-' prefix
    refs = bib.find('references-cited')
    prefix = ''
    if refs is None:
        refs = bib.find('us-references-cited')
        prefix = 'us-'

    cites = []
    if refs is not None:
        for cite in refs.findall(prefix + 'citation'):
            pcite = cite.find('patcit')
            if pcite is None:
                continue
            docid = pcite.find('document-id')
            if docid is None:
                continue
            pnum = get_text(docid, 'doc-number')
            kind = get_text(docid, 'kind')
            # keep only citations whose kind is 'A' or starts with 'B'
            if kind == 'A' or kind.startswith('B'):
                cites.append(pnum)
    pat['citlist'] = cites

    # applicant name and address
    assignee = bib.find('assignees/assignee/addressbook')
    if assignee is not None:
        pat['owner'] = get_text(assignee, 'orgname').upper()
        address = assignee.find('address')
        if address is not None:
            pat['state'] = get_text(address, 'state')
            pat['country'] = get_text(address, 'country')

    # abstract
    abspar = elem.find('abstract')
    if abspar is not None:
        pat['abstract'] = raw_text(abspar, sep=' ')

    # roll it in
    return store_patent(pat)
def gen3_ipc(ipcsec):
    """Yield ``(code, edition)`` pairs from a ``classification-ipc`` section.

    The main classification is always yielded first, followed by one pair
    per ``further-classification`` child. A child without text contributes
    an empty string as its code. Every pair carries the section's
    ``edition`` text.
    """
    edition = get_text(ipcsec, 'edition')
    main_code = get_text(ipcsec, 'main-classification')
    yield main_code, edition

    for extra in ipcsec.findall('further-classification'):
        text = extra.text
        yield (text if text else ''), edition
def gen2_ipc(ipcsec):
    """Yield the ``ipc`` text of the primary and then each secondary entry.

    The ``classification-ipc-primary`` child, when present, comes first;
    every ``classification-ipc-secondary`` child follows in document order.
    """
    entries = []
    primary = ipcsec.find('classification-ipc-primary')
    if primary is not None:
        entries.append(primary)
    entries.extend(ipcsec.findall('classification-ipc-secondary'))

    for entry in entries:
        yield get_text(entry, 'ipc')