Ejemplo n.º 1
0
def gen_patnums(patents):
    """Yield the document numbers of the 'B'-kind documents under ``patents``.

    ``patents`` is iterated directly; every item is searched for
    'document-id' children, and a child's 'doc-number' text is yielded when
    its 'kind' text starts with 'B'.

    ``None`` is accepted and treated as an empty collection, so a caller that
    obtained ``patents`` from ``find()`` on a record without that section gets
    no numbers instead of a ``TypeError``.
    """
    if patents is None:
        return

    for pat in patents:
        for doc in pat.findall('document-id'):
            kind = get_text(doc, 'kind')
            pnum = get_text(doc, 'doc-number')
            if kind.startswith('B'):
                yield pnum
Ejemplo n.º 2
0
def gen_patnums(patents):
    """Yield the document numbers of the 'B'-kind documents under ``patents``.

    ``patents`` is iterated directly; every item is searched for
    'document-id' children, and a child's 'doc-number' text is yielded when
    its 'kind' text starts with 'B'.

    ``None`` is accepted and treated as an empty collection, so a caller that
    obtained ``patents`` from ``find()`` on a record without that section gets
    no numbers instead of a ``TypeError``.
    """
    if patents is None:
        return

    for pat in patents:
        for doc in pat.findall('document-id'):
            kind = get_text(doc, 'kind')
            pnum = get_text(doc, 'doc-number')
            if kind.startswith('B'):
                yield pnum
Ejemplo n.º 3
0
def parse_grants_gen2(elem):
    """Build a patent record from ``elem`` and hand it to ``store_patent``.

    The record starts as a copy of the module-level ``default`` mapping and is
    filled from the 'subdoc-bibliographic-information' section (publication
    and application data, assignee name, title, IPC codes, correspondence
    address) and from the 'subdoc-abstract' section.  Optional sections that
    are absent leave the corresponding defaults untouched.

    Returns whatever ``store_patent`` returns.
    """
    pat = copy(default)

    # top-level section
    bib = elem.find('subdoc-bibliographic-information')

    # publication data
    pub = bib.find('document-id')
    if pub is not None:
        pat['pubnum'] = get_text(pub, 'doc-number')
        pat['pubdate'] = get_text(pub, 'document-date')

    # application data
    app = bib.find('domestic-filing-data')
    if app is not None:
        pat['appnum'] = get_text(app, 'application-number/doc-number')
        pat['appdate'] = get_text(app, 'filing-date')
    pat['appname'] = get_text(bib, 'assignee/organization-name')

    # title and ipc code both hang off 'technical-information'; find()
    # returns None when a section is missing, so each level is checked before
    # it is dereferenced (the edition used to be read ahead of the None check)
    tech = bib.find('technical-information')
    if tech is not None:
        pat['title'] = get_text(tech, 'title-of-invention')

        ipcsec = tech.find('classification-ipc')
        if ipcsec is not None:
            pat['ipcver'] = get_text(ipcsec, 'classification-ipc-edition')
            ipclist = list(gen2_ipc(ipcsec))
            if ipclist:
                # first code on its own, then all codes joined with ';'
                pat['ipc1'] = ipclist[0]
                pat['ipc2'] = ';'.join(ipclist)

    # applicant info
    address = bib.find('correspondence-address/address')
    if address is not None:
        pat['city'] = get_text(address, 'city')
        pat['state'] = get_text(address, 'state')
        pat['country'] = get_text(address, 'country/country-code')

    # abstract
    abst = elem.find('subdoc-abstract')
    if abst is not None:
        pat['abstract'] = raw_text(abst, sep=' ')

    # roll it in
    return store_patent(pat)
Ejemplo n.º 4
0
def parse_gen3(fname_in):
    """Stream 'patent-assignment' records from ``fname_in`` into ``chunker``.

    For every record the conveyance text, first assignor, first assignee,
    execution/recorded dates and the 'B'-kind patent numbers are extracted.
    Records without patent numbers are skipped silently; records where either
    party is classified as ``ORG_INDV`` or the conveyance as ``CONV_OTHER``
    are skipped and counted in the global ``o``.  Each remaining record
    inserts one row per patent number and updates the globals ``i`` (records
    kept) and ``p`` (patent numbers kept).

    Returns ``False`` as soon as ``i`` reaches ``args.limit`` (when a limit is
    set) and ``True`` once the input is exhausted.
    """
    global i, o, p

    for (_, elem) in iterparse(fname_in, tag='patent-assignment', events=['end'], recover=True):
        # top-level section
        record = elem.find('assignment-record')
        assignor = elem.find('patent-assignors')[0]
        assignee = elem.find('patent-assignees')[0]
        patents = elem.find('patent-properties')

        # conveyance
        convey = get_text(record, 'conveyance-text')

        # names
        assignor_name = get_text(assignor, 'name')
        assignee_name = get_text(assignee, 'name')

        # dates
        exec_sec = assignor.find('execution-date')
        recd_sec = record.find('recorded-date')

        exec_date = get_text(exec_sec, 'date') if exec_sec is not None else ''
        recd_date = get_text(recd_sec, 'date') if recd_sec is not None else ''

        # location
        assignee_country = get_text(assignee, 'country-name', default='UNITED STATES')
        assignee_state = get_text(assignee, 'state')

        # patent info
        patnums = list(gen_patnums(patents))

        # free memory: nothing below touches the element again, so drop its
        # children now; doing it ahead of the filters releases skipped
        # records as well, otherwise the whole file accumulates in memory
        elem.clear()

        npat = len(patnums)
        if npat == 0:
            continue

        # code names
        src_type = org_type(assignor_name)
        dst_type = org_type(assignee_name)
        ctype = convey_type(convey)

        # throw out individuals
        if src_type == ORG_INDV or dst_type == ORG_INDV or ctype == CONV_OTHER:
            o += 1
            continue

        # output
        for pn in patnums:
            chunker.insert(None, pn, exec_date, recd_date, convey,
                           assignor_name, assignee_name, assignee_state,
                           assignee_country)

        # stats
        i += 1
        p += npat

        # progress line every 100 kept records
        if i % 100 == 0:
            print('%4d: %40.40s -> %30.30s (%20.20s, %20.20s)' % (npat, assignor_name, assignee_name, assignee_state, assignee_country))

        # break
        if args.limit and i >= args.limit:
            return False

    return True
Ejemplo n.º 5
0
def parse_grants_gen3(elem):
    """Build a patent record from ``elem`` and hand it to ``store_patent``.

    The record starts as a copy of the module-level ``default`` mapping and is
    filled from the 'us-bibliographic-data-application' section (publication
    and application references, assignee name, title, IPC codes, first
    applicant address) and from the 'abstract' element.

    Returns whatever ``store_patent`` returns.
    """
    pat = copy(default)

    # top-level section
    bib = elem.find('us-bibliographic-data-application')
    pubref = bib.find('publication-reference')
    appref = bib.find('application-reference')

    # published patent
    pubinfo = pubref.find('document-id')
    pat['pubnum'] = get_text(pubinfo, 'doc-number')
    pat['pubdate'] = get_text(pubinfo, 'date')

    # filing date
    pat['appnum'] = get_text(appref, 'document-id/doc-number')
    pat['appdate'] = get_text(appref, 'document-id/date')
    pat['appname'] = get_text(bib, 'assignees/assignee/orgname')

    # title
    pat['title'] = get_text(bib, 'invention-title')

    # ipc code: both layouts are handled the same way and are read in this
    # order, so values from 'classification-ipc' overwrite those from
    # 'classifications-ipcr' when a record carries both
    ipc_layouts = (
        ('classifications-ipcr', gen3_ipcr),
        ('classification-ipc', gen3_ipc),
    )
    for tag, gen_codes in ipc_layouts:
        ipcsec = bib.find(tag)
        if ipcsec is None:
            continue
        ipclist = list(gen_codes(ipcsec))
        if not ipclist:
            # a section with no entries has no first code to unpack
            continue
        pat['ipc1'], pat['ipcver'] = ipclist[0]
        pat['ipc2'] = ';'.join([code for code, _ in ipclist])

    # applicant name and address
    address = bib.find('parties/applicants/applicant/addressbook/address')
    if address is not None:
        pat['city'] = get_text(address, 'city')
        pat['state'] = get_text(address, 'state')
        pat['country'] = get_text(address, 'country')

    # abstract
    abspar = elem.find('abstract')
    if abspar is not None:
        pat['abstract'] = raw_text(abspar, sep=' ')

    # roll it in
    return store_patent(pat)
Ejemplo n.º 6
0
def gen3_ipcr(ipcsec):
    """Yield a ``(code, version)`` pair per 'classification-ipcr' child.

    The code is the concatenation of the child's 'section', 'class',
    'subclass', 'main-group' and 'subgroup' texts, in that order; the version
    is the text found at 'ipc-version-indicator/date'.
    """
    parts = ('section', 'class', 'subclass', 'main-group', 'subgroup')
    for entry in ipcsec.findall('classification-ipcr'):
        code = '%s%s%s%s%s' % tuple(get_text(entry, part) for part in parts)
        version = get_text(entry, 'ipc-version-indicator/date')
        yield (code, version)
Ejemplo n.º 7
0
def parse_gen3(fname_in):
    """Stream 'patent-assignment' records from ``fname_in`` into ``chunker``.

    For every record the conveyance text, first assignor, first assignee,
    execution/recorded dates and the 'B'-kind patent numbers are extracted.
    Records without patent numbers are skipped silently; records where either
    party is classified as ``ORG_INDV`` or the conveyance as ``CONV_OTHER``
    are skipped and counted in the global ``o``.  Each remaining record
    inserts one row per patent number and updates the globals ``i`` (records
    kept) and ``p`` (patent numbers kept).

    Returns ``False`` as soon as ``i`` reaches ``args.limit`` (when a limit is
    set) and ``True`` once the input is exhausted.
    """
    global i, o, p

    for (_, elem) in iterparse(fname_in,
                               tag='patent-assignment',
                               events=['end'],
                               recover=True):
        # top-level section
        record = elem.find('assignment-record')
        assignor = elem.find('patent-assignors')[0]
        assignee = elem.find('patent-assignees')[0]
        patents = elem.find('patent-properties')

        # conveyance
        convey = get_text(record, 'conveyance-text')

        # names
        assignor_name = get_text(assignor, 'name')
        assignee_name = get_text(assignee, 'name')

        # dates
        exec_sec = assignor.find('execution-date')
        recd_sec = record.find('recorded-date')

        exec_date = get_text(exec_sec, 'date') if exec_sec is not None else ''
        recd_date = get_text(recd_sec, 'date') if recd_sec is not None else ''

        # location
        assignee_country = get_text(assignee,
                                    'country-name',
                                    default='UNITED STATES')
        assignee_state = get_text(assignee, 'state')

        # patent info
        patnums = list(gen_patnums(patents))

        # free memory: nothing below touches the element again, and releasing
        # it ahead of the filters also frees the records that are skipped
        # (previously only kept records were cleared)
        clear(elem)

        npat = len(patnums)
        if npat == 0:
            continue

        # code names
        src_type = org_type(assignor_name)
        dst_type = org_type(assignee_name)
        ctype = convey_type(convey)

        # throw out individuals
        if src_type == ORG_INDV or dst_type == ORG_INDV or ctype == CONV_OTHER:
            o += 1
            continue

        # output
        for pn in patnums:
            chunker.insert(None, pn, exec_date, recd_date, convey,
                           assignor_name, assignee_name, assignee_state,
                           assignee_country)

        # stats
        i += 1
        p += npat

        # logging: one progress line every 1000 kept records
        if i % 1000 == 0:
            print('%4d: %40.40s -> %30.30s (%20.20s, %20.20s)' %
                  (npat, assignor_name, assignee_name, assignee_state,
                   assignee_country))

        # break
        if args.limit and i >= args.limit:
            return False

    return True
Ejemplo n.º 8
0
    def handle_patent(elem):
        """Collect the fields of one grant record and pass them on.

        Reads the 'us-bibliographic-data-grant' section of ``elem``
        (publication and filing data, title, IPC codes, US class, claim
        count, cited patents, first assignee) plus the 'abstract' element.
        The record is tagged with ``gen = 3``; it is a ``defaultdict(str)``,
        so fields that are never assigned read back as ''.

        Returns whatever ``store_patent`` returns.
        """
        pat = defaultdict(str)
        pat['gen'] = 3

        # top-level section
        bib = elem.find('us-bibliographic-data-grant')
        pubref = bib.find('publication-reference')
        appref = bib.find('application-reference')

        # published patent
        pubinfo = pubref.find('document-id')
        pat['patnum'] = get_text(pubinfo, 'doc-number')
        pat['grantdate'] = get_text(pubinfo, 'date')

        # filing date
        pat['filedate'] = get_text(appref, 'document-id/date')

        # title
        pat['title'] = get_text(bib, 'invention-title')

        # ipc code: list of (code, version) pairs
        ipclist = []

        ipcsec = bib.find('classifications-ipcr')
        if ipcsec is not None:
            for ipc in ipcsec.findall('classification-ipcr'):
                # main-group is right-aligned in a 3-character field
                code = '%s%s%s%3s%s' % (get_text(ipc, 'section'),
                                        get_text(ipc, 'class'),
                                        get_text(ipc, 'subclass'),
                                        get_text(ipc, 'main-group'),
                                        get_text(ipc, 'subgroup'))
                ipclist.append(
                    (code, get_text(ipc, 'ipc-version-indicator/date')))

        ipcsec = bib.find('classification-ipc')
        if ipcsec is not None:
            ipcver = get_text(ipcsec, 'edition')
            ipc0 = ipcsec.find('main-classification')
            for ipc in chain([ipc0], ipcsec.findall('further-classification')):
                # find() gives None for a missing main classification and
                # .text is None for an empty element; neither can be sliced,
                # so such entries are skipped instead of raising
                itxt = ipc.text if ipc is not None else None
                if itxt is None:
                    continue
                # in characters 4-6 zeros become spaces; from character 7 on
                # the '/' separators are removed
                itxt = (itxt[:4] + itxt[4:7].replace('0', ' ') +
                        itxt[7:].replace('/', ''))
                ipclist.append((itxt, ipcver))

        pat['ipclist'] = ipclist

        # us class
        oclsec = bib.find('classification-national')
        if oclsec is not None:
            pat['class'] = get_text(oclsec, 'main-classification')

        # claims
        pat['claims'] = get_text(bib, 'number-of-claims')

        # citations: the container and its entries are either both plain or
        # both prefixed with 'us-'
        refs = bib.find('references-cited')
        prefix = ''
        if refs is None:
            refs = bib.find('us-references-cited')
            prefix = 'us-'

        cites = []
        if refs is not None:
            for cite in refs.findall(prefix + 'citation'):
                pcite = cite.find('patcit')
                if pcite is None:
                    continue
                docid = pcite.find('document-id')
                if docid is None:
                    # nothing to read a number or kind from
                    continue
                pnum = get_text(docid, 'doc-number')
                kind = get_text(docid, 'kind')
                if kind == 'A' or kind.startswith('B'):
                    cites.append(pnum)
        pat['citlist'] = cites

        # applicant name and address
        assignee = bib.find('assignees/assignee/addressbook')
        if assignee is not None:
            pat['owner'] = get_text(assignee, 'orgname').upper()
            address = assignee.find('address')
            if address is not None:
                pat['city'] = get_text(address, 'city').upper()
                pat['state'] = get_text(address, 'state')
                pat['country'] = get_text(address, 'country')

        # abstract
        abspar = elem.find('abstract')
        if abspar is not None:
            pat['abstract'] = raw_text(abspar, sep=' ')

        # roll it in
        return store_patent(pat)
Ejemplo n.º 9
0
    def handle_patent(elem):
        """Collect the fields of one grant record and pass them on.

        Reads the 'SDOBI' section of ``elem`` (publication and filing data,
        IPC codes, US class, citations, title, claim count, owner details)
        plus the 'SDOAB/BTEXT/PARA' abstract paragraphs.  The record is tagged
        with ``gen = 2``; it is a ``defaultdict(str)``, so fields that are
        never assigned read back as ''.

        Returns whatever ``store_patent`` returns.
        """
        record = defaultdict(str)
        record['gen'] = 2

        # bibliographic root
        biblio = elem.find('SDOBI')

        # publication: patent number and grant date
        published = biblio.find('B100')
        record['patnum'] = get_text(published, 'B110/DNUM/PDAT')
        record['grantdate'] = get_text(published, 'B140/DATE/PDAT')

        # filing date
        filed = biblio.find('B200')
        record['filedate'] = get_text(filed, 'B220/DATE/PDAT')

        # ipc codes: every code is paired with the one edition read from B516
        details = biblio.find('B500')
        ipc_node = details.find('B510')
        edition = get_text(ipc_node, 'B516/PDAT')
        primary = get_text(ipc_node, 'B511/PDAT')
        codes = [] if primary is None else [(primary, edition)]
        codes.extend(
            (get_text(extra, 'PDAT'), edition)
            for extra in ipc_node.findall('B512')
        )
        record['ipclist'] = codes

        # us class
        record['class'] = get_text(details, 'B520/B521/PDAT')

        # cited document numbers (empty list when B560 is absent)
        cited_sec = details.find('B560')
        if cited_sec is None:
            record['citlist'] = []
        else:
            record['citlist'] = [
                get_text(ref, 'PCIT/DOC/DNUM/PDAT')
                for ref in cited_sec.findall('B561')
            ]

        # title
        record['title'] = get_text(details, 'B540/STEXT/PDAT')

        # claims
        record['claims'] = get_text(details, 'B570/B577/PDAT')

        # owner name and address
        owner = biblio.find('B700/B730/B731/PARTY-US')
        if owner is not None:
            record['owner'] = get_text(owner, 'NAM/ONM/STEXT/PDAT').upper()
            addr = owner.find('ADR')
            if addr is not None:
                record['city'] = get_text(addr, 'CITY/PDAT').upper()
                record['state'] = get_text(addr, 'STATE/PDAT')
                record['country'] = get_text(addr, 'CTRY/PDAT', default='US')

        # abstract: one line per paragraph
        paragraphs = elem.findall('SDOAB/BTEXT/PARA')
        if paragraphs:
            record['abstract'] = '\n'.join(raw_text(par) for par in paragraphs)

        # roll it in
        return store_patent(record)
Ejemplo n.º 10
0
    def handle_patent(elem):
        """Collect the fields of one grant record and pass them on.

        Reads the 'SDOBI' section of ``elem`` (publication and filing data,
        IPC codes, citations, title, claim count, owner name/state/country)
        plus the 'SDOAB/BTEXT/PARA' abstract paragraphs.  The record is tagged
        with ``gen = 2``; it is a ``defaultdict(str)``, so fields that are
        never assigned read back as ''.

        Returns whatever ``store_patent`` returns.
        """
        record = defaultdict(str)
        record['gen'] = 2

        # bibliographic root
        biblio = elem.find('SDOBI')

        # publication: patent number and grant date
        published = biblio.find('B100')
        record['patnum'] = get_text(published, 'B110/DNUM/PDAT')
        record['grantdate'] = get_text(published, 'B140/DATE/PDAT')

        # filing date
        filed = biblio.find('B200')
        record['filedate'] = get_text(filed, 'B220/DATE/PDAT')

        # ipc codes: every code is paired with the one edition read from B516
        details = biblio.find('B500')
        ipc_node = details.find('B510')
        edition = get_text(ipc_node, 'B516/PDAT')
        primary = get_text(ipc_node, 'B511/PDAT')
        codes = [] if primary is None else [(primary, edition)]
        codes.extend(
            (get_text(extra, 'PDAT'), edition)
            for extra in ipc_node.findall('B512')
        )
        record['ipclist'] = codes

        # cited document numbers (empty list when B560 is absent)
        cited_sec = details.find('B560')
        if cited_sec is None:
            record['citlist'] = []
        else:
            record['citlist'] = [
                get_text(ref, 'PCIT/DOC/DNUM/PDAT')
                for ref in cited_sec.findall('B561')
            ]

        # title
        record['title'] = get_text(details, 'B540/STEXT/PDAT')

        # claims
        record['claims'] = get_text(details, 'B570/B577/PDAT')

        # owner name and address
        owner = biblio.find('B700/B730/B731/PARTY-US')
        if owner is not None:
            record['owner'] = get_text(owner, 'NAM/ONM/STEXT/PDAT').upper()
            addr = owner.find('ADR')
            if addr is not None:
                record['state'] = get_text(addr, 'STATE/PDAT')
                record['country'] = get_text(addr, 'CTRY/PDAT', default='US')

        # abstract: one line per paragraph
        paragraphs = elem.findall('SDOAB/BTEXT/PARA')
        if paragraphs:
            record['abstract'] = '\n'.join(raw_text(par) for par in paragraphs)

        # roll it in
        return store_patent(record)
Ejemplo n.º 11
0
    def handle_patent(elem):
        """Collect the fields of one grant record and pass them on.

        Reads the 'us-bibliographic-data-grant' section of ``elem``
        (publication and filing data, title, IPC codes, claim count, cited
        patents, first assignee) plus the 'abstract' element.  The record is
        tagged with ``gen = 3``; it is a ``defaultdict(str)``, so fields that
        are never assigned read back as ''.

        Returns whatever ``store_patent`` returns.
        """
        pat = defaultdict(str)
        pat['gen'] = 3

        # top-level section
        bib = elem.find('us-bibliographic-data-grant')
        pubref = bib.find('publication-reference')
        appref = bib.find('application-reference')

        # published patent
        pubinfo = pubref.find('document-id')
        pat['patnum'] = get_text(pubinfo, 'doc-number')
        pat['grantdate'] = get_text(pubinfo, 'date')

        # filing date
        pat['filedate'] = get_text(appref, 'document-id/date')

        # title
        pat['title'] = get_text(bib, 'invention-title')

        # ipc code: list of (code, version) pairs
        ipclist = []

        ipcsec = bib.find('classifications-ipcr')
        if ipcsec is not None:
            for ipc in ipcsec.findall('classification-ipcr'):
                # main-group is right-aligned in a 3-character field
                code = '%s%s%s%3s%s' % (get_text(ipc, 'section'),
                                        get_text(ipc, 'class'),
                                        get_text(ipc, 'subclass'),
                                        get_text(ipc, 'main-group'),
                                        get_text(ipc, 'subgroup'))
                ipclist.append(
                    (code, get_text(ipc, 'ipc-version-indicator/date')))

        ipcsec = bib.find('classification-ipc')
        if ipcsec is not None:
            ipcver = get_text(ipcsec, 'edition')
            ipc0 = ipcsec.find('main-classification')
            for ipc in chain([ipc0], ipcsec.findall('further-classification')):
                # find() gives None for a missing main classification and
                # .text is None for an empty element; neither can be sliced,
                # so such entries are skipped instead of raising
                itxt = ipc.text if ipc is not None else None
                if itxt is None:
                    continue
                # in characters 4-6 zeros become spaces; from character 7 on
                # the '/' separators are removed
                itxt = (itxt[:4] + itxt[4:7].replace('0', ' ') +
                        itxt[7:].replace('/', ''))
                ipclist.append((itxt, ipcver))

        pat['ipclist'] = ipclist

        # claims
        pat['claims'] = get_text(bib, 'number-of-claims')

        # citations: the container and its entries are either both plain or
        # both prefixed with 'us-'
        refs = bib.find('references-cited')
        prefix = ''
        if refs is None:
            refs = bib.find('us-references-cited')
            prefix = 'us-'

        cites = []
        if refs is not None:
            for cite in refs.findall(prefix + 'citation'):
                pcite = cite.find('patcit')
                if pcite is None:
                    continue
                docid = pcite.find('document-id')
                if docid is None:
                    # nothing to read a number or kind from
                    continue
                pnum = get_text(docid, 'doc-number')
                kind = get_text(docid, 'kind')
                if kind == 'A' or kind.startswith('B'):
                    cites.append(pnum)
        pat['citlist'] = cites

        # applicant name and address
        assignee = bib.find('assignees/assignee/addressbook')
        if assignee is not None:
            pat['owner'] = get_text(assignee, 'orgname').upper()
            address = assignee.find('address')
            if address is not None:
                pat['state'] = get_text(address, 'state')
                pat['country'] = get_text(address, 'country')

        # abstract
        abspar = elem.find('abstract')
        if abspar is not None:
            pat['abstract'] = raw_text(abspar, sep=' ')

        # roll it in
        return store_patent(pat)
Ejemplo n.º 12
0
def gen3_ipc(ipcsec):
    """Yield ``(code, edition)`` pairs for the classifications in ``ipcsec``.

    The first pair carries the 'main-classification' text; one pair follows
    for each 'further-classification' child, using '' when the child has no
    text.  Every pair shares the section's 'edition' text.
    """
    edition = get_text(ipcsec, 'edition')
    primary = get_text(ipcsec, 'main-classification')
    yield (primary, edition)

    for extra in ipcsec.findall('further-classification'):
        code = extra.text if extra.text else ''
        yield (code, edition)
Ejemplo n.º 13
0
def gen2_ipc(ipcsec):
    """Yield the 'ipc' text of the primary, then each secondary, classification.

    The 'classification-ipc-primary' child is optional and contributes
    nothing when absent; every 'classification-ipc-secondary' child
    contributes one value, in document order.
    """
    primary = ipcsec.find('classification-ipc-primary')
    if primary is not None:
        yield get_text(primary, 'ipc')

    secondary_codes = (
        get_text(node, 'ipc')
        for node in ipcsec.findall('classification-ipc-secondary')
    )
    for code in secondary_codes:
        yield code