def __init__(self, xml_string, filename, is_string=False):
    """Parse a patent grant document and cache its bibliographic fields.

    The document is run through the project's SAX handler; values are then
    read from the ``us_patent_grant`` element of the handler's root and
    stored on the instance, together with the ``pat`` and ``app`` dicts.

    Args:
        xml_string: The XML text itself when ``is_string`` is true;
            otherwise it is handed to ``parser.parse`` unchanged
            (presumably a path or file-like object -- confirm with callers).
        filename: Name of the file being processed. Only the portion from
            the first ``ipg`` onwards is kept in ``self.filename``.
        is_string: When true, ``xml_string`` is wrapped in ``StringIO``
            before parsing.

    Raises:
        AttributeError: If ``filename`` contains no ``ipg`` (the regex
            search returns ``None`` and ``.group()`` fails).
    """
    content_handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(content_handler)
    # Switch off the external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    content_handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    # Names of the attributes this handler exposes.
    self.attributes = [
        'pat', 'app', 'assignee_list', 'patent', 'inventor_list',
        'lawyer_list', 'us_relation_list', 'us_classifications',
        'ipcr_classifications', 'citation_list', 'claims',
    ]

    grant = content_handler.root.us_patent_grant
    self.xml = grant
    self.xml_string = xml_string

    # Publication (grant) reference fields.
    pub_ref = grant.publication_reference
    self.country = pub_ref.contents_of('country', upper=False)[0]
    self.patent = xml_util.normalize_document_identifier(
        pub_ref.contents_of('doc_number')[0])
    self.kind = pub_ref.contents_of('kind')[0]
    self.date_grant = pub_ref.contents_of('date')[0]

    # Application reference fields; the type comes from the element's
    # 'appl-type' attribute when the element is present.
    app_ref = grant.application_reference
    if app_ref:
        self.pat_type = app_ref[0].get_attribute('appl-type', upper=False)
    else:
        self.pat_type = None
    self.date_app = app_ref.contents_of('date')[0]
    self.country_app = app_ref.contents_of('country')[0]
    self.patent_app = app_ref.contents_of('doc_number')[0]

    self.code_app = grant.contents_of('us_application_series_code')[0]
    self.clm_num = grant.contents_of('number_of_claims')[0]
    # `h` is a module-level object providing unescape()
    # (presumably an HTMLParser instance -- confirm).
    abstract_text = grant.abstract.contents_of(
        'p', '', as_string=True, upper=False)
    self.abstract = h.unescape(abstract_text)
    self.invention_title = h.unescape(self._invention_title())
    self.filename = re.search('ipg.*$', filename, re.DOTALL).group()

    self.pat = {
        "id": self.patent,
        "type": self.pat_type,
        "number": self.patent,
        "country": self.country,
        "date": self._fix_date(self.date_grant),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
        "filename": self.filename,
    }

    app_date = self._fix_date(self.date_app)
    self.app = {
        "type": self.code_app,
        "number": self.patent_app,
        "country": self.country_app,
        "date": app_date,
        # Application id is "<first four chars of the date>/<number>".
        "id": str(app_date)[:4] + "/" + self.patent_app,
    }
def __init__(self, xml_string, is_string=False):
    """Parse a patent application publication and cache its core fields.

    Values are read from the ``patent_application_publication`` element of
    the SAX handler's root and stored on the instance and in ``self.app``.

    Args:
        xml_string: The XML text itself when ``is_string`` is true;
            otherwise it is handed to ``parser.parse`` unchanged
            (presumably a path or file-like object -- confirm with callers).
        is_string: When true, ``xml_string`` is wrapped in ``StringIO``
            before parsing.
    """
    content_handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(content_handler)
    # Switch off the external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    content_handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    # Names of the attributes this handler exposes.
    self.attributes = [
        'app',
        'application',
        'assignee_list',
        'inventor_list',
        'us_classifications',
        'claims',
    ]

    self.xml = content_handler.root.patent_application_publication

    # Use the first 'country_code' entry that is not itself a list;
    # fall back to an empty string when there is none.
    flat_codes = [code for code in self.xml.contents_of('country_code')
                  if not isinstance(code, list)]
    self.country = flat_codes[0] if flat_codes else ''

    raw_number = self.xml.application_number.contents_of('doc_number')[0]
    self.application = xml_util.normalize_document_identifier(raw_number)
    self.kind = self.xml.document_id.contents_of('kind_code')[0]
    self.pat_type = None
    self.date_app = self.xml.domestic_filing_data.contents_of(
        'filing_date')[0]
    # Claim count is the number of 'claim' children, not a declared total.
    self.clm_num = len(self.xml.subdoc_claims.claim)
    self.abstract = self.xml.subdoc_abstract.contents_of(
        'paragraph', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    # NOTE(review): both halves of "number" use the same [2:] slice, so the
    # text after the first two characters appears twice. This looks
    # unintended (maybe [:2] was meant for the first half) -- confirm
    # before changing; behavior is preserved here.
    tail = self.application[2:]
    self.app = {
        # Application id is "<first four chars of the raw date>/<number>".
        "id": str(self.date_app)[:4] + '/' + self.application,
        "type": self.pat_type,
        "number": tail + '/' + tail,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
    }
def __init__(self, xml_string, is_string=False):
    """Parse a patent application document and cache its core fields.

    Values are read from the ``us_patent_application`` element of the SAX
    handler's root and stored on the instance and in ``self.app``.

    Args:
        xml_string: The XML text itself when ``is_string`` is true;
            otherwise it is handed to ``parser.parse`` unchanged
            (presumably a path or file-like object -- confirm with callers).
        is_string: When true, ``xml_string`` is wrapped in ``StringIO``
            before parsing.
    """
    content_handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(content_handler)
    # Switch off the external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    content_handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    # Names of the attributes this handler exposes.
    self.attributes = [
        'app',
        'application',
        'assignee_list',
        'inventor_list',
        'us_classifications',
        'claims',
    ]

    self.xml = content_handler.root.us_patent_application

    app_ref = self.xml.application_reference
    self.country = app_ref.contents_of('country', upper=False)[0]
    self.application = xml_util.normalize_document_identifier(
        app_ref.contents_of('doc_number')[0])
    self.kind = self.xml.publication_reference.contents_of('kind')[0]
    self.date_app = app_ref.contents_of('date')[0]
    # The type comes from the 'appl-type' attribute of the first
    # application_reference element, when one is present.
    if app_ref:
        self.pat_type = app_ref[0].get_attribute('appl-type', upper=False)
    else:
        self.pat_type = None
    # Claim count is the number of 'claim' children, not a declared total.
    self.clm_num = len(self.xml.claims.claim)
    self.abstract = self.xml.abstract.contents_of(
        'p', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    # NOTE(review): both halves of "number" use the same [2:] slice, so the
    # text after the first two characters appears twice. This looks
    # unintended (maybe [:2] was meant for the first half) -- confirm
    # before changing; behavior is preserved here.
    tail = self.application[2:]
    self.app = {
        # Application id is "<first four chars of the raw date>/<number>".
        "id": str(self.date_app)[:4] + '/' + self.application,
        "type": self.pat_type,
        "number": tail + '/' + tail,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
    }
def __init__(self, xml_string, is_string=False):
    """Parse a patent grant, including its free-text description sections.

    The input is first passed through ``_description_patch``,
    ``_claims_patch`` and ``_abstract_patch`` (in that order), then parsed
    with the project's SAX handler. Bibliographic values and the description
    sections are read from the ``us_patent_grant`` element and stored on the
    instance together with the ``pat``, ``app`` and ``description`` dicts.

    Args:
        xml_string: The XML text itself when ``is_string`` is true;
            otherwise the (patched) value is handed to ``parser.parse``
            unchanged (presumably a path or file-like object -- confirm
            with callers).
        is_string: When true, the patched text is wrapped in ``StringIO``
            before parsing.
    """
    for patch in (self._description_patch,
                  self._claims_patch,
                  self._abstract_patch):
        xml_string = patch(xml_string)

    content_handler = xml_driver.XMLHandler()
    self.xh = content_handler
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(content_handler)
    # Switch off the external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    content_handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    # Names of the attributes this handler exposes.
    self.attributes = [
        'pat', 'app', 'description', 'assignee_list', 'patent',
        'inventor_list', 'lawyer_list', 'us_relation_list',
        'us_classifications', 'ipcr_classifications', 'citation_list',
        'claims',
    ]

    grant = content_handler.root.us_patent_grant
    self.xml = grant

    # Publication (grant) reference fields.
    pub_ref = grant.publication_reference
    self.country = pub_ref.contents_of('country', upper=False)[0]
    self.patent = xml_util.normalize_document_identifier(
        pub_ref.contents_of('doc_number')[0])
    self.kind = pub_ref.contents_of('kind')[0]
    self.date_grant = pub_ref.contents_of('date')[0]

    # Application reference fields; the type comes from the element's
    # 'appl-type' attribute when the element is present.
    app_ref = grant.application_reference
    if app_ref:
        self.pat_type = app_ref[0].get_attribute('appl-type', upper=False)
    else:
        self.pat_type = None
    self.date_app = app_ref.contents_of('date')[0]
    self.country_app = app_ref.contents_of('country')[0]
    self.patent_app = app_ref.contents_of('doc_number')[0]

    self.code_app = grant.contents_of('us_application_series_code')[0]
    self.clm_num = grant.contents_of('number_of_claims')[0]
    self.abstract = grant.contents_of(
        'abstract', '', as_string=True, upper=False)

    def section_text(tag):
        # Description sections are fetched with clean_text=False.
        return grant.contents_of(
            tag, '', as_string=True, upper=False, clean_text=False)

    self.briefsummarydescription = section_text('brief_summary_description')
    self.briefdescriptiondrawings = section_text('brief_description_drawings')
    self.detaileddescription = section_text('detailed_description')
    self.otherpatentrelations = section_text('other_patent_relations')
    self.invention_title = self._invention_title()

    self.pat = {
        "id": self.patent,
        "type": self.pat_type,
        "number": self.patent,
        "country": self.country,
        "date": self._fix_date(self.date_grant),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
    }

    app_date = self._fix_date(self.date_app)
    self.app = {
        "type": self.code_app,
        "number": self.patent_app,
        "country": self.country_app,
        "date": app_date,
        # Application id is "<first four chars of the date>/<number>".
        "id": str(app_date)[:4] + "/" + self.patent_app,
    }

    self.description = {
        "id": self.patent,
        "briefsummarydescription": self.briefsummarydescription,
        "briefdescriptiondrawings": self.briefdescriptiondrawings,
        "detaileddescription": self.detaileddescription,
        "otherpatentrelations": self.otherpatentrelations,
    }
# Ad-hoc debugging script: parse one file with the project's SAX handler and
# print the text of its <claim> elements.
from cStringIO import StringIO
from datetime import datetime
from unidecode import unidecode
from handler import Patobj, PatentHandler
import re
import uuid
import xml.sax
import xml_util
import xml_driver

# Input handed straight to the parser (a file name here, despite the name).
xml_string = 'ipg050104.xml'

xh = xml_driver.XMLHandler()
parser = xml_driver.make_parser()
parser.setContentHandler(xh)
# Switch off the external-general-entities feature before parsing.
parser.setFeature(xml_driver.handler.feature_external_ges, False)
l = xml.sax.xmlreader.Locator()
xh.setDocumentLocator(l)

# To parse in-memory XML text instead of a file, use:
#     parser.parse(StringIO(xml_string))
parser.parse(xml_string)
print("parsing done")

# Other probes that were used while debugging (type and value of the
# publication reference's document_id, as list and as string):
#     type(xh.root.us_bibliographic_data_grant.publication_reference
#          .contents_of('document_id', '', as_string=False))
#     type(xh.root.us_bibliographic_data_grant.publication_reference
#          .contents_of('document_id', '', as_string=True))
#     xh.root.us_bibliographic_data_grant.publication_reference
#         .contents_of('document_id', '', as_string=True)
print(xh.root.claims.contents_of('claim', '', as_string=True, upper=False))
def __init__(self, xml_string, filename, is_string=False):
    """Parse a patent application publication and cache its core fields.

    Values are read from the ``patent_application_publication`` element of
    the SAX handler's root and stored on the instance and in ``self.app``.
    The abstract is extracted from the raw XML text with a regex rather than
    from the parsed tree.

    Args:
        xml_string: The XML text itself when ``is_string`` is true;
            otherwise it is handed to ``parser.parse`` unchanged
            (presumably a path or file-like object -- confirm with callers).
        filename: Name of the file being processed. Only the portion
            matching ``i?pa[0-9]*.*$`` is kept in ``self.filename``.
        is_string: When true, ``xml_string`` is wrapped in ``StringIO``
            before parsing.

    Raises:
        AttributeError: If ``filename`` does not match the pattern above
            (the regex search returns ``None`` and ``.group()`` fails).
    """
    xh = xml_driver.XMLHandler()
    parser = xml_driver.make_parser()
    parser.setContentHandler(xh)
    # Switch off the external-general-entities feature before parsing.
    parser.setFeature(xml_driver.handler.feature_external_ges, False)
    xh.setDocumentLocator(xml.sax.xmlreader.Locator())
    if is_string:
        parser.parse(StringIO(xml_string))
    else:
        parser.parse(xml_string)

    # Names of the attributes this handler exposes.
    self.attributes = [
        'app',
        'application',
        'assignee_list',
        'inventor_list',
        'us_classifications',
        'claims',
    ]

    self.xml = xh.root.patent_application_publication
    self.xml_string = xml_string

    # Use the first 'country_code' entry that is not itself a list; fall
    # back to an empty string. Evaluated once instead of filtering twice.
    country_codes = [code for code in self.xml.contents_of('country_code')
                     if not isinstance(code, list)]
    self.country = country_codes[0] if country_codes else ''

    self.application = xml_util.normalize_document_identifier(
        self.xml.document_id.contents_of('doc_number')[0])
    self.kind = self.xml.document_id.contents_of('kind_code')[0]

    # Best-effort lookup of the type for this kind code in the module-level
    # `type_kind` mapping. `except Exception` (not a bare `except`) keeps
    # the fallback but lets KeyboardInterrupt/SystemExit propagate.
    try:
        self.pat_type = type_kind[self.kind]
    except Exception:
        self.pat_type = None

    self.date_app = self.xml.document_id.contents_of('document_date')[0]
    # Claim count is the number of 'claim' children, not a declared total.
    self.clm_num = len(self.xml.subdoc_claims.claim)

    # Best-effort abstract: take the <subdoc-abstract> body from the raw
    # text, strip tags, collapse whitespace, then unescape entities. Any
    # ordinary failure (no match, non-text input, unescape error) yields ''.
    try:
        abstract = re.search(
            '<subdoc-abstract>(.*?)</subdoc-abstract>',
            xml_string, re.DOTALL).group(1)
        abstract = re.sub('<.*?>|</.*?>', '', abstract)
        abstract = re.sub('[\n\t\r\f]+', '', abstract)
        abstract = re.sub(r'\s+', ' ', abstract)
        self.abstract = h.unescape(abstract)
    except Exception:
        self.abstract = ''

    self.invention_title = h.unescape(self._invention_title())
    self.filename = re.search(
        'i?pa[0-9]*.*$', filename, re.DOTALL).group()

    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": self.application,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
        "filename": self.filename,
    }
    # Final id is "<first four chars of the fixed date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]