def test_normalize_document_identifier2(self):
    """Check that "US5260728" and "USD5260728" come back unchanged."""
    for identifier in ("US5260728", "USD5260728"):
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(
            xml_util.normalize_document_identifier(identifier), identifier)
def test_normalize_document_identifier3(self):
    """Check that 'D0123456' and 'D123456' both normalize to 'D123456'."""
    for raw_identifier in ("D0123456", "D123456"):
        # assertEqual reports both values on failure, so no custom message
        # is needed to see what the normalizer actually returned.
        self.assertEqual(
            xml_util.normalize_document_identifier(raw_identifier), "D123456")
def __init__(self, filename, is_string=False):
    """
    Parse a patent grant document and populate the bibliographic fields.

    filename: given to parser.parse() directly, or wrapped in a
        cStringIO.StringIO first when is_string is true.
    is_string: selects between the two input modes above.
    """
    handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(handler)
    # Switch off the SAX external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    source = cStringIO.StringIO(filename) if is_string else filename
    sax_parser.parse(source)

    grant = handler.root.us_patent_grant
    self.xml = grant.us_bibliographic_data_grant

    # Fields taken from <publication-reference>.
    pub_ref = self.xml.publication_reference
    self.country = pub_ref.contents_of('country')[0]
    raw_number = pub_ref.contents_of('doc_number')[0]
    self.patent = xml_util.normalize_document_identifier(raw_number)
    self.kind = pub_ref.contents_of('kind')[0]
    self.date_grant = pub_ref.contents_of('date')[0]

    # Fields taken from <application-reference>.
    app_ref = self.xml.application_reference
    self.pat_type = app_ref[0].get_attribute('appl-type')
    self.date_app = app_ref.contents_of('date')[0]
    self.country_app = app_ref.contents_of('country')[0]
    self.patent_app = app_ref.contents_of('doc_number')[0]

    self.code_app = self.xml.contents_of('us_application_series_code')[0]
    self.clm_num = self.xml.contents_of('number_of_claims')[0]
    self.classes = self._classes()
    self.abstract = grant.abstract.contents_of('p', '')
    self.invention_title = self._invention_title()

    # Derived lists; each helper reads the state assigned above.
    self.asg_list = self._asg_list()
    self.cit_list = self._cit_list()
    self.rel_list = self._rel_list()
    self.inv_list = self._inv_list()
    self.law_list = self._law_list()
def __init__(self, xml_string, filename, is_string=False):
    """
    Parse a patent-application-publication document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true. The abstract is extracted by regex
        from this same value, so it only matches when xml_string holds
        the XML text itself.
    filename: source file name; the part matching 'i?pa[0-9]*.*$' is kept
        as self.filename (AttributeError is raised if nothing matches).
    is_string: whether xml_string should be wrapped in StringIO.
    """
    xh = xml_driver.XMLHandler()
    parser = xml_driver.make_parser()
    parser.setContentHandler(xh)
    # Switch off the SAX external-general-entities feature before parsing.
    parser.setFeature(xml_driver.handler.feature_external_ges, False)
    xh.setDocumentLocator(xml.sax.xmlreader.Locator())
    if is_string:
        parser.parse(StringIO(xml_string))
    else:
        parser.parse(xml_string)

    self.attributes = ['app', 'application', 'assignee_list',
                       'inventor_list', 'us_classifications', 'claims']
    self.xml = xh.root.patent_application_publication
    self.xml_string = xml_string

    # Keep only the non-list entries. A list comprehension is evaluated
    # once and is subscriptable on both Python 2 and 3, whereas
    # filter(...)[0] only works where filter returns a list.
    country_codes = [code for code in self.xml.contents_of('country_code')
                     if not isinstance(code, list)]
    self.country = country_codes[0] if country_codes else ''

    doc_id = self.xml.document_id
    self.application = xml_util.normalize_document_identifier(
        doc_id.contents_of('doc_number')[0])
    self.kind = doc_id.contents_of('kind_code')[0]
    try:
        self.pat_type = type_kind[self.kind]
    except Exception:
        # Best effort: a kind code without a known type yields None.
        self.pat_type = None
    self.date_app = doc_id.contents_of('document_date')[0]
    self.clm_num = len(self.xml.subdoc_claims.claim)

    # The abstract is taken from the raw text rather than the parsed tree:
    # strip tags, drop line/tab/form-feed characters, collapse whitespace.
    try:
        abstract = re.search('<subdoc-abstract>(.*?)</subdoc-abstract>',
                             xml_string, re.DOTALL).group(1)
        abstract = re.sub('<.*?>|</.*?>', '', abstract)
        abstract = re.sub('[\n\t\r\f]+', '', abstract)
        abstract = re.sub(r'\s+', ' ', abstract)
        self.abstract = h.unescape(abstract)
    except Exception:
        # No <subdoc-abstract> element (or unusable input): empty abstract.
        self.abstract = ''

    self.invention_title = h.unescape(self._invention_title())
    self.filename = re.search('i?pa[0-9]*.*$', filename, re.DOTALL).group()

    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": self.application,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
        "filename": self.filename
    }
    # The final id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]
def __init__(self, xml_string, filename, is_string=False):
    """
    Parse one us-patent-grant document and collect its top-level fields.

    xml_string: given to parser.parse() as-is, or wrapped in StringIO
        first when is_string is true.
    filename: source file name; the text from the first 'ipg' to the end
        is stored as self.filename (AttributeError if 'ipg' is absent).
    is_string: whether xml_string should be wrapped in StringIO.

    Fills self.pat (grant record) and self.app (application record).
    """
    handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(handler)
    # Switch off the SAX external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    self.attributes = ['pat', 'app', 'assignee_list', 'patent',
                       'inventor_list', 'lawyer_list', 'us_relation_list',
                       'us_classifications', 'ipcr_classifications',
                       'citation_list', 'claims']
    self.xml = handler.root.us_patent_grant
    self.xml_string = xml_string

    # Fields taken from <publication-reference>.
    pub_ref = self.xml.publication_reference
    self.country = pub_ref.contents_of('country', upper=False)[0]
    self.patent = xml_util.normalize_document_identifier(
        pub_ref.contents_of('doc_number')[0])
    self.kind = pub_ref.contents_of('kind')[0]
    self.date_grant = pub_ref.contents_of('date')[0]

    # Fields taken from <application-reference>; only pat_type is guarded.
    app_ref = self.xml.application_reference
    self.pat_type = (
        app_ref[0].get_attribute('appl-type', upper=False)
        if app_ref else None)
    self.date_app = app_ref.contents_of('date')[0]
    self.country_app = app_ref.contents_of('country')[0]
    self.patent_app = app_ref.contents_of('doc_number')[0]

    self.code_app = self.xml.contents_of('us_application_series_code')[0]
    self.clm_num = self.xml.contents_of('number_of_claims')[0]
    abstract_text = handler.root.us_patent_grant.abstract.contents_of(
        'p', '', as_string=True, upper=False)
    self.abstract = h.unescape(abstract_text)
    self.invention_title = h.unescape(self._invention_title())
    self.filename = re.search('ipg.*$', filename, re.DOTALL).group()

    grant_date = self._fix_date(self.date_grant)
    self.pat = {
        "id": self.patent,
        "type": self.pat_type,
        "number": self.patent,
        "country": self.country,
        "date": grant_date,
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
        "filename": self.filename
    }

    application_date = self._fix_date(self.date_app)
    self.app = {
        "type": self.code_app,
        "number": self.patent_app,
        "country": self.country_app,
        "date": application_date
    }
    # Application id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]
def _get_doc_info(self, root):
    """
    Accepts an XMLElement root as an argument. Returns a dict with the
    keys 'country', 'kind', 'date' and 'number' for the given root.

    'country', 'kind' and 'date' fall back to '' when the tag has no
    contents; 'number' is the first doc_number passed through
    xml_util.normalize_document_identifier.
    """
    info = {}
    for field in ('country', 'kind', 'date'):
        values = root.contents_of(field)
        if values:
            info[field] = values[0]
        else:
            info[field] = ''
    first_doc_number = root.contents_of('doc_number')[0]
    info['number'] = xml_util.normalize_document_identifier(first_doc_number)
    return info
def citation_list(self):
    """
    Returns [regular_citations, other_citations].

    regular citation keys: kind, category, date, country, number,
        sequence, uuid
    other citation keys: text, sequence, uuid

    Entries whose extracted values are all empty are skipped. Each list
    numbers its own entries from 0 in 'sequence'.
    """
    cited = self.xml.references_cited.citation
    if not cited:
        return [[], []]

    patent_refs = []
    text_refs = []
    for node in cited:
        if node.othercit:
            entry = {
                'text': node.contents_of('othercit', as_string=True,
                                         upper=False),
            }
            if not any(entry.values()):
                continue
            # Sequence equals the number of entries already collected.
            entry['sequence'] = len(text_refs)
            entry['uuid'] = str(uuid.uuid1())
            text_refs.append(entry)
            continue

        entry = {}
        for tag in ('kind', 'category'):
            entry[tag] = node.contents_of(tag, as_string=True, upper=False)
        entry['date'] = self._fix_date(
            node.contents_of('date', as_string=True))
        entry['country'] = node.contents_of('country', default=[''])[0]
        raw_number = node.contents_of('doc_number', as_string=True)
        entry['number'] = xml_util.normalize_document_identifier(raw_number)
        if any(entry.values()):
            entry['sequence'] = len(patent_refs)
            entry['uuid'] = str(uuid.uuid1())
            patent_refs.append(entry)
    return [patent_refs, text_refs]
def __init__(self, xml_string, is_string=False):
    """
    Parse a patent-application-publication document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true.
    is_string: whether xml_string should be wrapped in StringIO.
    """
    xh = xml_driver.XMLHandler()
    parser = xml_driver.make_parser()
    parser.setContentHandler(xh)
    # Switch off the SAX external-general-entities feature before parsing.
    parser.setFeature(xml_driver.handler.feature_external_ges, False)
    xh.setDocumentLocator(xml.sax.xmlreader.Locator())
    if is_string:
        parser.parse(StringIO(xml_string))
    else:
        parser.parse(xml_string)

    self.attributes = [
        'app', 'application', 'assignee_list', 'inventor_list',
        'us_classifications', 'claims'
    ]
    self.xml = xh.root.patent_application_publication

    # Keep only the non-list entries. A list comprehension is evaluated
    # once and is subscriptable on both Python 2 and 3, whereas
    # filter(...)[0] only works where filter returns a list.
    country_codes = [code for code in self.xml.contents_of('country_code')
                     if not isinstance(code, list)]
    self.country = country_codes[0] if country_codes else ''

    self.application = xml_util.normalize_document_identifier(
        self.xml.application_number.contents_of('doc_number')[0])
    self.kind = self.xml.document_id.contents_of('kind_code')[0]
    self.pat_type = None
    self.date_app = self.xml.domestic_filing_data.contents_of(
        'filing_date')[0]
    self.clm_num = len(self.xml.subdoc_claims.claim)
    self.abstract = self.xml.subdoc_abstract.contents_of(
        'paragraph', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    self.app = {
        "id": self.application,
        "type": self.pat_type,
        # NOTE(review): both halves use [2:], so the same text appears on
        # each side of the slash. Presumably the prefix was meant to be
        # self.application[:2] -- confirm before changing stored data.
        "number": self.application[2:] + '/' + self.application[2:],
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num
    }
    # The final id is "<first four characters of date_app>/<application>".
    self.app["id"] = str(self.date_app)[:4] + '/' + self.application
def __init__(self, xml_string, is_string=False):
    """
    Parse a us-patent-application document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true.
    is_string: whether xml_string should be wrapped in StringIO.
    """
    handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(handler)
    # Switch off the SAX external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    self.attributes = ['app', 'application', 'assignee_list',
                       'inventor_list', 'us_classifications', 'claims']
    self.xml = handler.root.us_patent_application

    # Country, number and date come from <application-reference>;
    # the kind code comes from <publication-reference>.
    self.country = self.xml.application_reference.contents_of(
        'country', upper=False)[0]
    raw_number = self.xml.application_reference.contents_of('doc_number')[0]
    self.application = xml_util.normalize_document_identifier(raw_number)
    self.kind = self.xml.publication_reference.contents_of('kind')[0]
    self.date_app = self.xml.application_reference.contents_of('date')[0]

    app_ref = self.xml.application_reference
    self.pat_type = (
        app_ref[0].get_attribute('appl-type', upper=False)
        if app_ref else None)

    self.clm_num = len(self.xml.claims.claim)
    self.abstract = self.xml.abstract.contents_of(
        'p', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    # NOTE(review): both halves of "number" use [2:], so the same text
    # appears on each side of the slash. Presumably the prefix was meant
    # to be self.application[:2] -- confirm before changing stored data.
    number_tail = self.application[2:]
    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": number_tail + '/' + number_tail,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num
    }
    # The final id is "<first four characters of date_app>/<application>".
    self.app["id"] = str(self.date_app)[:4] + '/' + self.application
def _get_doc_info(self, root):
    """
    Accepts an XMLElement root as an argument. Returns a dict with the
    keys 'country', 'kind', 'date' and 'number' for the given root.

    'country', 'kind' and 'date' are read from the country_code,
    kind_code and document_date tags and fall back to '' when the tag is
    empty or its first entry is falsy. 'number' is the first doc_number
    passed through xml_util.normalize_document_identifier.
    """
    info = {}
    tag_for_key = (
        ('country', 'country_code'),
        ('kind', 'kind_code'),
        ('date', 'document_date'),
    )
    for key, tag in tag_for_key:
        values = root.contents_of(tag)
        first = values[0] if values else ''
        info[key] = first or ''
    first_doc_number = root.contents_of('doc_number')[0]
    info['number'] = xml_util.normalize_document_identifier(first_doc_number)
    return info
def citation_list(self):
    """
    Returns [regular_citations, other_citations].

    regular citation keys: kind, category, date, country, number,
        sequence, uuid
    other citation keys: text, sequence, uuid

    Entries whose extracted values are all empty are dropped; 'sequence'
    counts the kept entries of each list separately, starting at 0.
    """
    nodes = self.xml.references_cited.citation
    if not nodes:
        return [[], []]

    regular, other = [], []
    next_regular = 0
    next_other = 0
    for node in nodes:
        record = {}
        if node.othercit:
            record['text'] = node.contents_of(
                'othercit', as_string=True, upper=False)
            keep = any(record.values())
            if keep:
                record['sequence'] = next_other
                record['uuid'] = str(uuid.uuid4())
                other.append(record)
                next_other += 1
        else:
            record['kind'] = node.contents_of(
                'kind', as_string=True, upper=False)
            record['category'] = node.contents_of(
                'category', as_string=True, upper=False)
            raw_date = node.contents_of('date', as_string=True)
            record['date'] = self._fix_date(raw_date)
            record['country'] = node.contents_of('country', default=[''])[0]
            raw_number = node.contents_of('doc_number', as_string=True)
            record['number'] = xml_util.normalize_document_identifier(
                raw_number)
            keep = any(record.values())
            if keep:
                record['sequence'] = next_regular
                record['uuid'] = str(uuid.uuid4())
                regular.append(record)
                next_regular += 1
    return [regular, other]
def _get_doc_info(self, root):
    """
    Accepts an XMLElement root as an argument. Returns a dict with the
    keys 'country', 'kind', 'date' and 'number' for the given root.

    The first three come from the country_code, kind_code and
    document_date tags and are '' when the tag is empty or its first
    entry is falsy; 'number' is the normalized first doc_number.
    """
    def first_or_blank(tag):
        # First entry of the tag's contents, or '' when there is none
        # or the entry itself is falsy.
        found = root.contents_of(tag)
        if not found:
            return ''
        return found[0] or ''

    country = first_or_blank('country_code')
    kind = first_or_blank('kind_code')
    date = first_or_blank('document_date')
    number = xml_util.normalize_document_identifier(
        root.contents_of('doc_number')[0])
    return {
        'country': country,
        'kind': kind,
        'date': date,
        'number': number,
    }
def __init__(self, xml_string, is_string=False):
    """
    Parse a us-patent-application document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true.
    is_string: whether xml_string should be wrapped in StringIO.
    """
    handler = xml_driver.XMLHandler()
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(handler)
    # Switch off the SAX external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    self.attributes = ['app', 'application', 'assignee_list',
                       'inventor_list', 'us_classifications', 'claims']
    self.xml = handler.root.us_patent_application

    # Country, number, kind and date all come from <publication-reference>.
    pub_ref = self.xml.publication_reference
    self.country = pub_ref.contents_of('country', upper=False)[0]
    self.application = xml_util.normalize_document_identifier(
        pub_ref.contents_of('doc_number')[0])
    self.kind = pub_ref.contents_of('kind')[0]

    app_ref = self.xml.application_reference
    self.pat_type = (
        app_ref[0].get_attribute('appl-type', upper=False)
        if app_ref else None)

    self.date_app = pub_ref.contents_of('date')[0]
    self.clm_num = len(self.xml.claims.claim)
    self.abstract = self.xml.abstract.contents_of(
        'p', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    fixed_date = self._fix_date(self.date_app)
    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": self.application,
        "country": self.country,
        "date": fixed_date,
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num
    }
    # The final id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]
def __init__(self, xml_string, is_string=False):
    """
    Parse a patent-application-publication document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true.
    is_string: whether xml_string should be wrapped in StringIO.
    """
    xh = xml_driver.XMLHandler()
    parser = xml_driver.make_parser()
    parser.setContentHandler(xh)
    # Switch off the SAX external-general-entities feature before parsing.
    parser.setFeature(xml_driver.handler.feature_external_ges, False)
    xh.setDocumentLocator(xml.sax.xmlreader.Locator())
    if is_string:
        parser.parse(StringIO(xml_string))
    else:
        parser.parse(xml_string)

    self.attributes = ['app', 'application', 'assignee_list',
                       'inventor_list', 'us_relation_list',
                       'us_classifications', 'ipcr_classifications',
                       'claims']
    self.xml = xh.root.patent_application_publication

    # Keep only the non-list entries. A list comprehension is evaluated
    # once and is subscriptable on both Python 2 and 3, whereas
    # filter(...)[0] only works where filter returns a list.
    country_codes = [code for code in self.xml.contents_of('country_code')
                     if not isinstance(code, list)]
    self.country = country_codes[0] if country_codes else ''

    doc_id = self.xml.document_id
    self.application = xml_util.normalize_document_identifier(
        doc_id.contents_of('doc_number')[0])
    self.kind = doc_id.contents_of('kind_code')[0]
    self.pat_type = None
    self.date_app = doc_id.contents_of('document_date')[0]
    self.clm_num = len(self.xml.subdoc_claims.claim)
    self.abstract = self.xml.subdoc_abstract.contents_of(
        'paragraph', '', as_string=True, upper=False)
    self.invention_title = self._invention_title()

    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": self.application,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num
    }
    # The final id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]
def citation_list(self):
    """
    Returns a list of two lists. The first list is normal citations,
    the second is other citations.

    citation keys: name, kind, category, date, country, number, sequence
    otherreference keys: text, sequence

    When there are no citations the result is [[], []], so callers can
    always unpack two lists. 'sequence' is the citation's index in the
    full citation list, shared across both output lists.
    """
    citations = self.xml.references_cited.citation
    if not citations:
        # Keep the documented two-list shape even when nothing is cited.
        return [[], []]

    regular_cits = []
    other_cits = []
    for i, citation in enumerate(citations):
        data = {}
        if citation.othercit:
            data['text'] = citation.contents_of('othercit', as_string=True)
            data['sequence'] = i
            other_cits.append(data)
        else:
            for tag in ['name', 'kind', 'category']:
                data[tag] = citation.contents_of(tag, as_string=True)
            data['date'] = self._fix_date(
                citation.contents_of('date', as_string=True))
            data['country'] = citation.contents_of(
                'country', default=[''])[0]
            doc_number = citation.contents_of('doc_number', as_string=True)
            data['number'] = xml_util.normalize_document_identifier(
                doc_number)
            data['sequence'] = i
            regular_cits.append(data)
    return [regular_cits, other_cits]
def __init__(self, xml_string, is_string=False):
    """
    Parse one us-patent-grant document, including its description text.

    xml_string: first run through the description, claims and abstract
        patch helpers, then given to parser.parse() directly or wrapped
        in StringIO when is_string is true.
    is_string: whether the patched xml_string is wrapped in StringIO.

    Fills self.pat, self.app and self.description.
    """
    for patch in (self._description_patch,
                  self._claims_patch,
                  self._abstract_patch):
        xml_string = patch(xml_string)

    handler = xml_driver.XMLHandler()
    self.xh = handler
    sax_parser = xml_driver.make_parser()
    sax_parser.setContentHandler(handler)
    # Switch off the SAX external-general-entities feature before parsing.
    sax_parser.setFeature(xml_driver.handler.feature_external_ges, False)
    handler.setDocumentLocator(xml.sax.xmlreader.Locator())
    sax_parser.parse(StringIO(xml_string) if is_string else xml_string)

    self.attributes = ['pat', 'app', 'description', 'assignee_list',
                       'patent', 'inventor_list', 'lawyer_list',
                       'us_relation_list', 'us_classifications',
                       'ipcr_classifications', 'citation_list', 'claims']
    self.xml = handler.root.us_patent_grant

    # Fields taken from <publication-reference>.
    pub_ref = self.xml.publication_reference
    self.country = pub_ref.contents_of('country', upper=False)[0]
    self.patent = xml_util.normalize_document_identifier(
        pub_ref.contents_of('doc_number')[0])
    self.kind = pub_ref.contents_of('kind')[0]
    self.date_grant = pub_ref.contents_of('date')[0]

    # Fields taken from <application-reference>; only pat_type is guarded.
    app_ref = self.xml.application_reference
    self.pat_type = (
        app_ref[0].get_attribute('appl-type', upper=False)
        if app_ref else None)
    self.date_app = app_ref.contents_of('date')[0]
    self.country_app = app_ref.contents_of('country')[0]
    self.patent_app = app_ref.contents_of('doc_number')[0]

    self.code_app = self.xml.contents_of('us_application_series_code')[0]
    self.clm_num = self.xml.contents_of('number_of_claims')[0]

    grant = handler.root.us_patent_grant
    self.abstract = grant.contents_of(
        'abstract', '', as_string=True, upper=False)

    def section_text(tag):
        # Description sections are read with clean_text=False.
        return grant.contents_of(tag, '', as_string=True, upper=False,
                                 clean_text=False)

    self.briefsummarydescription = section_text('brief_summary_description')
    self.briefdescriptiondrawings = section_text(
        'brief_description_drawings')
    self.detaileddescription = section_text('detailed_description')
    self.otherpatentrelations = section_text('other_patent_relations')
    self.invention_title = self._invention_title()

    grant_date = self._fix_date(self.date_grant)
    self.pat = {
        "id": self.patent,
        "type": self.pat_type,
        "number": self.patent,
        "country": self.country,
        "date": grant_date,
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num
    }

    application_date = self._fix_date(self.date_app)
    self.app = {
        "type": self.code_app,
        "number": self.patent_app,
        "country": self.country_app,
        "date": application_date
    }
    # Application id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]

    self.description = {
        "id": self.patent,
        "briefsummarydescription": self.briefsummarydescription,
        "briefdescriptiondrawings": self.briefdescriptiondrawings,
        "detaileddescription": self.detaileddescription,
        "otherpatentrelations": self.otherpatentrelations
    }
def __init__(self, xml_string, filename, is_string=False):
    """
    Parse a patent-application-publication document and build self.app.

    xml_string: given to parser.parse() directly, or wrapped in StringIO
        first when is_string is true. The abstract is extracted by regex
        from this same value, so it only matches when xml_string holds
        the XML text itself.
    filename: source file name; the part matching 'i?pa[0-9]*.*$' is kept
        as self.filename (AttributeError is raised if nothing matches).
    is_string: whether xml_string should be wrapped in StringIO.
    """
    xh = xml_driver.XMLHandler()
    parser = xml_driver.make_parser()
    parser.setContentHandler(xh)
    # Switch off the SAX external-general-entities feature before parsing.
    parser.setFeature(xml_driver.handler.feature_external_ges, False)
    xh.setDocumentLocator(xml.sax.xmlreader.Locator())
    if is_string:
        parser.parse(StringIO(xml_string))
    else:
        parser.parse(xml_string)

    self.attributes = [
        'app', 'application', 'assignee_list', 'inventor_list',
        'us_classifications', 'claims'
    ]
    self.xml = xh.root.patent_application_publication
    self.xml_string = xml_string

    # Keep only the non-list entries. A list comprehension is evaluated
    # once and is subscriptable on both Python 2 and 3, whereas
    # filter(...)[0] only works where filter returns a list.
    country_codes = [code for code in self.xml.contents_of('country_code')
                     if not isinstance(code, list)]
    self.country = country_codes[0] if country_codes else ''

    doc_id = self.xml.document_id
    self.application = xml_util.normalize_document_identifier(
        doc_id.contents_of('doc_number')[0])
    self.kind = doc_id.contents_of('kind_code')[0]
    try:
        self.pat_type = type_kind[self.kind]
    except Exception:
        # Best effort: a kind code without a known type yields None.
        self.pat_type = None
    self.date_app = doc_id.contents_of('document_date')[0]
    self.clm_num = len(self.xml.subdoc_claims.claim)

    # The abstract is taken from the raw text rather than the parsed tree:
    # strip tags, drop line/tab/form-feed characters, collapse whitespace.
    try:
        abstract = re.search(
            '<subdoc-abstract>(.*?)</subdoc-abstract>', xml_string,
            re.DOTALL).group(1)
        abstract = re.sub('<.*?>|</.*?>', '', abstract)
        abstract = re.sub('[\n\t\r\f]+', '', abstract)
        abstract = re.sub(r'\s+', ' ', abstract)
        self.abstract = h.unescape(abstract)
    except Exception:
        # No <subdoc-abstract> element (or unusable input): empty abstract.
        self.abstract = ''

    self.invention_title = h.unescape(self._invention_title())
    self.filename = re.search('i?pa[0-9]*.*$', filename,
                              re.DOTALL).group()

    self.app = {
        "id": self.application,
        "type": self.pat_type,
        "number": self.application,
        "country": self.country,
        "date": self._fix_date(self.date_app),
        "abstract": self.abstract,
        "title": self.invention_title,
        "kind": self.kind,
        "num_claims": self.clm_num,
        "filename": self.filename
    }
    # The final id is "<first four characters of the date>/<number>".
    self.app["id"] = str(self.app["date"])[:4] + "/" + self.app["number"]