def __init__(self, operation):
    """Store the patch *operation* and resolve its 'path' into a JsonPointer.

    Accepts the path either as a ready-made JsonPointer or as a plain string.
    """
    path = operation['path']
    if isinstance(path, JsonPointer):
        # Already parsed: keep the pointer and remember its string form.
        self.pointer = path
        self.location = path.path
    else:
        self.location = path
        self.pointer = JsonPointer(path)
    self.operation = operation
def query_ops(query, limit=50):
    """Run an OPS 'biblio' search for *query* and return ``(response, total_count)``.

    *limit* caps the requested result range to ``1-<limit>``.
    """
    response = ops_published_data_search('biblio', query, '1-{0}'.format(limit))
    total_count_pointer = JsonPointer(
        '/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(total_count_pointer.resolve(response))
    log.info('query: %s, total_count: %s', query, total_count)
    return response, total_count
def apply(self, obj):
    """Apply a JSON Patch "copy" operation.

    Deep-copies the value at the operation's 'from' location and adds it
    at ``self.location``. Returns the modified document.

    :raises JsonPatchConflict: when the 'from' location does not exist.
    """
    from_ptr = JsonPointer(self.operation['from'])
    subobj, part = from_ptr.to_last(obj)
    try:
        value = copy.deepcopy(subobj[part])
    except (KeyError, IndexError) as ex:
        # A missing source location is a patch conflict per RFC 6902,
        # not an internal error -- mirror the guarded sibling implementation.
        raise JsonPatchConflict(str(ex))
    obj = AddOperation({
        'op': 'add',
        'path': self.location,
        'value': value
    }).apply(obj)
    return obj
def createOffsetMeta(offset, bookkeeping):
    '''
    sets up a location to track rule and step ids for a given scope offset
    '''
    pointer = JsonPointer(offset)
    # Walk/create the nested dicts down to the offset location.
    cursor = bookkeeping
    for token in pointer.parts:
        cursor = cursor.setdefault(token, {})
    # Ensure the tracking structure exists at that location.
    pointer.resolve(bookkeeping).setdefault('_meta', {'stages': [], 'steps': []})
def resolve(self, doc, default=jsonpointer._nothing):
    """Resolve this relative pointer against *doc*.

    For the '#' (hash) form, return the name/index of the referenced member
    instead of its value: an int when the parent is a list, else the key.
    """
    if not self.isHash:
        # Plain relative pointer: defer to ordinary resolution.
        return super(RelJsonPointer, self).resolve(doc, default)
    # Hash form: inspect the parent container to decide key vs. index.
    if len(self.parts) == 1:
        parent = doc
    else:
        parent = JsonPointer('/' + '/'.join(self.parts[:-1])).resolve(doc)
    last = self.parts[-1]
    if isinstance(parent, list):
        return int(last)
    return last
def read(self):
    """Decode all register documents from the raw response into ``self.results``.

    Documents that fail to decode are logged and skipped.
    """
    documents_pointer = JsonPointer(
        '/ops:world-patent-data/ops:register-search/reg:register-documents')
    for register_document in to_list(documents_pointer.resolve(self.data)):
        item = OPSRegisterDocument()
        try:
            item.read(register_document)
            self.results.append(item)
        except JsonPointerException as ex:
            # Best-effort: keep going with the remaining documents.
            logger.warning(
                'Could not read register information from data "{}": {}\n{}'.format(
                    register_document, ex, exception_traceback()))
def _create_json_pointer(self, pointer_string):
    """Normalize *pointer_string* and build a JsonPointer from it.

    Adds a missing leading slash and strips a trailing slash before parsing.

    :raises RuntimeError: when the input cannot be turned into a valid pointer.
    """
    try:
        json_pointer = str(pointer_string)
        # add leading slash if missing
        if json_pointer[0] != '/':
            json_pointer = '/' + json_pointer
        # remove slash from the end if exists
        if json_pointer[-1] == '/':
            json_pointer = json_pointer[:-1]
        json_pointer = JsonPointer(json_pointer)
    except Exception as e:
        # BUGFIX: `e.message` is Python-2-only and raises AttributeError on
        # Python 3; format the exception object itself instead.
        raise RuntimeError("Invalid JSON pointer passed: {}, error: {}".format(pointer_string, e))
    return json_pointer
def addRule(self, rule, offset="", identifier=None):
    """
    add a DAG extension rule, possibly with a scope offset
    """
    offset_pointer = JsonPointer(offset)
    if offset != "":
        # Non-root scope: make sure index data exists for it.
        createIndexData(offset_pointer.path, self.steps, self.values)
    createOffsetMeta(offset_pointer.path, self.bookkeeper)
    stage = OffsetStage(rule, self._makeoffset(offset), identifier=identifier)
    self.rules.append(stage)
    # Record the stage id in the bookkeeping metadata for this scope.
    offset_pointer.resolve(self.bookkeeper)["_meta"]["stages"].append(
        stage.identifier)
    return stage.identifier
def apply(self, obj):
    """Apply the "copy" operation: deep-copy the value at 'from' and add it
    at the target path. Raises JsonPatchConflict when the source is missing."""
    source_pointer = JsonPointer(self.operation['from'])
    parent, key = source_pointer.to_last(obj)
    try:
        value = copy.deepcopy(parent[key])
    except (KeyError, IndexError) as ex:
        raise JsonPatchConflict(str(ex))
    return AddOperation({
        'op': 'add',
        'path': self.location,
        'value': value
    }).apply(obj)
def validate_json_pointer(cls, value: str) -> str:
    """Validate JSON pointer

    :param value: input value
    :type value: str
    :raises ValueError: when fails validation
    :return: input value
    :rtype: str
    """
    try:
        JsonPointer(value)
    except JsonPointerException as exception:
        # Include the offending value so validation failures are diagnosable;
        # the original cause is preserved via exception chaining.
        raise ValueError("Invalid JSON pointer: {!r}".format(value)) from exception
    return value
def __init__(self, operation):
    """Store the patch *operation*, validating and parsing its 'path'.

    :raises InvalidJsonPatch: when 'path' is missing or not a valid pointer.
    """
    # Idiomatic membership test instead of calling __contains__ directly.
    if 'path' not in operation:
        raise InvalidJsonPatch("Operation must have a 'path' member")
    if isinstance(operation['path'], JsonPointer):
        self.location = operation['path'].path
        self.pointer = operation['path']
    else:
        self.location = operation['path']
        try:
            self.pointer = JsonPointer(self.location)
        except TypeError:
            # e.g. a non-string path; surface as a patch-level error.
            raise InvalidJsonPatch("Invalid 'path'")
    self.operation = operation
def apply(self, obj):
    """Apply the "move" operation: remove the value at 'from' and re-add it
    at the target path. Rejects moves into the source's own subtree."""
    source_pointer = JsonPointer(self.operation['from'])
    parent, key = source_pointer.to_last(obj)
    value = parent[key]
    if self.pointer.contains(source_pointer):
        raise JsonPatchException(
            'Cannot move values into its own children')
    obj = RemoveOperation({
        'op': 'remove',
        'path': self.operation['from']
    }).apply(obj)
    return AddOperation({
        'op': 'add',
        'path': self.location,
        'value': value
    }).apply(obj)
def __init__(self, basepointer, relpointer):
    """Build an absolute pointer from *basepointer* plus a relative JSON
    pointer *relpointer* (draft "Relative JSON Pointer" syntax).

    Supported forms: "<N>" (go up N levels), "<N>#" (up N levels, then
    reference the member name/index itself), and "<N>/path" (up N levels,
    then descend along /path).

    :raises JsonPointerException: for malformed relative pointers, when the
        base is not deep enough, or when '#' would apply to the root.
    """
    if basepointer is None:
        basepointer = ''
    super(RelJsonPointer, self).__init__(basepointer)
    # '#' form flag: resolve() returns the key/index instead of the value.
    self.isHash = False
    if num_re.match(relpointer):
        # Pure number: only climb up, no extra path segments.
        uplevels = int(relpointer)
        relparts = []
    elif num_hash_re.match(relpointer):
        # "<N>#": climb up, then reference the member name/index.
        uplevels = int(relpointer[:-1])
        relparts = []
        self.isHash = True
    elif num_rel_re.match(relpointer):
        # "<N>/path": climb up, then descend along the given path.
        (uplevels, relpath) = relpointer.split('/', 1)
        uplevels = int(uplevels)
        relparts = JsonPointer('/' + relpath).parts
    else:
        raise JsonPointerException("Invalid relative JSON pointer '%s', " % relpointer)
    if uplevels > 0:
        if uplevels > len(self.parts):
            raise JsonPointerException(
                "Base pointer '%s' is not deep enough for "
                "relative pointer '%s' levels" % (basepointer, relpointer))
        # Drop the last `uplevels` segments of the base pointer.
        self.parts = self.parts[0:-uplevels]
    if self.isHash and len(self.parts) == 0:
        # '#' needs a parent container to name a member of.
        raise JsonPointerException(
            "Cannot use '#' at root of relative JSON pointer '%s', " % relpointer)
    self.parts.extend(relparts)
def test_round_trip(self):
    """Parsing a path and rebuilding it from its parts must be lossless."""
    cases = [
        "",
        "/foo",
        "/foo/0",
        "/",
        "/a~1b",
        "/c%d",
        "/e^f",
        "/g|h",
        "/i\\j",
        "/k\"l",
        "/ ",
        "/m~0n",
        '/\xee',
    ]
    for path in cases:
        ptr = JsonPointer(path)
        # path survives parsing ...
        self.assertEqual(path, ptr.path)
        # ... and the parts reconstruct an equal pointer.
        rebuilt = JsonPointer.from_parts(ptr.parts)
        self.assertEqual(ptr, rebuilt)
def apply(self, obj):
    """Apply the "move" operation with conflict checks: missing source raises
    JsonPatchConflict; moving a mapping entry into its own subtree raises
    JsonPatchException."""
    source_pointer = JsonPointer(self.operation['from'])
    parent, key = source_pointer.to_last(obj)
    try:
        value = parent[key]
    except (KeyError, IndexError) as ex:
        raise JsonPatchConflict(str(ex))
    # Only dict parents can contain the target inside the moved subtree.
    if isinstance(parent, dict) and self.pointer.contains(source_pointer):
        raise JsonPatchException('Cannot move values into its own children')
    obj = RemoveOperation({
        'op': 'remove',
        'path': self.operation['from']
    }).apply(obj)
    obj = AddOperation({
        'op': 'add',
        'path': self.location,
        'value': value
    }).apply(obj)
    return obj
def ops_published_data_search_real(constituents, query, range):
    """Issue a published-data search against OPS and return the decoded payload.

    Raises NoResultsException when the (JSON) response reports zero hits, so
    empty results are never cached.
    NOTE: parameter name 'range' shadows the builtin; kept for caller compatibility.
    """
    # OPS client object, impersonated for the current user.
    ops = get_ops_client()

    # Send request to OPS.
    begin, end = map(int, range.split('-'))
    response = ops.published_data_search(
        query,
        range_begin=begin,
        range_end=end,
        constituents=to_list(constituents))

    # Decode OPS response from JSON
    payload = handle_response(response, 'ops-search')

    if response.headers['content-type'].startswith('application/json'):
        # Decode total number of results.
        total_count_pointer = JsonPointer(
            '/ops:world-patent-data/ops:biblio-search/@total-result-count')
        count_total = int(total_count_pointer.resolve(payload))
        # Raise an exception to skip caching empty results.
        if count_total == 0:
            raise NoResultsException('No results', data=payload)

    return payload
def apply(self, obj):
    """Apply the "move" operation.

    Accepts 'from' as a JsonPointer or string; a missing 'from' member is an
    InvalidJsonPatch, a missing source value or a move into the source's own
    children is a JsonPatchConflict. Moving a location onto itself is a no-op.
    """
    try:
        source = self.operation['from']
    except KeyError:
        raise InvalidJsonPatch(
            "The operation does not contain a 'from' member")
    if isinstance(source, JsonPointer):
        from_ptr = source
    else:
        from_ptr = JsonPointer(source)

    parent, key = from_ptr.to_last(obj)
    try:
        value = parent[key]
    except (KeyError, IndexError) as ex:
        raise JsonPatchConflict(str(ex))

    # If source and target are equal, this is a no-op
    if self.pointer == from_ptr:
        return obj

    if isinstance(parent, MutableMapping) and self.pointer.contains(from_ptr):
        raise JsonPatchConflict('Cannot move values into their own children')

    obj = RemoveOperation({
        'op': 'remove',
        'path': self.operation['from']
    }).apply(obj)
    obj = AddOperation({
        'op': 'add',
        'path': self.location,
        'value': value
    }).apply(obj)
    return obj
def from_path(self):
    """Return the parent path of the 'from' pointer as a '/'-joined string."""
    parts = JsonPointer(self.operation['from']).parts
    return '/'.join(parts[:-1])
class OPSRegisterDocumentDecoder:
    """ Functions for decoding data from raw JSON OPS register documents. """

    # Biblio container
    pointer_bibliographic_data = JsonPointer(
        '/reg:register-document/reg:bibliographic-data')

    # Discrete values
    pointer_status = JsonPointer('/reg:register-document/@status')
    pointer_filing_language = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:language-of-filing/$'
    )

    # Historic data
    pointer_publication_reference = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:publication-reference'
    )
    pointer_application_reference = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:application-reference'
    )
    pointer_designated_states = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:designation-of-states'
    )
    pointer_applicants = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:parties/reg:applicants'
    )
    pointer_inventors = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:parties/reg:inventors'
    )
    pointer_agents = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:parties/reg:agents')
    pointer_term_of_grant = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:term-of-grant')
    pointer_licensee_data = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:licensee-data')
    pointer_related_documents = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:related-documents/reg:division/reg:relation'
    )
    pointer_bio_deposit = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:bio-deposit')

    # Actions
    pointer_dates_rights_effective = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:dates-rights-effective'
    )
    pointer_opposition_data = JsonPointer(
        '/reg:register-document/reg:bibliographic-data/reg:opposition-data')
    pointer_ep_patent_statuses = JsonPointer(
        '/reg:register-document/reg:ep-patent-statuses/reg:ep-patent-status')

    """
    TODO:
    # EP14879896P
    u'reg:office-specific-bib-data':
    """

    @classmethod
    def status(cls, data):
        """ Decode register status. """
        return cls.pointer_status.resolve(data)

    @classmethod
    def filing_language(cls, data):
        """ Decode filing language. """
        return cls.pointer_filing_language.resolve(data)

    @classmethod
    def actions(cls, data):
        """
        Decode action information from different places scattered around
        the OPS Exchange Document:
        - dates-rights-effective
        - opposition-data
        - date-application-deemed-withdrawn
        - date-application-withdrawn-by-applicant
        - ep-patent-statuses

        Returns a single list of action entries sorted by date (ascending).
        """

        # Information from all actions
        actions = []

        """
        # ap=EP88402018
        u'reg:dates-rights-effective': {u'reg:first-examination-report-despatched': {u'reg:date': {u'$': u'19901213'}},
                                        u'reg:request-for-examination': {u'@change-gazette-num': u'1989/41',
                                                                         u'reg:date': {u'$': u'19890628'}}},
        """
        # Each child of "reg:dates-rights-effective" becomes one action entry.
        try:
            for name, item in cls.pointer_dates_rights_effective.resolve(
                    data).items():
                entry = cls.decode_action('dates-rights-effective', name, item)
                actions.append(entry)
        except JsonPointerException:
            # Node absent from this document: nothing to decode.
            pass

        """
        # ap=EP86400204
        u'reg:opposition-data': {u'@change-date': u'19890321',
                                 u'@change-gazette-num': u'1989/19',
                                 u'reg:opposition-not-filed': {u'reg:date': {u'$': u'19890221'}}},
        """
        try:
            opposition_data = cls.pointer_opposition_data.resolve(data)

            # Transform entry into baseline format like "reg:dates-rights-effective"
            change_fields = ['@change-date', '@change-gazette-num']
            change_data = {}
            for change_field in change_fields:
                change_data[change_field] = opposition_data[change_field]
                del opposition_data[change_field]
            # Push the shared @change-* attributes down into each child item,
            # then decode every child as its own action entry.
            for name, item in opposition_data.items():
                for key, value in change_data.items():
                    item.setdefault(key, value)
                entry = cls.decode_action('opposition-data', name, item)
                actions.append(entry)
        except JsonPointerException:
            pass

        """
        # TA=lentille
        u'reg:date-application-deemed-withdrawn': {u'@change-gazette-num': u'2009/11',
                                                   u'reg:date': {u'$': u'20080909'}},

        # TA=lentille
        u'reg:date-application-withdrawn-by-applicant': {u'@change-gazette-num': u'2012/35'},
        """
        deemed_withdrawn_nodes = [
            'reg:date-application-deemed-withdrawn',
            'reg:date-application-withdrawn-by-applicant'
        ]
        bibliographic_data = cls.pointer_bibliographic_data.resolve(data)
        for nodename in deemed_withdrawn_nodes:
            if nodename in bibliographic_data:
                kind = 'withdrawn-dates'
                name = nodename.replace('reg:', '')
                item = bibliographic_data[nodename]
                entry = cls.decode_action(kind, name, item)
                actions.append(entry)

        """
        # EP2699357, id=EP12715599P
        u'reg:ep-patent-statuses': {u'reg:ep-patent-status': [{u'$': u'No opposition filed within time limit',
                                                               u'@change-date': u'20171208',
                                                               u'@status-code': u'7'},
                                                              {u'$': u'The patent has been granted',
                                                               u'@change-date': u'20161230',
                                                               u'@status-code': u'8'},
                                                              {u'$': u'Grant of patent is intended',
                                                               u'@change-date': u'20161223',
                                                               u'@status-code': u'12'}]}}}
        """
        ep_patent_statuses = to_list(
            cls.pointer_ep_patent_statuses.resolve(data))
        for item in ep_patent_statuses:
            entry = OrderedDict()
            entry['kind'] = 'status'
            entry['name'] = item['$']
            entry['date'] = entry[
                'change_date'] = OPSExchangeDocumentDecoder.decode_date(
                    item.get('@change-date'))
            entry['status_code'] = item.get('@status-code')
            # Skip some status changes without "date" information as these won't be sortable
            if entry['date']:
                actions.append(entry)

        # Sort all entries by date in ascending order
        actions = sorted(actions, key=operator.itemgetter('date'))

        return actions

    @staticmethod
    def decode_action(kind, name, item):
        # Normalize one raw action node into an OrderedDict with
        # kind/name/date/change_date/change_gazette fields.
        entry = OrderedDict()
        entry['kind'] = kind
        entry['name'] = name.replace('reg:', '')
        # and/or idiom: use reg:date when present, otherwise None.
        entry[
            'date'] = 'reg:date' in item and OPSExchangeDocumentDecoder.decode_date(
                item['reg:date']['$']) or None
        entry['change_date'] = OPSExchangeDocumentDecoder.decode_date(
            item.get('@change-date'))
        entry['change_gazette'] = item.get('@change-gazette-num')
        # Fall back to the change date so the entry remains sortable.
        if not entry['date']:
            entry['date'] = entry['change_date']
        return entry

    @classmethod
    def application_reference(cls, data):
        """ Decode application reference history from register document. """
        try:
            nodes = to_list(cls.pointer_application_reference.resolve(data))
        except JsonPointerException:
            return []
        history = cls.read_history(nodes, 'reg:document-id',
                                   cls.decode_document_reference)
        # Newest changes first.
        history = list(
            reversed(sorted(history, key=operator.itemgetter('change_date'))))
        return history

    @classmethod
    def publication_reference(cls, data):
        """
        Decode publication reference from register document.

        u'reg:publication-reference': [{u'@change-gazette-num': u'2014/30',
                                        u'reg:document-id': {u'@lang': u'de',
                                                             u'reg:country': {u'$': u'WO'},
                                                             u'reg:date': {u'$': u'20140724'},
                                                             u'reg:doc-number': {u'$': u'2014111240'},
                                                             u'reg:kind': {u'$': u'A1'}}},
                                       {u'@change-gazette-num': u'2015/48',
                                        u'reg:document-id': {u'@lang': u'de',
                                                             u'reg:country': {u'$': u'EP'},
                                                             u'reg:date': {u'$': u'20151125'},
                                                             u'reg:doc-number': {u'$': u'2946041'},
                                                             u'reg:kind': {u'$': u'A1'}}}],
        """
        try:
            nodes = to_list(cls.pointer_publication_reference.resolve(data))
        except JsonPointerException:
            return []
        history = cls.read_history(nodes, 'reg:document-id',
                                   cls.decode_document_reference)
        # Newest changes first.
        history = list(
            reversed(sorted(history, key=operator.itemgetter('change_date'))))
        return history

    @staticmethod
    def decode_document_reference(item):
        # Build "<country><number><kind>" document number; the date is optional.
        entry = OrderedDict()
        entry[
            'date'] = 'reg:date' in item and OPSExchangeDocumentDecoder.decode_date(
                item['reg:date']['$']) or None
        entry['number'] = item['reg:country']['$'] + item['reg:doc-number'][
            '$'] + item.get('reg:kind', {}).get('$', '')
        return entry

    @classmethod
    def designated_states(cls, data):
        """
        Decode designated states from register document.

        # TODO: Multiple designated states entries. e.g. EP16202765P
        """
        try:
            nodes = to_list(cls.pointer_designated_states.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:designation-pct',
                                cls.countries_designated)

    @classmethod
    def countries_designated(cls, node):
        """ Decode list of countries (designated states). """
        return cls.decode_countries(node, '/reg:regional/reg:country')

    @classmethod
    def applicants(cls, data):
        """ Decode list of applicants """
        try:
            nodes = to_list(cls.pointer_applicants.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:applicant', cls.parties)

    @classmethod
    def inventors(cls, data):
        """ Decode list of inventors """
        try:
            nodes = to_list(cls.pointer_inventors.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:inventor', cls.parties)

    @classmethod
    def agents(cls, data):
        """ Decode list of agents """
        try:
            nodes = to_list(cls.pointer_agents.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:agent', cls.parties)

    @classmethod
    def countries_lapsed(cls, data):
        """
        Decode list of multiple "lapsed-in-country" entries

        # ap=EP08836401
        u'reg:term-of-grant': [{u'@change-date': u'20140718',
                                u'@change-gazette-num': u'2014/34',
                                u'reg:lapsed-in-country': [{u'reg:country': {u'$': u'HU'},
                                                            u'reg:date': {u'$': u'20080709'}},
                                                           {u'reg:country': {u'$': u'AT'},
                                                            u'reg:date': {u'$': u'20120418'}},
        """
        try:
            nodes = to_list(cls.pointer_term_of_grant.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:lapsed-in-country',
                                cls.lapsed_in_country)

    @classmethod
    def lapsed_in_country(cls, node):
        """ Decode list of "lapsed-in-country" entries. """
        entries = to_list(node)
        data = []
        for entry in entries:
            item = {
                'country': entry['reg:country']['$'],
                'date':
                OPSExchangeDocumentDecoder.decode_date(entry['reg:date']['$']),
            }
            data.append(item)
        return data

    @classmethod
    def parties(cls, node):
        """ Decode list of applicants, inventors or agents. """
        entries = []
        for party in to_list(node):
            addressbook = party['reg:addressbook']

            # TODO: u'reg:nationality', u'reg:residence'
            entry = OrderedDict()
            entry['name'] = addressbook['reg:name']['$']
            entry['country'] = addressbook['reg:address']['reg:country']['$']

            # Collect the up-to-six optional "reg:address-N" lines.
            address = []
            for index in range(1, 7):
                fieldname = 'address-{}'.format(index)
                fieldname_ops = 'reg:{}'.format(fieldname)
                try:
                    value = addressbook['reg:address'][fieldname_ops]['$']
                    address.append(value)
                except KeyError:
                    pass
            entry['address'] = address

            entries.append(entry)
        return entries

    @classmethod
    def read_history(cls, nodes, node_name, item_decoder):
        """
        Generically decode arbitrary lists based on the
        @change-date / @change-gazette-num scheme
        """

        # Collect entries over time
        history = []
        for node in nodes:
            entry = OrderedDict()
            # and/or idiom: decode the payload when present, otherwise {}.
            entry['data'] = node_name in node and item_decoder(
                node[node_name]) or {}
            if '@change-date' in node:
                entry['change_date'] = OPSExchangeDocumentDecoder.decode_date(
                    node['@change-date'])
            elif 'date' in entry['data']:
                entry['change_date'] = entry['data']['date']
            entry['change_gazette'] = node.get('@change-gazette-num', 'N/P')
            history.append(entry)

        # Deduplicate entries. Sometimes, duplicate entries are in the history list,
        # one with 'change_gazette' == 'N/P' and another one with a real value, e.g. '1986/34'.
        # We want to choose the entry with the real value and suppress to other one,
        # but only if all the bibliographic data are equal.
        deduplicated = []
        real = []
        for entry in history:
            # Compare entries with 'change_gazette' masked out.
            entry_dup = deepcopy(entry)
            del entry_dup['change_gazette']
            if deduplicated:
                overwrite = (real[-1]['change_gazette'] == 'N/P'
                             and real[-1]['change_date'] ==
                             entry['change_date']) and (deduplicated[-1] ==
                                                        entry_dup)
                if overwrite:
                    real.pop()
                    deduplicated.pop()
            if entry_dup not in deduplicated:
                deduplicated.append(entry_dup)
                real.append(entry)
        return real

    @classmethod
    def related_documents(cls, data):
        """
        u'reg:related-documents': {u'reg:division': {u'reg:relation': {u'reg:child-doc': {u'reg:document-id': {u'reg:country': {u'$': u''},
                                                                                                               u'reg:doc-number': {u'$': u''}}},
                                                                       u'reg:parent-doc': {u'reg:document-id': [
                                                                           {u'@document-id-type': u'application number',
                                                                            u'reg:country': {u'$': u'EP'},
                                                                            u'reg:doc-number': {u'$': u'20110776418'},
                                                                            u'reg:kind': {u'$': u'D'}},
                                                                           {u'@document-id-type': u'publication number',
                                                                            u'reg:country': {u'$': u'EP'},
                                                                            u'reg:doc-number': {u'$': u'20110776418'},
                                                                            u'reg:kind': {u'$': u'D'}},
        """
        try:
            container = cls.pointer_related_documents.resolve(data)
        except JsonPointerException:
            return {}
        result = {}
        for relation, document in container.items():
            # e.g. 'reg:child-doc' -> 'child', 'reg:parent-doc' -> 'parent'
            relation = relation.replace('reg:', '').replace('-doc', '')
            result.setdefault(relation, {})
            for document_id in to_list(document['reg:document-id']):
                if '@document-id-type' not in document_id:
                    continue
                # 'application number' -> 'application', etc.
                key = document_id['@document-id-type'].replace(' number', '')
                doc_number = document_id['reg:country']['$'] + document_id[
                    'reg:doc-number']['$'] + document_id['reg:kind']['$']
                result[relation][key] = doc_number
        return result

    @classmethod
    def licensee_data(cls, data):
        """
        # EP2683490, id=EP12704680P
        u'reg:licensee-data': {u'@change-date': u'20141219',
                               u'@change-gazette-num': u'2015/04',
                               u'reg:licensee': {u'@designation': u'as-indicated',
                                                 u'@sequence': u'01',
                                                 u'@type-license': u'right-in-rem',
                                                 u'reg:date': {u'$': u'20141212'},
                                                 u'reg:effective-in': {u'reg:country': [{u'$': u'AL'}, {u'$': u'AT'}, {u'$': u'BE'},
        """
        try:
            nodes = to_list(cls.pointer_licensee_data.resolve(data))
        except JsonPointerException:
            return []
        return cls.read_history(nodes, 'reg:licensee', cls.licensee_item)

    @classmethod
    def licensee_item(cls, node):
        # Decode a single "reg:licensee" node.
        item = OrderedDict()
        item['sequence'] = node['@sequence']
        item['designation'] = node['@designation']
        item['type'] = node['@type-license']
        item['date'] = OPSExchangeDocumentDecoder.decode_date(
            node['reg:date']['$'])
        item['countries_effective'] = cls.countries_effective(node)
        return item

    @classmethod
    def countries_effective(cls, node):
        """ Decode list of countries (designated states). """
        return cls.decode_countries(node, '/reg:effective-in/reg:country')

    @staticmethod
    def decode_countries(node, pointer):
        # Resolve the country list at *pointer* and flatten to plain codes.
        countries_pointer = JsonPointer(pointer)
        countries_raw = to_list(countries_pointer.resolve(node))
        countries = [country_raw['$'] for country_raw in countries_raw]
        return countries

    @classmethod
    def bio_deposit(cls, data):
        """
        # EP2699357, id=EP12715599P
        u'reg:bio-deposit': {u'@num': u'',
                             u'reg:bio-accno': {u'$': u''},
                             u'reg:depositary': {u'$': u''},
                             u'reg:dtext': {u'$': u'one or more deposits'}},
        """
        try:
            node = cls.pointer_bio_deposit.resolve(data)
        except JsonPointerException:
            return {}
        data = OrderedDict()
        data['text'] = node['reg:dtext']['$']
        data['depositary'] = node['reg:depositary']['$']
        data['accno'] = node['reg:bio-accno']['$']
        data['num'] = node['@num']
        return data
def decode_countries(node, pointer):
    """Resolve *pointer* within *node* and return the plain list of country codes."""
    raw_entries = to_list(JsonPointer(pointer).resolve(node))
    return [entry['$'] for entry in raw_entries]
class OPSBiblioSearchResponse: """ Read the response from OPS published data search and decode the search results to a list of OPSExchangeDocument objects. """ # Some JSON pointers for accessing the innards of "ops:biblio-search" responses pointer_results = JsonPointer( '/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents' ) pointer_total_count = JsonPointer( '/ops:world-patent-data/ops:biblio-search/@total-result-count') pointer_range = JsonPointer( '/ops:world-patent-data/ops:biblio-search/ops:range') def __init__(self, data): self.data = data self.results = [] self.read() @property def total_result_count(self): """ Extract total result count from response. """ return int(self.pointer_total_count.resolve(self.data)) def read(self): """ Read list of result documents from response and create list of OPSExchangeDocument objects inside ``self.results``. """ exchange_documents = to_list(self.pointer_results.resolve(self.data)) for exchange_document in exchange_documents: item = OPSExchangeDocument() item.read(exchange_document) self.results.append(item) def merge_results(self, chunk): """ Merge results from another response chunk into the main list of results. This is used for crawling across all results from a search response when fetching chunks of 100 result documents each, as this is the maximum page size with the OPS API. """ # Merge result documents of chunk into main list of results main_results = to_list(self.pointer_results.resolve(self.data)) chunk_results = to_list(self.pointer_results.resolve(chunk)) main_results += chunk_results # Amend result data self.pointer_results.set(self.data, main_results, inplace=True) # Amend metadata new_total_count = str(len(main_results)) self.pointer_total_count.set(self.data, new_total_count) self.pointer_range.set(self.data, { '@begin': '1', '@end': new_total_count })
def get_path_tokens(self, path):
    """Split JSON pointer string *path* into its reference tokens."""
    pointer = JsonPointer(path)
    return pointer.parts
def get_from_path(self, doc, path):
    """Resolve *path* within *doc*; returns None when the location is absent."""
    pointer = JsonPointer(path)
    return pointer.get(doc, default=None)
def __init__(self, operation):
    """Record the patch *operation* and pre-parse its 'path' into a JsonPointer."""
    location = operation['path']
    self.location = location
    self.pointer = JsonPointer(location)
    self.operation = operation
class DpmaRegisterXmlDocument(object):
    """ Decode information from DPMAregister ST.36 XML document. """

    # Raw XML payload and its Badgerfish-decoded representation (private).
    _xml = attr.ib()
    _data = attr.ib(default=None)

    # Decoded bibliographic attributes, populated by decode().
    application_reference = attr.ib(default=attr.Factory(list))
    publication_reference = attr.ib(default=attr.Factory(list))
    title = attr.ib(default=attr.Factory(dict))
    classifications = attr.ib(default=attr.Factory(dict))
    pct_or_regional_data = attr.ib(default=attr.Factory(dict))
    applicants = attr.ib(default=attr.Factory(list))
    inventors = attr.ib(default=attr.Factory(list))
    agents = attr.ib(default=attr.Factory(list))
    correspondents = attr.ib(default=attr.Factory(list))
    priority_claims = attr.ib(default=attr.Factory(list))
    designated_states = attr.ib(default=attr.Factory(list))
    references_cited = attr.ib(default=attr.Factory(list))
    office_specific_bibdata = attr.ib(default=attr.Factory(dict))
    events = attr.ib(default=attr.Factory(list))

    # JSON pointers into the Badgerfish-converted ST.36 document.
    pointer_publication_reference = JsonPointer('/dpma-patent-document/bibliographic-data/publication-references/publication-reference')
    pointer_application_reference = JsonPointer('/dpma-patent-document/bibliographic-data/application-reference')
    pointer_title = JsonPointer('/dpma-patent-document/bibliographic-data/invention-title')
    pointer_classifications_ipcr = JsonPointer('/dpma-patent-document/bibliographic-data/classifications-ipcr/classification-ipcr')
    pointer_pct_or_regional_publishing_data = JsonPointer('/dpma-patent-document/bibliographic-data/pct-or-regional-publishing-data')
    pointer_pct_or_regional_filing_data = JsonPointer('/dpma-patent-document/bibliographic-data/pct-or-regional-filing-data')
    pointer_applicants = JsonPointer('/dpma-patent-document/bibliographic-data/parties/applicants/applicant')
    pointer_inventors = JsonPointer('/dpma-patent-document/bibliographic-data/parties/inventors/inventor')
    pointer_agents = JsonPointer('/dpma-patent-document/bibliographic-data/parties/agents/agent')
    pointer_correspondents = JsonPointer('/dpma-patent-document/bibliographic-data/parties/correspondence-address')
    pointer_priority_claims = JsonPointer('/dpma-patent-document/bibliographic-data/priority-claims/priority-claim')
    pointer_designated_states = JsonPointer('/dpma-patent-document/bibliographic-data/designation-of-states/designation-pct/regional/country')
    pointer_references_cited = JsonPointer('/dpma-patent-document/bibliographic-data/references-cited/citation/patcit')
    pointer_office_specific_bibdata = JsonPointer('/dpma-patent-document/bibliographic-data/office-specific-bib-data')
    pointer_events = JsonPointer('/dpma-patent-document/events/event-data')

    def decode_badgerfish(self):
        """Convert the raw XML into nested dicts using the Badgerfish convention."""
        self._data = BadgerFishNoNamespace(xml_fromstring=False, dict_type=OrderedDict).data(fromstring(self._xml))
        return self

    def decode(self):
        """Decode all bibliographic sections from the XML document; returns self."""

        # Convert from XML to data structure using the Badgerfish convention
        self.decode_badgerfish()

        # Document numbers
        # NOTE(review): `map` returns a lazy iterator on Python 3, a list on
        # Python 2 -- confirm which the consumers expect.
        self.application_reference = map(
            operator.itemgetter('document_id'),
            self.convert_list(self.query_data(self.pointer_application_reference)))
        self.publication_reference = map(
            operator.itemgetter('document_id'),
            self.convert_list(self.query_data(self.pointer_publication_reference)))

        # Classifications
        self.classifications['ipcr'] = self.convert_list(self.query_data(self.pointer_classifications_ipcr))

        # pct-or-regional-{publishing,filing}-data
        self.pct_or_regional_data = {
            'filing': self.convert_list(self.query_data(self.pointer_pct_or_regional_filing_data), 'document-id'),
            'publishing': self.convert_list(self.query_data(self.pointer_pct_or_regional_publishing_data), 'document-id'),
        }

        # Decode title
        title = self.pointer_title.resolve(self._data)
        self.title = {
            'lang': title['@lang'].lower(),
            'text': title['$'],
        }

        # Parties: Applicants, inventors, agents and correspondence address
        self.applicants = self.decode_parties(self.pointer_applicants)
        self.inventors = self.decode_parties(self.pointer_inventors)
        self.agents = self.decode_parties(self.pointer_agents)
        self.correspondents = self.decode_parties(self.pointer_correspondents)

        # Priority claims
        self.priority_claims = self.convert_list(self.query_data(self.pointer_priority_claims))

        # Designated states
        self.designated_states = self.convert_list(self.query_data(self.pointer_designated_states))

        # Citations
        self.references_cited = map(
            operator.attrgetter('document_id.doc_number'),
            bunchify(self.convert_list(self.query_data(self.pointer_references_cited))))

        # office-specific-bib-data
        self.office_specific_bibdata = self.convert_dict(self.query_data(self.pointer_office_specific_bibdata))

        # Decode list of events
        events = self.convert_list(self.query_data(self.pointer_events))
        self.events = sorted(events, key=operator.itemgetter('date_of_procedural_status'))

        return self

    def query_data(self, pointer):
        """Resolve *pointer* against the decoded data; None when the node is absent."""
        try:
            return pointer.resolve(self._data)
        except JsonPointerException:
            return

    @classmethod
    def convert_list(cls, things_raw, nested_element='$'):
        """Decode list of things"""
        things = []
        for thing in to_list(things_raw):
            if not thing:
                continue
            # Unwrap single-key wrappers like {'$': value}.
            if nested_element in thing and len(thing.keys()) == 1:
                thing = thing[nested_element]
            if isinstance(thing, dict):
                thing = cls.convert_dict(thing)
            things.append(thing)
        return things

    @classmethod
    def convert_dict(cls, data):
        """Decode data thing"""

        # Sanity checks
        if not data:
            return {}

        newdata = OrderedDict()
        for key, value in data.items():

            # Decode nested text or recurse
            if '$' in value:
                value = value['$']
            elif isinstance(value, dict):
                value = cls.convert_dict(value)

            # We want to have keys which are conveniently accessible as object attributes
            key = key.replace('-', '_')

            # Assign value
            newdata[key] = value

        return newdata

    def asdict(self):
        """Return dictionary of public instance attributes"""
        return attr.asdict(self, dict_factory=OrderedDict, filter=lambda attr, value: not attr.name.startswith('_'))

    def decode_parties(self, pointer):
        """
        ST.36: Decode list of applicants, inventors or agents.

        See also https://github.com/Patent2net/P2N/blob/develop/p2n/ops/decoder.py#L535
        """
        try:
            nodes = to_list(pointer.resolve(self._data))
        except JsonPointerException:
            return []
        entries = []
        for party in nodes:
            addressbook = party['addressbook']
            entry = OrderedDict()
            entry['name'] = addressbook['name']['$']
            entry['text'] = addressbook['text']['$']
            entry['country'] = addressbook['address']['country']['$']
            # Collect the up-to-six optional "address-N" lines.
            address = []
            for index in range(1, 7):
                fieldname = 'address-{}'.format(index)
                try:
                    value = addressbook['address'][fieldname]['$']
                    address.append(value)
                except KeyError:
                    pass
            entry['address'] = address
            entries.append(entry)
        return entries
def from_key(self):
    """Return the last token of the 'from' pointer, as an int when it is an
    array index, otherwise as the raw (string) mapping key."""
    from_ptr = JsonPointer(self.operation['from'])
    last = from_ptr.parts[-1]
    try:
        return int(last)
    except (TypeError, ValueError):
        # BUGFIX: int('foo') raises ValueError, not TypeError, so the
        # original except clause never caught non-numeric string tokens.
        # Catch both to preserve any TypeError behavior callers relied on.
        return last
def analytics_family(query):
    """
    Run an OPS published-data search for `query`, group the results into
    patent families and enrich each family with members, first active
    priority claim, claims/description/drawing statistics and designated
    states.

    :param query: OPS search query string (CQL).
    :return: dict keyed by family-id with the aggregated payload.

    All upstream fetches are best-effort: failures are logged and the
    enrichment step is skipped for that family/member.
    """
    payload = {}
    family_has_statistics = {}
    family_has_designated_states = {}

    # A. aggregate list of publication numbers
    # http://ops.epo.org/3.1/rest-services/published-data/search/full-cycle/?q=pa=%22MAMMUT%20SPORTS%20GROUP%20AG%22
    # TODO: step through all pages
    response = ops_published_data_search('biblio', query, '1-50')

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_family_id = JsonPointer('/exchange-document/@family-id')
    pointer_publication_reference = JsonPointer('/exchange-document/bibliographic-data/publication-reference/document-id')

    # A.1 compute distinct list with unique families
    family_representatives = {}
    results = to_list(pointer_results.resolve(response))
    for result in results:
        family_id = pointer_family_id.resolve(result)
        # TODO: currently, use first document as family representative; this could change
        if family_id not in family_representatives:
            document_id_entries = pointer_publication_reference.resolve(result)
            doc_number, date = _get_document_number_date(document_id_entries, 'epodoc')
            if doc_number:
                family_representatives[family_id] = doc_number

    # B. Enrich all family representatives
    # http://ops.epo.org/3.1/rest-services/family/application/docdb/US19288494.xml
    # NOTE: .items() instead of the former Python-2-only .iteritems(),
    # which breaks on Python 3; .items() behaves identically here.
    for family_id, document_number in family_representatives.items():
        payload.setdefault(family_id, {})

        # B.1 Aggregate all family members
        try:
            family = ops_family_members(document_number)
            family_members = family.items
            payload[family_id]['family-members'] = family_members
        except Exception as ex:
            # Best-effort: clear accumulated request errors and move on.
            request = get_current_request()
            del request.errors[:]
            log.warn('Could not fetch OPS family for {0}'.format(document_number))
            continue

        # B.2 Use first active priority
        for family_member_raw in family.raw:
            if 'priority-claim' not in payload[family_id]:
                for priority_claim in to_list(family_member_raw['priority-claim']):
                    try:
                        if priority_claim['priority-active-indicator']['$'] == 'YES':
                            prio_number, prio_date = _get_document_number_date(priority_claim['document-id'], 'docdb')
                            payload[family_id]['priority-claim'] = {'number-docdb': prio_number, 'date': prio_date}
                    except KeyError:
                        # Claims without the expected structure are skipped.
                        pass

        # B.3 Compute word- and image-counts for EP publication
        for statistics_country in ['EP', 'WO', 'AT', 'CA', 'CH', 'GB', 'ES']:

            if family_id in family_has_statistics:
                break

            for family_member in family_members:
                pubref_number = family_member['publication']['number-epodoc']
                if pubref_number.startswith(statistics_country):

                    statistics = {}

                    # B.3.1 get data about claims
                    try:
                        claims_response = ops_claims(pubref_number)
                        pointer_claims = JsonPointer('/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/claims')
                        claims = pointer_claims.resolve(claims_response)
                        claim_paragraphs = []
                        for part in to_list(claims['claim']['claim-text']):
                            claim_paragraphs.append(part['$'])
                        claim_text = '\n'.join(claim_paragraphs)
                        statistics['claims-language'] = claims['@lang']
                        statistics['claims-words-first'] = len(claim_paragraphs[0].split())
                        statistics['claims-words-total'] = len(claim_text.split())
                        statistics['claims-count'] = len(claim_paragraphs)
                    except Exception as ex:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS claims for {0}'.format(pubref_number))

                    # B.3.2 get data about description
                    try:
                        description_response = ops_description(pubref_number)
                        pointer_description = JsonPointer('/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/description')
                        descriptions = pointer_description.resolve(description_response)
                        description_paragraphs = []
                        for part in to_list(descriptions['p']):
                            description_paragraphs.append(part['$'])
                        description_text = '\n'.join(description_paragraphs)
                        statistics['description-words-total'] = len(description_text.split())
                    except Exception as ex:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS description for {0}'.format(pubref_number))

                    if statistics:

                        # B.3.3 get data about image count
                        try:
                            pubref_number_docdb = family_member['publication']['number-docdb']
                            imginfo = inquire_images(pubref_number_docdb)
                            statistics['drawings-count'] = imginfo['META']['drawing-total-count']
                        except Exception as ex:
                            request = get_current_request()
                            del request.errors[:]

                        family_member['statistics'] = statistics
                        family_has_statistics[family_id] = True
                        break

        # B.4 compute designated states
        pointer_designated_states = JsonPointer('/ops:world-patent-data/ops:register-search/reg:register-documents/reg:register-document/reg:bibliographic-data/reg:designation-of-states')
        for country in ['EP', 'WO']:

            if family_id in family_has_designated_states:
                break

            for family_member in family_members:
                pubref_number = family_member['publication']['number-epodoc']
                if pubref_number.startswith(country):
                    try:
                        reginfo_payload = ops_register('publication', pubref_number, 'biblio')
                    # Narrowed from a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit.
                    except Exception:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS register information for {0}'.format(pubref_number))
                        continue

                    designated_states_list = pointer_designated_states.resolve(reginfo_payload)
                    designated_states_info = to_list(designated_states_list)[0]
                    try:
                        regional_info = designated_states_info['reg:designation-pct']['reg:regional']
                        family_member.setdefault('register', {})
                        family_member['register']['designated-states'] = {
                            'gazette-num': designated_states_info['@change-gazette-num'],
                            'region': regional_info['reg:region']['reg:country']['$'],
                            'countries': list(_flatten_ops_json_list(regional_info['reg:country'])),
                        }
                        family_has_designated_states[family_id] = True
                        break
                    except Exception as ex:
                        log.error('Retrieving designated states for {0} failed.'.format(pubref_number))

    return payload
def results_swap_family_members(response):
    """
    For each search-result chunk in an OPS biblio response, try to swap the
    representing document with a "better" family member, chosen by a fixed
    country/kind priority order (DE, EP..B, WO, EP..A, EP, US), then prune
    duplicate chunks in place.

    Mutates `response` through `pointer_results.set`.

    NOTE(review): the returned `publication_numbers` list is never populated
    (the code doing so is commented out below), so callers currently always
    receive an empty list — confirm callers rely only on the in-place
    mutation of `response`.
    """

    #pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    #entries = pointer_results.resolve(results)

    publication_numbers = []

    # Priority order for choosing a replacement family member:
    # DE, EP..B, WO, EP..A2, EP..A3, EP, US
    priorities = [
        {'filter': lambda patent: patent.country.startswith('DE') and not patent.kind.startswith('D1')},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('B')},
        {'filter': 'WO'},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('A')},
        {'filter': 'EP'},
        {'filter': 'US'},
    ]

    def match_filter(item, filter):
        # A filter is either a callable on a split patent number or a
        # plain country-code prefix string.
        if callable(filter):
            patent = split_patent_number(item)
            outcome = filter(patent)
        else:
            outcome = item.startswith(filter)
        return outcome

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_publication_reference = JsonPointer('/bibliographic-data/publication-reference/document-id')
    #pointer_publication_reference = JsonPointer('/exchange-document/bibliographic-data/publication-reference/document-id')

    # A.1 compute distinct list with unique families
    # NOTE(review): `family_representatives` and `all_results` are never
    # used below — presumably leftovers from an earlier revision.
    family_representatives = {}

    chunks = to_list(pointer_results.resolve(response))

    all_results = []
    for chunk in chunks:
        #print 'chunk:', chunk

        # Prepare list of document cycles
        #chunk_results = to_list(pointer_publication_reference.resolve(chunk))
        cycles = to_list(chunk['exchange-document'])

        # Publication number of first cycle in EPODOC format
        representation = cycles[0]
        pubref = pointer_publication_reference.resolve(representation)
        representation_pubref_epodoc, _ = _get_document_number_date(pubref, 'epodoc')

        # All publication numbers in DOCDB format
        representation_pubrefs_docdb = []
        for cycle in cycles:
            pubref = pointer_publication_reference.resolve(cycle)
            representation_pubref_docdb, _ = _get_document_number_date(pubref, 'docdb')
            representation_pubrefs_docdb.append(representation_pubref_docdb)

        # Debugging
        #print 'representation_pubref_epodoc:', representation_pubref_epodoc
        #print 'representation_pubrefs_docdb:', representation_pubrefs_docdb

        # Fetch family members. When failing, use first cycle as representation.
        try:
            family_info = ops_family_members(representation_pubref_epodoc)
        except:
            log.warning('Failed to fetch family information for %s', representation_pubref_epodoc)
            chunk['exchange-document'] = representation
            # Clear accumulated request errors so the response stays usable.
            request = get_current_request()
            del request.errors[:]
            continue

        #members = family_info.publications_by_country()
        #pprint(members)

        # Find replacement from list of family members controlled by priority list.
        for prio in priorities:

            filter = prio['filter']

            # Debugging
            #print 'checking prio:', filter

            # Representative already satisfies this priority level: stop searching.
            if match_filter(representation_pubref_epodoc, filter):
                break

            bibdata = None
            found = False
            for member in family_info.items:

                # Debugging
                #print 'member:'; pprint(member)

                member_pubnum = member['publication']['number-docdb']

                if match_filter(member_pubnum, filter):

                    # Debugging
                    #print 'Filter matched for member:', member_pubnum

                    try:
                        bibdata = ops_biblio_documents(member_pubnum)
                    except:
                        #log.warning('Fetching bibliographic data failed for %s', member_pubnum)
                        request = get_current_request()
                        del request.errors[:]
                        continue

                    #pprint(bibdata)
                    if bibdata:
                        # TODO: Add marker that this document was swapped, display appropriately.
                        found = True
                        break

            # Swap representation of document by appropriate family member
            # and set a marker in the data structure containing the original
            # document number(s).
            if found:
                representation = bibdata
                #print 'representation:'; pprint(representation)

                representation[0].setdefault('__meta__', {})
                representation[0]['__meta__']['swapped'] = {
                    'canonical': representation_pubrefs_docdb[0],
                    'list': [representation_pubref_epodoc] + representation_pubrefs_docdb,
                }
                break

        # TODO: Here, duplicate documents might be. Prune/deduplicate them.
        # TODO: When choosing german family members (e.g. for EP666666), abstract is often missing.
        # TODO: => Carry along from original representation.
        """
        for result in cycles:
            #pprint(result)
            pubref = pointer_publication_reference.resolve(result)
            #print entry, pubref
            pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)
        """

        chunk['exchange-document'] = representation

    # Filter duplicates
    seen = []
    results = []
    fields = ['@country', '@doc-number', '@kind', '@family-id']
    for chunk in chunks:

        # Prepare list of document cycles.
        cycles = to_list(chunk['exchange-document'])

        # Only look at first cycle slot.
        doc = cycles[0]

        # Compute unique document identifier.
        ident = {}
        for key in fields:
            ident[key] = doc[key]

        # Collect chunk if not seen yet.
        if ident in seen:
            continue
        else:
            seen.append(ident)
            results.append(chunk)

    # Overwrite reduced list of chunks in original DOM.
    pointer_results.set(response, results)

    return publication_numbers
def from_key(self, value):
    """
    Replace the last reference token of the operation's ``from`` pointer
    with *value* (stringified) and write the rebuilt pointer back into
    ``self.operation['from']``.
    """
    pointer = JsonPointer(self.operation['from'])
    pointer.parts[-1] = str(value)
    self.operation['from'] = pointer.path
def ops_published_data_crawl(constituents, query, chunksize):
    """
    Crawl an OPS published-data search across result pages and merge the
    chunks into a single response.

    :param constituents: only 'pub-number' is implemented; raises ValueError otherwise.
    :param query: OPS search query string (CQL).
    :param chunksize: page size for each upstream request.
    :return: merged response dict with a synthesized 'publication-numbers' list.
    :raises ValueError: for unimplemented constituents.
    """
    if constituents != 'pub-number':
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    # 'pub-number' maps to an empty constituents parameter upstream.
    real_constituents = constituents
    if constituents == 'pub-number':
        constituents = ''

    # fetch first chunk (1-chunksize) from upstream
    first_chunk = ops_published_data_search(constituents, query, '1-{0}'.format(chunksize))
    #print first_chunk

    pointer_total_count = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(pointer_total_count.resolve(first_chunk))
    log.info('ops_published_data_crawl total_count: %s', total_count)

    # The first 2000 hits are accessible from OPS.
    total_count = min(total_count, 2000)

    # collect upstream results
    begin_second_chunk = chunksize + 1
    chunks = [first_chunk]
    for range_begin in range(begin_second_chunk, total_count + 1, chunksize):

        # countermeasure to robot flagging
        # <code>CLIENT.RobotDetected</code>
        # <message>Recent behaviour implies you are a robot. The server is at the moment busy to serve robots. Please try again later</message>
        time.sleep(5)

        range_end = range_begin + chunksize - 1
        range_string = '{0}-{1}'.format(range_begin, range_end)
        log.info('ops_published_data_crawl range: ' + range_string)
        chunk = ops_published_data_search(constituents, query, range_string)
        #print 'chunk:', chunk
        chunks.append(chunk)

    #return chunks

    # merge chunks into single result
    """
    <empty>:    "ops:search-result" {» "ops:publication-reference": [
    biblio:     "ops:search-result" {» "exchange-documents": [» "exchange-document": {
    abstract:   "ops:search-result" {» "exchange-documents": [» "exchange-document": {
    full-cycle: "ops:search-result" {» "exchange-documents": [» "exchange-document": [
    pub-number: "ops:search-result" {» "ops:publication-reference": [
        {
            "@family-id": "6321653",
            "@system": "ops.epo.org",
            "document-id": {
                "@document-id-type": "docdb",
                "country": {
                    "$": "DE"
                },
                "doc-number": {
                    "$": "3705908"
                },
                "kind": {
                    "$": "A1"
                }
            }
        },
    """
    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    #pointer_time_elapsed = JsonPointer('/ops:world-patent-data/ops:meta/@value')

    all_results = []
    #time_elapsed = int(pointer_time_elapsed.resolve(first_chunk))
    for chunk in chunks:

        # FIXME: use this for "real_constituents == 'pub-number'" only
        chunk_results = to_list(pointer_results.resolve(chunk))

        # FIXME: implement other constituents

        #print 'chunk_results:', chunk_results
        all_results += chunk_results

        #time_elapsed += int(pointer_time_elapsed.resolve(chunk))

    response = None
    if real_constituents == 'pub-number':

        # Reuse the first chunk as the merged response container.
        response = first_chunk

        # delete upstream data
        del resolve_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result')['ops:publication-reference']

        # compute own representation: flat list of docdb publication numbers
        publication_numbers = []
        pointer_document_id = JsonPointer('/document-id')
        for entry in all_results:
            pubref = pointer_document_id.resolve(entry)
            #print entry, pubref
            pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)

        # add own representation
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result/publication-numbers', publication_numbers, inplace=True)

        # amend metadata so counts/range reflect the merged result
        new_total_count = str(len(publication_numbers))
        pointer_total_count.set(response, new_total_count)
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:range', {'@begin': '1', '@end': new_total_count})
        #pointer_time_elapsed.set(response, str(time_elapsed))

    if not response:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    return response