def ops_family_members(document_number):
    """Fetch INPADOC family members for ``document_number`` from OPS.

    Returns an ``OPSFamilyMembers`` whose ``raw`` attribute carries the
    upstream family-member nodes and whose ``items`` attribute carries
    compact publication/application reference dictionaries.
    """
    results_pointer = JsonPointer('/ops:world-patent-data/ops:patent-family/ops:family-member')
    pubref_pointer = JsonPointer('/publication-reference/document-id')
    appref_pointer = JsonPointer('/application-reference/document-id')
    #pointer_priority_claim_reference = JsonPointer('/priority-claim/document-id')

    response = ops_family_inpadoc('publication', document_number, '')

    family_members = OPSFamilyMembers()
    family_members.raw = to_list(results_pointer.resolve(response))
    for member in family_members.raw:

        # B.1 get publication and application references
        pubref = pubref_pointer.resolve(member)
        pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
        pubref_number_epodoc, pubref_date_epodoc = _get_document_number_date(pubref, 'epodoc')

        appref = appref_pointer.resolve(member)
        appref_number, appref_date = _get_document_number_date(appref, 'docdb')

        entry = {
            'publication': {
                'number-docdb': pubref_number,
                'date': pubref_date,
                'number-epodoc': pubref_number_epodoc,
            },
            'application': {
                'number-docdb': appref_number,
                'date': appref_date,
            },
        }
        family_members.items.append(entry)

    #log.info('Family members for %s:\n%s', document_number, family_members)

    return family_members
def read(self):
    """Decode each family member and collect its bibliographic data.

    Members without an 'exchange-document' section are skipped with a
    debug log entry; any other pointer failure is re-raised.
    """
    members_pointer = JsonPointer(
        '/ops:world-patent-data/ops:patent-family/ops:family-member')

    for member in to_list(members_pointer.resolve(self.data)):

        # Decode document number
        publication_number = 'unknown'
        try:
            docid_pointer = JsonPointer('/publication-reference/document-id')
            publication_number, publication_date = OPSExchangeDocumentDecoder.document_number_date(
                docid_pointer.resolve(member), 'epodoc')
        except JsonPointerException:
            pass

        # Read bibliographic data for family member
        document = OPSExchangeDocument()
        try:
            document.read(member)
            self.results.append(document)
        except JsonPointerException as ex:
            if "member 'exchange-document' not found" in ex.message:
                logger.debug(
                    'No bibliographic data for family member "{}"'.format(
                        publication_number))
            else:
                raise
def createOffsetMeta(offset, bookkeeping):
    """ sets up a location to track rule and step ids for a given scope offset """
    pointer = JsonPointer(offset)

    # Materialize every intermediate node along the offset path.
    node = bookkeeping
    for segment in pointer.parts:
        node = node.setdefault(segment, {})

    # Ensure the target node carries a '_meta' tracking structure.
    pointer.resolve(bookkeeping).setdefault("_meta", {"stages": [], "steps": []})
def add(self, name, value, *ignored_args, **ignored_kwargs):
    """ Adds a new JSON Pointer expression to the store. """
    expression = JsonPointer(value)
    # No exception from 'resolve' means the expression was valid.
    expression.resolve({}, None)
    with self.update_lock:
        self.data[name] = expression
def add(self, name, config, *ignored_args, **ignored_kwargs):
    """ Adds a new JSON Pointer expression to the store. """
    expression = JsonPointer(config.value)
    # No exception from 'resolve' means the expression was valid.
    expression.resolve({}, None)
    with self.update_lock:
        self.data[name] = expression
def createOffsetMeta(offset, bookkeeping):
    '''
    sets up a location to track rule and step ids for a given scope offset
    '''
    pointer = JsonPointer(offset)

    # Walk the path, creating any missing intermediate dictionaries.
    cursor = bookkeeping
    for part in pointer.parts:
        cursor = cursor.setdefault(part, {})

    # Attach the '_meta' bookkeeping record to the resolved node.
    pointer.resolve(bookkeeping).setdefault('_meta', {'stages': [], 'steps': []})
def createOffsetMeta(offset, bookkeeping):
    '''
    sets up a location to track rule and step ids for a given scope offset
    '''
    pointer = JsonPointer(offset)

    # Create the nested scope dictionaries on demand.
    scope = bookkeeping
    for key in pointer.parts:
        scope = scope.setdefault(key, {})

    # Guarantee a '_meta' record exists at the offset location.
    target = pointer.resolve(bookkeeping)
    target.setdefault('_meta', {'stages': [], 'steps': []})
def addRule(self, rule, offset='', identifier = None):
    '''
    add a DAG extension rule, possibly with a scope offset
    '''
    scope = JsonPointer(offset)
    if offset != '':
        createIndexData(scope.path, self.steps, self.values)
    createOffsetMeta(scope.path, self.bookkeeper)

    # Wrap the rule with its scope offset and register it.
    offsetstage = OffsetStage(rule, self._makeoffset(offset), identifier = identifier)
    self.rules.append(offsetstage)

    # Record the stage id in the bookkeeping metadata for this scope.
    scope.resolve(self.bookkeeper)['_meta']['stages'].append(offsetstage.identifier)
    return offsetstage.identifier
def addRule(self, rule, offset="", identifier=None): """ add a DAG extension rule, possibly with a scope offset """ thisoffset = JsonPointer(offset) if offset != "": createIndexData(thisoffset.path, self.steps, self.values) createOffsetMeta(thisoffset.path, self.bookkeeper) offsetstage = OffsetStage(rule, self._makeoffset(offset), identifier=identifier) self.rules += [offsetstage] thisoffset.resolve(self.bookkeeper)["_meta"]["stages"] += [ offsetstage.identifier ] return offsetstage.identifier
def _resolve_mappings(body):
    """Resolve '#'-style type references and merge 'allOf' clauses in place."""

    # from: https://stackoverflow.com/a/39016088
    def item_generator(json_input, lookup_key):
        # Yield every dict in the tree that contains lookup_key.
        if isinstance(json_input, dict):
            for key, value in list(json_input.items()):
                if key == lookup_key:
                    yield json_input
                else:
                    yield from item_generator(value, lookup_key)
        elif isinstance(json_input, list):
            for element in json_input:
                yield from item_generator(element, lookup_key)

    for el in item_generator(body, 'type'):
        if '#' not in el['type']:
            continue
        # referenced type - split it into filename and jsonpointer
        filename, pointer_path = el['type'].split('#', maxsplit=1)
        # get mapping from loaded mappings
        mapping = included_mappings(filename)
        included_mapping_type = JsonPointer(pointer_path).resolve(mapping)
        # Resolve nested references before merging into this element.
        _resolve_mappings(included_mapping_type)
        el.update(_merge_dicts(el, included_mapping_type))

    for el in item_generator(body, 'allOf'):
        if 'properties' in el:
            # Collapse the 'allOf' members' properties into this element.
            combined = {}
            for member in el['allOf']:
                combined.update(member.get('properties', {}))
            el['properties'].update(combined)
            el.pop('allOf')
def _result_list_compact(response):
    """
    Reduce an OPS biblio search response to a compact list of result items.

    Each item carries publication/application numbers and dates (EPODOC
    format, dates reformatted to ISO YYYY-MM-DD), plus titles, abstracts,
    applicants and inventors; sections missing from a document map to None.
    """
    items = []

    # JSON pointers into the OPS exchange-document structure.
    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_application_reference = JsonPointer('/exchange-document/bibliographic-data/application-reference/document-id')
    pointer_publication_reference = JsonPointer('/exchange-document/bibliographic-data/publication-reference/document-id')
    pointer_invention_title = JsonPointer('/exchange-document/bibliographic-data/invention-title')
    pointer_abstract = JsonPointer('/exchange-document/abstract')
    pointer_applicant = JsonPointer('/exchange-document/bibliographic-data/parties/applicants/applicant')
    pointer_inventor = JsonPointer('/exchange-document/bibliographic-data/parties/inventors/inventor')

    results = to_list(pointer_results.resolve(response))
    for result in results:

        # Publication and application references; reformat YYYYMMDD -> YYYY-MM-DD.
        pubref = pointer_publication_reference.resolve(result)
        pubref_number, pubref_date = _get_document_number_date(pubref, 'epodoc')
        pubref_date = pubref_date and '-'.join([pubref_date[:4], pubref_date[4:6], pubref_date[6:8]])
        appref = pointer_application_reference.resolve(result)
        appref_number, appref_date = _get_document_number_date(appref, 'epodoc')
        appref_date = appref_date and '-'.join([appref_date[:4], appref_date[4:6], appref_date[6:8]])

        # Optional sections: a JsonPointerException means the section is absent.
        # NOTE(review): under Python 3 `map(...)` is a lazy iterator; this code
        # presumably targets Python 2 (cf. `iteritems` elsewhere) - confirm.
        try:
            titles = to_list(pointer_invention_title.resolve(result))
            titles = map(_format_title, titles)
        except JsonPointerException:
            titles = None

        try:
            abstracts = to_list(pointer_abstract.resolve(result))
            abstracts = map(_format_abstract, abstracts)
        except JsonPointerException:
            abstracts = None

        try:
            applicants = to_list(pointer_applicant.resolve(result))
            applicants = _mogrify_parties(applicants, 'applicant-name')
        except JsonPointerException:
            applicants = None

        try:
            inventors = to_list(pointer_inventor.resolve(result))
            inventors = _mogrify_parties(inventors, 'inventor-name')
        except JsonPointerException:
            inventors = None

        item = {
            'abstract': abstracts,
            'appdate': appref_date,
            'appnumber': appref_number,
            'pubdate': pubref_date,
            'pubnumber': pubref_number,
            'title': titles,
            'applicant': applicants,
            'inventor': inventors,
        }
        items.append(item)

    return items
def query_ops(query, limit=50):
    """Run an OPS biblio search and return (response, total result count)."""
    #print 'query:', query
    response = ops_published_data_search('biblio', query, '1-{0}'.format(limit))
    #print response

    # Extract the total hit count announced by the upstream service.
    count_pointer = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(count_pointer.resolve(response))
    log.info('query: %s, total_count: %s', query, total_count)

    return response, total_count
def resolve(self, doc, default=jsonpointer._nothing):
    """Resolve this relative pointer against *doc*.

    For the hash form ("0#") the key or index of the referenced node in
    its parent is returned instead of the node's value.
    """
    if not self.isHash:
        return super(RelJsonPointer, self).resolve(doc, default)

    # Hash form: locate the parent container of the referenced node.
    if len(self.parts) == 1:
        container = doc
    else:
        parent_pointer = JsonPointer('/' + '/'.join(self.parts[:-1]))
        container = parent_pointer.resolve(doc)

    # Lists are addressed by integer index, mappings by string key.
    last = self.parts[-1]
    return int(last) if isinstance(container, list) else last
def read(self):
    """Decode all register documents from the OPS register search payload.

    Documents that cannot be decoded are logged and skipped.
    """
    documents_pointer = JsonPointer(
        '/ops:world-patent-data/ops:register-search/reg:register-documents'
    )
    for register_document in to_list(documents_pointer.resolve(self.data)):
        item = OPSRegisterDocument()
        try:
            item.read(register_document)
        except JsonPointerException as ex:
            logger.warning(
                'Could not read register information from data "{}": {}\n{}'
                .format(register_document, ex, exception_traceback()))
        else:
            self.results.append(item)
def ops_published_data_search_real(constituents, query, range):
    """
    Perform a published-data search against OPS and return the decoded payload.

    Raises NoResultsException for empty result sets so that empty responses
    are not cached.
    """
    # OPS client object, impersonated for the current user.
    client = get_ops_client()

    # Send request to OPS.
    start, stop = map(int, range.split('-'))
    upstream = client.published_data_search(
        query, range_begin=start, range_end=stop,
        constituents=to_list(constituents))

    # Decode OPS response from JSON
    payload = handle_response(upstream, 'ops-search')

    if upstream.headers['content-type'].startswith('application/json'):

        # Decode total number of results.
        count_pointer = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
        count_total = int(count_pointer.resolve(payload))

        # Raise an exception to skip caching empty results.
        if count_total == 0:
            raise NoResultsException('No results', data=payload)

    return payload
def analytics_family(query):
    """
    Run an OPS biblio search for *query* and enrich each distinct patent
    family with family members, first active priority claim, claim/description
    word statistics and designated states.

    Returns a dict keyed by family id. Python 2 only (uses `iteritems`).
    """
    payload = {}
    family_has_statistics = {}
    family_has_designated_states = {}

    # A. aggregate list of publication numbers
    # http://ops.epo.org/3.1/rest-services/published-data/search/full-cycle/?q=pa=%22MAMMUT%20SPORTS%20GROUP%20AG%22
    # TODO: step through all pages
    response = ops_published_data_search('biblio', query, '1-50')

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_family_id = JsonPointer('/exchange-document/@family-id')
    pointer_publication_reference = JsonPointer('/exchange-document/bibliographic-data/publication-reference/document-id')

    # A.1 compute distinct list with unique families
    family_representatives = {}
    results = to_list(pointer_results.resolve(response))
    for result in results:
        family_id = pointer_family_id.resolve(result)
        # TODO: currently, use first document as family representative; this could change
        if family_id not in family_representatives:
            document_id_entries = pointer_publication_reference.resolve(result)
            doc_number, date = _get_document_number_date(document_id_entries, 'epodoc')
            if doc_number:
                family_representatives[family_id] = doc_number

    # B. Enrich all family representatives
    # http://ops.epo.org/3.1/rest-services/family/application/docdb/US19288494.xml
    for family_id, document_number in family_representatives.iteritems():
        payload.setdefault(family_id, {})

        # B.1 Aggregate all family members
        try:
            family = ops_family_members(document_number)
            family_members = family.items
            payload[family_id]['family-members'] = family_members
        except Exception as ex:
            # Clear accumulated request errors so the response stays usable.
            request = get_current_request()
            del request.errors[:]
            log.warn('Could not fetch OPS family for {0}'.format(document_number))
            continue

        # B.2 Use first active priority
        for family_member_raw in family.raw:
            if 'priority-claim' not in payload[family_id]:
                for priority_claim in to_list(family_member_raw['priority-claim']):
                    try:
                        if priority_claim['priority-active-indicator']['$'] == 'YES':
                            prio_number, prio_date = _get_document_number_date(priority_claim['document-id'], 'docdb')
                            payload[family_id]['priority-claim'] = {'number-docdb': prio_number, 'date': prio_date}
                    except KeyError:
                        pass

        # B.3 Compute word- and image-counts for EP publication
        # Countries are probed in preference order; the first member that
        # yields any statistics wins for the whole family.
        for statistics_country in ['EP', 'WO', 'AT', 'CA', 'CH', 'GB', 'ES']:

            if family_id in family_has_statistics:
                break

            for family_member in family_members:
                pubref_number = family_member['publication']['number-epodoc']
                if pubref_number.startswith(statistics_country):

                    statistics = {}

                    # B.3.1 get data about claims
                    try:
                        claims_response = ops_claims(pubref_number)
                        pointer_claims = JsonPointer('/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/claims')
                        claims = pointer_claims.resolve(claims_response)
                        claim_paragraphs = []
                        for part in to_list(claims['claim']['claim-text']):
                            claim_paragraphs.append(part['$'])
                        claim_text = '\n'.join(claim_paragraphs)
                        statistics['claims-language'] = claims['@lang']
                        statistics['claims-words-first'] = len(claim_paragraphs[0].split())
                        statistics['claims-words-total'] = len(claim_text.split())
                        statistics['claims-count'] = len(claim_paragraphs)
                    except Exception as ex:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS claims for {0}'.format(pubref_number))

                    # B.3.2 get data about description
                    try:
                        description_response = ops_description(pubref_number)
                        pointer_description = JsonPointer('/ops:world-patent-data/ftxt:fulltext-documents/ftxt:fulltext-document/description')
                        descriptions = pointer_description.resolve(description_response)
                        description_paragraphs = []
                        for part in to_list(descriptions['p']):
                            description_paragraphs.append(part['$'])
                        description_text = '\n'.join(description_paragraphs)
                        statistics['description-words-total'] = len(description_text.split())
                    except Exception as ex:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS description for {0}'.format(pubref_number))

                    if statistics:

                        # B.3.3 get data about image count
                        try:
                            pubref_number_docdb = family_member['publication']['number-docdb']
                            imginfo = inquire_images(pubref_number_docdb)
                            statistics['drawings-count'] = imginfo['META']['drawing-total-count']
                        except Exception as ex:
                            request = get_current_request()
                            del request.errors[:]

                        family_member['statistics'] = statistics
                        family_has_statistics[family_id] = True
                        break

        # B.4 compute designated states
        pointer_designated_states = JsonPointer('/ops:world-patent-data/ops:register-search/reg:register-documents/reg:register-document/reg:bibliographic-data/reg:designation-of-states')
        for country in ['EP', 'WO']:

            if family_id in family_has_designated_states:
                break

            for family_member in family_members:
                pubref_number = family_member['publication']['number-epodoc']
                if pubref_number.startswith(country):
                    try:
                        reginfo_payload = ops_register('publication', pubref_number, 'biblio')
                    except:
                        request = get_current_request()
                        del request.errors[:]
                        log.warn('Could not fetch OPS register information for {0}'.format(pubref_number))
                        continue

                    designated_states_list = pointer_designated_states.resolve(reginfo_payload)
                    # Use the most recent designation entry only.
                    designated_states_info = to_list(designated_states_list)[0]
                    try:
                        regional_info = designated_states_info['reg:designation-pct']['reg:regional']
                        family_member.setdefault('register', {})
                        family_member['register']['designated-states'] = {
                            'gazette-num': designated_states_info['@change-gazette-num'],
                            'region': regional_info['reg:region']['reg:country']['$'],
                            'countries': list(_flatten_ops_json_list(regional_info['reg:country'])),
                        }
                        family_has_designated_states[family_id] = True
                        break
                    except Exception as ex:
                        log.error('Retrieving designated states for {0} failed.'.format(pubref_number))

    return payload
def results_swap_family_members(response):
    """
    Replace each search-result document by a preferred family member,
    chosen by a country/kind priority list (DE, EP..B, WO, EP..A, EP, US),
    then deduplicate the result chunks in place.

    NOTE(review): `publication_numbers` is never populated (the loop that
    filled it is inside the dead triple-quoted string below), so this
    function always returns an empty list - confirm callers expect that.
    """
    #pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    #entries = pointer_results.resolve(results)

    publication_numbers = []

    # DE, EP..B, WO, EP..A2, EP..A3, EP, US
    priorities = [
        {'filter': lambda patent: patent.country.startswith('DE') and not patent.kind.startswith('D1')},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('B')},
        {'filter': 'WO'},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('A')},
        {'filter': 'EP'},
        {'filter': 'US'},
    ]

    def match_filter(item, filter):
        # A filter is either a callable predicate over a parsed patent
        # number, or a plain country-code prefix string.
        if callable(filter):
            patent = split_patent_number(item)
            outcome = filter(patent)
        else:
            outcome = item.startswith(filter)
        return outcome

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_publication_reference = JsonPointer('/bibliographic-data/publication-reference/document-id')
    #pointer_publication_reference = JsonPointer('/exchange-document/bibliographic-data/publication-reference/document-id')

    # A.1 compute distinct list with unique families
    family_representatives = {}
    chunks = to_list(pointer_results.resolve(response))
    all_results = []
    for chunk in chunks:
        #print 'chunk:', chunk

        # Prepare list of document cycles
        #chunk_results = to_list(pointer_publication_reference.resolve(chunk))
        cycles = to_list(chunk['exchange-document'])

        # Publication number of first cycle in EPODOC format
        representation = cycles[0]
        pubref = pointer_publication_reference.resolve(representation)
        representation_pubref_epodoc, _ = _get_document_number_date(pubref, 'epodoc')

        # All publication numbers in DOCDB format
        representation_pubrefs_docdb = []
        for cycle in cycles:
            pubref = pointer_publication_reference.resolve(cycle)
            representation_pubref_docdb, _ = _get_document_number_date(pubref, 'docdb')
            representation_pubrefs_docdb.append(representation_pubref_docdb)

        # Debugging
        #print 'representation_pubref_epodoc:', representation_pubref_epodoc
        #print 'representation_pubrefs_docdb:', representation_pubrefs_docdb

        # Fetch family members. When failing, use first cycle as representation.
        try:
            family_info = ops_family_members(representation_pubref_epodoc)
        except:
            log.warning('Failed to fetch family information for %s', representation_pubref_epodoc)
            chunk['exchange-document'] = representation
            request = get_current_request()
            del request.errors[:]
            continue

        #members = family_info.publications_by_country()
        #pprint(members)

        # Find replacement from list of family members controlled by priority list.
        for prio in priorities:

            filter = prio['filter']

            # Debugging
            #print 'checking prio:', filter

            # Current representation already satisfies this priority level;
            # no swap required.
            if match_filter(representation_pubref_epodoc, filter):
                break

            bibdata = None
            found = False
            for member in family_info.items:
                # Debugging
                #print 'member:'; pprint(member)
                member_pubnum = member['publication']['number-docdb']
                if match_filter(member_pubnum, filter):
                    # Debugging
                    #print 'Filter matched for member:', member_pubnum
                    try:
                        bibdata = ops_biblio_documents(member_pubnum)
                    except:
                        #log.warning('Fetching bibliographic data failed for %s', member_pubnum)
                        request = get_current_request()
                        del request.errors[:]
                        continue
                    #pprint(bibdata)
                    if bibdata:
                        # TODO: Add marker that this document was swapped, display appropriately.
                        found = True
                        break

            # Swap representation of document by appropriate family member
            # and set a marker in the data structure containing the original
            # document number(s).
            if found:
                representation = bibdata
                #print 'representation:'; pprint(representation)
                representation[0].setdefault('__meta__', {})
                representation[0]['__meta__']['swapped'] = {
                    'canonical': representation_pubrefs_docdb[0],
                    'list': [representation_pubref_epodoc] + representation_pubrefs_docdb,
                }
                break

        # TODO: Here, duplicate documents might be. Prune/deduplicate them.
        # TODO: When choosing german family members (e.g. for EP666666), abstract is often missing.
        # TODO: => Carry along from original representation.
        """
        for result in cycles:
            #pprint(result)
            pubref = pointer_publication_reference.resolve(result)
            #print entry, pubref
            pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)
        """

        chunk['exchange-document'] = representation

    # Filter duplicates
    seen = []
    results = []
    fields = ['@country', '@doc-number', '@kind', '@family-id']
    for chunk in chunks:

        # Prepare list of document cycles.
        cycles = to_list(chunk['exchange-document'])

        # Only look at first cycle slot.
        doc = cycles[0]

        # Compute unique document identifier.
        ident = {}
        for key in fields:
            ident[key] = doc[key]

        # Collect chunk if not seen yet.
        if ident in seen:
            continue
        else:
            seen.append(ident)
            results.append(chunk)

    # Overwrite reduced list of chunks in original DOM.
    pointer_results.set(response, results)

    return publication_numbers
def ops_published_data_crawl(constituents, query, chunksize):
    """
    Crawl an OPS published-data search across result pages of *chunksize*
    and merge the chunks into a single response.

    Only the 'pub-number' constituent is implemented; OPS caps accessible
    hits at 2000.
    """
    if constituents != 'pub-number':
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    # 'pub-number' maps to an empty constituents parameter upstream.
    real_constituents = constituents
    if constituents == 'pub-number':
        constituents = ''

    # fetch first chunk (1-chunksize) from upstream
    first_chunk = ops_published_data_search(constituents, query, '1-{0}'.format(chunksize))
    #print first_chunk

    pointer_total_count = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(pointer_total_count.resolve(first_chunk))
    log.info('ops_published_data_crawl total_count: %s', total_count)

    # The first 2000 hits are accessible from OPS.
    total_count = min(total_count, 2000)

    # collect upstream results
    begin_second_chunk = chunksize + 1
    chunks = [first_chunk]
    for range_begin in range(begin_second_chunk, total_count + 1, chunksize):

        # countermeasure to robot flagging
        # <code>CLIENT.RobotDetected</code>
        # <message>Recent behaviour implies you are a robot. The server is at the moment busy to serve robots. Please try again later</message>
        time.sleep(5)

        range_end = range_begin + chunksize - 1
        range_string = '{0}-{1}'.format(range_begin, range_end)
        log.info('ops_published_data_crawl range: ' + range_string)
        chunk = ops_published_data_search(constituents, query, range_string)
        #print 'chunk:', chunk
        chunks.append(chunk)

    #return chunks

    # merge chunks into single result
    """
    <empty>:      "ops:search-result" { » "ops:publication-reference": [
    biblio:       "ops:search-result" { » "exchange-documents": [ » "exchange-document": {
    abstract:     "ops:search-result" { » "exchange-documents": [ » "exchange-document": {
    full-cycle:   "ops:search-result" { » "exchange-documents": [ » "exchange-document": [
    pub-number:   "ops:search-result" { » "ops:publication-reference": [
        {
            "@family-id": "6321653",
            "@system": "ops.epo.org",
            "document-id": {
                "@document-id-type": "docdb",
                "country": { "$": "DE" },
                "doc-number": { "$": "3705908" },
                "kind": { "$": "A1" }
            }
        },
    """
    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    #pointer_time_elapsed = JsonPointer('/ops:world-patent-data/ops:meta/@value')
    all_results = []
    #time_elapsed = int(pointer_time_elapsed.resolve(first_chunk))
    for chunk in chunks:

        # FIXME: use this for "real_constituents == 'pub-number'" only
        chunk_results = to_list(pointer_results.resolve(chunk))

        # FIXME: implement other constituents

        #print 'chunk_results:', chunk_results
        all_results += chunk_results

        #time_elapsed += int(pointer_time_elapsed.resolve(chunk))

    response = None
    if real_constituents == 'pub-number':

        response = first_chunk

        # delete upstream data
        del resolve_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result')['ops:publication-reference']

        # compute own representation
        publication_numbers = []
        pointer_document_id = JsonPointer('/document-id')
        for entry in all_results:
            pubref = pointer_document_id.resolve(entry)
            #print entry, pubref
            pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)

        # add own representation
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result/publication-numbers', publication_numbers, inplace=True)

        # amend metadata
        new_total_count = str(len(publication_numbers))
        pointer_total_count.set(response, new_total_count)
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:range', {'@begin': '1', '@end': new_total_count})
        #pointer_time_elapsed.set(response, str(time_elapsed))

    if not response:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    return response
def createIndexData(offset, stepindex, valueindex):
    """Record *offset* in the step index and initialize its value slot."""
    pointer = JsonPointer(offset)
    # Stamp the offset path onto the step index node.
    step_node = pointer.resolve(stepindex)
    step_node['_offset'] = offset
    # Reserve an empty dict at the same location in the value index.
    pointer.set(valueindex, {})
def decode_countries(node, pointer):
    """Resolve *pointer* within *node* and return the list of country values."""
    raw_entries = to_list(JsonPointer(pointer).resolve(node))
    return [entry['$'] for entry in raw_entries]
def __init__(self, directive, arguments, options, content, lineno,
             content_offset, block_text, state, state_machine):
    """Load a JSON schema (external file or inline content), optionally
    narrowed by a JSON pointer, then apply 'hide'/'show' path options."""
    assert directive == 'jsonschema'
    self.options = options
    self.state = state
    self.lineno = lineno
    self.statemachine = state_machine

    if len(arguments) == 1:
        # Argument may be "filename#/pointer"; empty filename means inline.
        filename, pointer = self._splitpointer(arguments[0])
        if filename != '':
            self._load_external(filename)
        else:
            self._load_internal(content)
        if pointer:
            self.schema = resolve_pointer(self.schema, pointer)
    else:
        self._load_internal(content)

    hidden_paths = self.options.get('hide')
    if hidden_paths is not None:
        # Deep copy via JSON round-trip so 'show' can restore pruned subtrees.
        orig_schema = json.loads(json.dumps(self.schema))
        for hidden_path in hidden_paths.split(' '):
            ptr = JsonPointer(hidden_path)
            parent, name = ptr.to_last(self.schema)
            del parent[name]
        # NOTE(review): if 'hide' is given without 'show', shown_paths is
        # None and .split() raises AttributeError - confirm intended.
        shown_paths = self.options.get('show')
        for shown_path in shown_paths.split(' '):
            ptr = JsonPointer(shown_path)
            orig_parent = orig_schema
            current_parent = self.schema
            # Walk both trees in lockstep, recreating any container that
            # the 'hide' pass removed from the working schema.
            for part in ptr.parts[:-1]:
                orig_parent = ptr.walk(orig_parent, part)
                try:
                    current_parent = ptr.walk(current_parent, part)
                except JsonPointerException:
                    # Container type mirrors the original schema's node.
                    if isinstance(orig_parent, Sequence):
                        new_entry = []
                    elif isinstance(orig_parent, Mapping):
                        new_entry = OrderedDict()
                    else:
                        raise Exception('Unsupported type parent')
                    if isinstance(current_parent, MutableSequence):
                        current_parent.append(new_entry)
                    elif isinstance(current_parent, MutableMapping):
                        current_parent[part] = new_entry
                    current_parent = new_entry
            # Re-attach the shown subtree from the pristine copy.
            if isinstance(current_parent, MutableSequence):
                current_parent.append(ptr.resolve(orig_schema))
            elif isinstance(current_parent, MutableMapping):
                current_parent[ptr.parts[-1]] = ptr.resolve(orig_schema)
            else:
                raise Exception('Unsupported type parent')
def test_json_pointer_on_dict():
    """JsonPointer traversal should follow references resolved by RefDict."""
    document = RefDict("base/reflist.json#/")
    resolved = JsonPointer("/definitions/foo/not/0").resolve(document)
    assert resolved == {"type": "object"}