def results_swap_family_members(response):
    """
    Swap out each result in an OPS search response for the most appropriate
    member of its patent family, preferring DE, EP..B, WO, EP..A, EP, US
    in that order. The response DOM is modified in place.
    """

    publication_numbers = []

    # Priority order for choosing the replacement family member:
    # DE, EP..B, WO, EP..A, EP, US
    priorities = [
        {'filter': lambda patent: patent.country.startswith('DE') and not patent.kind.startswith('D1')},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('B')},
        {'filter': 'WO'},
        {'filter': lambda patent: patent.country.startswith('EP') and patent.kind.startswith('A')},
        {'filter': 'EP'},
        {'filter': 'US'},
    ]

    def match_filter(item, filter):
        if callable(filter):
            patent = split_patent_number(item)
            outcome = filter(patent)
        else:
            outcome = item.startswith(filter)
        return outcome

    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/exchange-documents')
    pointer_publication_reference = JsonPointer('/bibliographic-data/publication-reference/document-id')

    # A.1 Compute distinct list with unique families.
    chunks = to_list(pointer_results.resolve(response))
    for chunk in chunks:

        # Prepare list of document cycles.
        cycles = to_list(chunk['exchange-document'])

        # Publication number of first cycle in EPODOC format.
        representation = cycles[0]
        pubref = pointer_publication_reference.resolve(representation)
        representation_pubref_epodoc, _ = _get_document_number_date(pubref, 'epodoc')

        # All publication numbers in DOCDB format.
        representation_pubrefs_docdb = []
        for cycle in cycles:
            pubref = pointer_publication_reference.resolve(cycle)
            representation_pubref_docdb, _ = _get_document_number_date(pubref, 'docdb')
            representation_pubrefs_docdb.append(representation_pubref_docdb)

        # Fetch family members. When failing, use the first cycle as representation.
        try:
            family_info = ops_family_members(representation_pubref_epodoc)
        except Exception:
            log.warning('Failed to fetch family information for %s', representation_pubref_epodoc)
            chunk['exchange-document'] = representation
            request = get_current_request()
            del request.errors[:]
            continue

        # Find a replacement from the list of family members, controlled by the priority list.
        for prio in priorities:

            filter = prio['filter']

            # If the current representation already matches, keep it.
            if match_filter(representation_pubref_epodoc, filter):
                break

            bibdata = None
            found = False
            for member in family_info.items:
                member_pubnum = member['publication']['number-docdb']
                if match_filter(member_pubnum, filter):
                    try:
                        bibdata = ops_biblio_documents(member_pubnum)
                    except Exception:
                        log.warning('Fetching bibliographic data failed for %s', member_pubnum)
                        request = get_current_request()
                        del request.errors[:]
                        continue

                    if bibdata:
                        # TODO: Add marker that this document was swapped, display appropriately.
                        found = True
                        break

            # Swap the representation of the document for the appropriate family
            # member and set a marker in the data structure containing the
            # original document number(s).
            if found:
                representation = bibdata
                representation[0].setdefault('__meta__', {})
                representation[0]['__meta__']['swapped'] = {
                    'canonical': representation_pubrefs_docdb[0],
                    'list': [representation_pubref_epodoc] + representation_pubrefs_docdb,
                }
                break

        # TODO: Duplicate documents may exist here. Prune/deduplicate them.
        # TODO: When choosing German family members (e.g. for EP666666), the abstract
        #       is often missing. => Carry it along from the original representation.

        chunk['exchange-document'] = representation

    # Filter duplicates.
    seen = []
    results = []
    fields = ['@country', '@doc-number', '@kind', '@family-id']
    for chunk in chunks:

        # Prepare list of document cycles.
        cycles = to_list(chunk['exchange-document'])

        # Only look at the first cycle slot.
        doc = cycles[0]

        # Compute a unique document identifier.
        ident = {}
        for key in fields:
            ident[key] = doc[key]

        # Collect the chunk only if it has not been seen yet.
        if ident in seen:
            continue
        seen.append(ident)
        results.append(chunk)

    # Overwrite the reduced list of chunks in the original DOM.
    pointer_results.set(response, results)

    return publication_numbers
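
# A minimal, self-contained sketch of the priority mechanism above, for
# illustration only: the namedtuple and the crude parse() helper stand in
# for the real split_patent_number(), and the sample numbers are made up.
def _example_pick_representative():
    from collections import namedtuple
    SimplePatent = namedtuple('SimplePatent', ['country', 'kind'])

    def parse(number):
        # Crude stand-in: country code is the first two letters,
        # kind code the last two characters.
        return SimplePatent(country=number[:2], kind=number[-2:])

    priorities = [
        lambda patent: patent.country == 'DE' and not patent.kind.startswith('D1'),
        lambda patent: patent.country == 'EP' and patent.kind.startswith('B'),
        'WO',
        lambda patent: patent.country == 'EP' and patent.kind.startswith('A'),
        'EP',
        'US',
    ]

    members = ['US2014000001A1', 'EP0666666B1', 'WO2014000001A1']

    # Walk the priority list and return the first member matching the
    # highest-priority filter, mirroring match_filter() above.
    for filter in priorities:
        for number in members:
            if callable(filter):
                matched = filter(parse(number))
            else:
                matched = number.startswith(filter)
            if matched:
                # Here: 'EP0666666B1', since no DE member exists
                # but an EP..B one does.
                return number
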
def createIndexData(offset, stepindex, valueindex):
    # Store the offset at its own path within the step index, and
    # initialize an empty slot at the same path in the value index.
    pointer = JsonPointer(offset)
    pointer.resolve(stepindex)['_offset'] = offset
    pointer.set(valueindex, {})
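
# Quick reference for the "jsonpointer" primitives used throughout this
# module (JsonPointer.resolve/.set, resolve_pointer, set_pointer). The
# sample document is made up for illustration.
def _example_jsonpointer_usage():
    from jsonpointer import JsonPointer, resolve_pointer, set_pointer

    document = {'ops:biblio-search': {'@total-result-count': '42'}}

    # Resolve a value through a pointer instance.
    pointer = JsonPointer('/ops:biblio-search/@total-result-count')
    assert pointer.resolve(document) == '42'

    # Write through the same pointer, modifying the document in place.
    pointer.set(document, '23')

    # The module-level helpers work without a JsonPointer instance.
    assert resolve_pointer(document, '/ops:biblio-search/@total-result-count') == '23'
    set_pointer(document, '/ops:biblio-search/@total-result-count', '42', inplace=True)
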
def ops_published_data_crawl(constituents, query, chunksize):
    """
    Crawl the OPS published-data search interface: fetch all result chunks
    for the given query and merge them into a single response.
    """

    if constituents != 'pub-number':
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(constituents))

    real_constituents = constituents
    if constituents == 'pub-number':
        constituents = ''

    # Fetch first chunk (records 1-chunksize) from upstream.
    first_chunk = ops_published_data_search(constituents, query, '1-{0}'.format(chunksize))

    pointer_total_count = JsonPointer('/ops:world-patent-data/ops:biblio-search/@total-result-count')
    total_count = int(pointer_total_count.resolve(first_chunk))
    log.info('ops_published_data_crawl total_count: %s', total_count)

    # Only the first 2000 hits are accessible from OPS.
    total_count = min(total_count, 2000)

    # Collect upstream results.
    begin_second_chunk = chunksize + 1
    chunks = [first_chunk]
    for range_begin in range(begin_second_chunk, total_count + 1, chunksize):

        # Countermeasure against robot flagging:
        # <code>CLIENT.RobotDetected</code>
        # <message>Recent behaviour implies you are a robot. The server is at the moment busy to serve robots. Please try again later</message>
        time.sleep(5)

        range_end = range_begin + chunksize - 1
        range_string = '{0}-{1}'.format(range_begin, range_end)
        log.info('ops_published_data_crawl range: ' + range_string)
        chunk = ops_published_data_search(constituents, query, range_string)
        chunks.append(chunk)

    # Merge chunks into a single result.
    #
    # Response shapes per constituent:
    #
    #   <empty>:     "ops:search-result" > "ops:publication-reference": [...]
    #   biblio:      "ops:search-result" > "exchange-documents": [ "exchange-document": {...} ]
    #   abstract:    "ops:search-result" > "exchange-documents": [ "exchange-document": {...} ]
    #   full-cycle:  "ops:search-result" > "exchange-documents": [ "exchange-document": [...] ]
    #   pub-number:  "ops:search-result" > "ops:publication-reference": [
    #       {
    #           "@family-id": "6321653",
    #           "@system": "ops.epo.org",
    #           "document-id": {
    #               "@document-id-type": "docdb",
    #               "country": {"$": "DE"},
    #               "doc-number": {"$": "3705908"},
    #               "kind": {"$": "A1"}
    #           }
    #       },
    #       ...
    #   ]
    pointer_results = JsonPointer('/ops:world-patent-data/ops:biblio-search/ops:search-result/ops:publication-reference')
    all_results = []
    for chunk in chunks:
        # FIXME: Use this for "real_constituents == 'pub-number'" only.
        chunk_results = to_list(pointer_results.resolve(chunk))
        # FIXME: Implement other constituents.
        all_results += chunk_results

    response = None
    if real_constituents == 'pub-number':

        response = first_chunk

        # Delete upstream data.
        del resolve_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result')['ops:publication-reference']

        # Compute own representation.
        publication_numbers = []
        pointer_document_id = JsonPointer('/document-id')
        for entry in all_results:
            pubref = pointer_document_id.resolve(entry)
            pubref_number, pubref_date = _get_document_number_date(pubref, 'docdb')
            publication_numbers.append(pubref_number)

        # Add own representation.
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:search-result/publication-numbers', publication_numbers, inplace=True)

        # Amend metadata.
        new_total_count = str(len(publication_numbers))
        pointer_total_count.set(response, new_total_count)
        set_pointer(response, '/ops:world-patent-data/ops:biblio-search/ops:range', {'@begin': '1', '@end': new_total_count})

    if not response:
        raise ValueError('constituents "{0}" invalid or not implemented yet'.format(real_constituents))

    return response
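
# Sketch of the range arithmetic ops_published_data_crawl uses for OPS
# result paging, under the assumption of 60 total hits and a chunksize
# of 25. Purely illustrative.
def _example_crawl_ranges(total_count=60, chunksize=25):
    ranges = ['1-{0}'.format(chunksize)]
    for range_begin in range(chunksize + 1, total_count + 1, chunksize):
        range_end = range_begin + chunksize - 1
        ranges.append('{0}-{1}'.format(range_begin, range_end))
    # Yields ['1-25', '26-50', '51-75']; the last range may point beyond
    # total_count, in which case OPS simply returns fewer records for it.
    return ranges
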