def second_level_topics():
    """Return all second-level topics found under the module-level
    ``top_level_topics`` list.

    For each top-level topic, fetches its API page via
    ``parser.parse_url(topic["api_url"])`` and collects every entry in
    that page's ``links.children``.

    Returns:
        list: all children of all top-level topics, in traversal order.
    """
    # Comprehension replaces the original append loop; returning it
    # directly also avoids the original's local variable that shadowed
    # the function's own name.
    return [
        child
        for topic in top_level_topics
        for child in parser.parse_url(topic["api_url"])["links"]["children"]
    ]
"specialist_sectors": ["array of items"],
  "title": "the titles",
  "document_type": "document type"
}

Then go through all the links with https://www.gov.uk/api/content/#{link}
and collect content ids
'''
from api_paginator import ApiPaginator
import page_parser as parser

# Search-API endpoints: items with no specialist-sector tag at all, and
# items that have at least one tag (only the latter is used below).
ROOT_URL_OF_ITEMS_WITHOUT_TAG = "https://www.gov.uk/api/search.json?filter_specialist_sectors=_MISSING"
ROOT_URL_OF_ITEMS_WITH_TAG = "https://www.gov.uk/api/search.json?reject_specialist_sectors=_MISSING"

# NOTE(review): parse_url/ApiPaginator come from project-local modules;
# their exact return shapes are not visible here.
parsed_root = parser.parse_url(ROOT_URL_OF_ITEMS_WITH_TAG)
paginator = ApiPaginator(ROOT_URL_OF_ITEMS_WITH_TAG)

# count items, I don't wanna store this variable here,
# so this method should set an internal variable rather than returning a value
item_count = paginator.items_total(parsed_root)

# tell paginator to count items, this should be done by the pagintaor itself
# move this within another method in the paginator
paginator.calculate_pages(item_count)

page_urls = paginator.page_urls()

import json

# Output file for one JSON object per line (JSONL); opened in "w+" mode.
fo = open('data/items_with_tag.jsonl', "w+")

for page_url in page_urls:
    # NOTE(review): this chunk is truncated here — the body of this loop
    # (and presumably the fo.close()) is outside the visible source.
'''
This script parses first and second level topic's data from GOV.UK's API.
Current output: some logging and the total number of first and second level
topics.
'''
import page_parser as parser

ROOT_URL = "https://www.gov.uk/api/content/topic"


def second_level_topics():
    """Return all second-level topics found under the module-level
    ``top_level_topics`` list.

    For each top-level topic, fetches its API page via
    ``parser.parse_url(topic["api_url"])`` and collects every entry in
    that page's ``links.children``.

    Returns:
        list: all children of all top-level topics, in traversal order.
    """
    # Comprehension replaces the original append loop and avoids the
    # local variable that shadowed the function's own name.
    return [
        child
        for topic in top_level_topics
        for child in parser.parse_url(topic["api_url"])["links"]["children"]
    ]


top_level_topics = parser.parse_url(ROOT_URL)["links"]["children"]
# Bind the result to a distinct name: the original rebound
# ``second_level_topics`` to the call's result, clobbering the function
# and making it uncallable afterwards.
all_second_level_topics = second_level_topics()

print("\nTotal number of first level topics: %d" % len(top_level_topics))
print("\nTotal number of second level topics: %d" % len(all_second_level_topics))
redirects and the content is available on the regular site ''' import page_parser as parser import json import time ROOT_URL = "https://www.gov.uk/api/content" # this is bad because it stores the whole file in a variable lines = [line.rstrip('\n') for line in open('items_with_tag.jsonl')] fo = open('items_with_tag_and_content_id.jsonl', "w+") for line in lines: try: json_line = json.loads(line) if 'content_id' not in json_line: link = json_line['link'] time.sleep(1) parsed_item = parser.parse_url(ROOT_URL + link) item_content_id = parsed_item['content_id'] json_line['content_id'] = item_content_id fo.write( json.dumps(json_line) + "\n" ) print("processed: " + item_content_id) else: print("key present in:" + line[:50]) except: fo.write( item_content_id + "\n" ) print("failed: " + item_content_id) fo.close()