def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='option',
                                             value=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url,
                            url=resource_link['value'])
        # get the resource name
        resource['name'] = str(resource_link.text).strip()
        # get the format of the resource from the file extension of the link
        resource_format = resource_link['value'][resource_link['value'].rfind('.') + 1:]
        resource['format'] = resource_format
        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url, resource_link['value'])
        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0:
        # if no resources were found
        return None

    yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='accordiontitle',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url
        dataset['title'] = str(container.find(class_='accordionheader').string).strip()
        if dataset['title'] is None or dataset['title'] == '':
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]
        dataset['notes'] = str(container.find(name='p').string).strip()
        # if no notes/description available, default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']
        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='a',
                                             href=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url,
                            url=resource_link['href'])
        # get the resource name iteratively
        for child in resource_link.parent.children:
            if resource_link.parent.name == 'td':
                resource['name'] = str(
                    resource_link.find_parent(name='tr').contents[1]).strip()
            else:
                resource['name'] = str(child).strip()
            if re.sub(r'(<.+>)', '',
                      re.sub(r'(</.+>)', '', resource['name'])) != "":
                break
        resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
        resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

        if resource_link.parent.parent.find(name=True):
            # concatenate the text of the first named element in the link's
            # grandparent with the resource name
            resource['description'] = (
                str(resource_link.parent.parent.find(name=True)).strip()
                + " - " + str(resource['name']).strip())
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
        else:
            # fall back to the resource name
            resource['description'] = str(resource['name']).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

        # get the format of the resource from the file extension of the link
        resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
        resource['format'] = resource_format
        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0:
        # if no resources were found
        return None

    yield dataset

def get_all_resources(res, dataset, extensions, deny_list=[]):
    for link in LxmlLinkExtractor(deny_extensions=[],
                                  deny=deny_list).extract_links(res):
        for extension in extensions.keys():
            if link.url.endswith(extension):
                resource = Resource(
                    source_url=res.url,
                    url=link.url,
                    name=link.text,
                )
                dataset['resources'].append(resource)

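# A minimal usage sketch for get_all_resources (a hypothetical caller, not part of
# the original parsers). It assumes `res` is a Scrapy Response, that `Dataset` and
# `Resource` are the dict-like models used throughout these parsers, and that
# LxmlLinkExtractor comes from Scrapy's link extractors as imported below.
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


def parse_with_link_extractor(res):
    # map of file extensions to a format label (illustrative values)
    extensions = {'.csv': 'csv', '.xls': 'xls', '.xlsx': 'xlsx', '.zip': 'zip'}

    dataset = Dataset()
    dataset['source_url'] = res.url
    dataset['resources'] = list()

    # collect every link whose URL ends with one of the known extensions,
    # skipping any URL matched by the deny pattern
    get_all_resources(res, dataset, extensions, deny_list=[r'/print/'])

    if len(dataset['resources']) > 0:
        yield dataset
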
def parse(res, publisher):
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # name
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']
        # if, after searching, the description is still not satisfactorily set
        if dataset['notes'] is None or str(dataset['notes']).strip() == "":
            # set description to document title
            dataset['notes'] = dataset['title']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            try:
                resource['name'] = str(resource_link.find_parent(name='ul')
                                       .find_previous_sibling(name=True))
            except:
                resource['name'] = str(resource_link.string).strip()
            resource['name'] += " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.find_parent(
                class_='contentText').contents[0].string).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(id='maincontent',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if dict(enumerate(container.find('div').find_all('div'))).get(0) is None:
            # get the first available div element
            dataset['title'] = str(container.find(name='div').string).strip()
        else:
            # get the 1st div element from the first available div
            dataset['title'] = str(
                container.find('div').find_all('div')[0].string).strip()
        if dataset['title'] is None or dataset['title'] == '':
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]

        if container.select_one('p') is not None:
            # get the first available p element
            dataset['notes'] = str(container.select_one('p').string).strip()
        elif dict(enumerate(container.find_all('div'))).get(1) is not None:
            # get the 2nd div element
            dataset['notes'] = str(container.find_all('div')[1].string).strip()
        else:
            # get the 2nd div element from the 1st available div element
            dataset['notes'] = str(
                container.find('div').find_all('div')[1].string).strip()
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        yield dataset

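# The parsers above and below repeatedly strip residual HTML tags from scraped
# names and descriptions with two chained re.sub calls. A small helper like the
# sketch below could factor that pattern out; `strip_tags` is a hypothetical
# name and is not part of the original code.
def strip_tags(text: str) -> str:
    """ remove closing, then opening/self-closing, HTML tags from a string """
    text = re.sub(r'(</.+>)', '', text)
    text = re.sub(r'(<.+>)', '', text)
    return text.strip()
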
def parse(res):
    """ function parses content to create a dataset model or
    return None if no resource in content """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    # check if the content contains any of the data extensions
    if soup_parser.body.find(name='a',
                             href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found
    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='maincontent',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()

        # dataset source url
        dataset['source_url'] = res.url

        # dataset title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # dataset name
        dataset['name'] = slugify(dataset['title'])

        # publisher
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[1]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, remove any leading " - "
            # and trailing white space
            resource['description'] = re.sub(r'^\s+\-\s+', '', resource['description'])
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(soup_parser.body.find(class_='headersLevel1',
                                                         recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            resource['name'] = str(resource_link.find_parent(name='ul')
                                   .find_previous_sibling(name=True))
            resource['name'] += " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.find_parent(
                class_='contentText').contents[0].string).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.select(
        '.container .content:not(.node-page)')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # concatenate the first child of the resource link's parent with
            # the 2nd child of the container
            resource['description'] = (
                str(resource_link.parent.contents[0]).strip()
                + " - "
                + str(dict(enumerate(container.contents)).get(1, '')).strip())
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('table')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if (soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None) \
                or (soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.title'})['content'] is None
                    or soup_parser.head.find(name='meta',
                                             attrs={'name': 'DC.title'})['content'] == ""):
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            # use 'nces' by default since this parser is used only when there is
            # an `nces` class in the page
            dataset['publisher'] = 'nces'
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name
            if soup_parser.find(name='th', class_='title', recursive=True) is not None:
                resource['name'] = str(
                    soup_parser.find(name='th', class_='title', recursive=True))
            elif soup_parser.body.find(name='div', class_='title') is not None:
                resource['name'] = str(soup_parser.body.find(
                    name='div', class_='title').string).strip()
            else:
                # get the resource name iteratively
                for child in resource_link.parent.children:
                    resource['name'] = str(child).strip()
                    if re.sub(r'(<.+>)', '',
                              re.sub(r'(</.+>)', '', resource['name'])) != "":
                        break
            # remove any html tags from the resource name
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<[a-z]+/>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])
            resource['name'] = resource['name'].strip()

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if the created dataset has resources attached
        if len(dataset['resources']) == 0:
            # no resources, so don't yield it; skip this iteration
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.parent.parent.find(name=True):
                # concatenate the text of the first named element in the link's
                # grandparent with the resource name
                resource['description'] = (
                    str(resource_link.parent.parent.find(name=True)).strip()
                    + " - " + str(resource['name']).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
                resource['description'] = re.sub(r'^\s+\-\s+', '', resource['description'])
            else:
                # use the resource name for the description
                resource['description'] = str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, strip any white space
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   class_='container',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        # hold the list of resource names collected strictly by traversing the
        # resource's parent
        traverse_parent_unique_resource_names = list()
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # to ensure that the same name is not repeated for a resource when using
            # parental traversal, check if the retrieved name has already been assigned
            if resource['name'] in traverse_parent_unique_resource_names:
                # the retrieved resource name has already been assigned to another
                # resource, so retrieve the content of the 'a' tag as the name
                resource['name'] = " ".join(
                    list(map(lambda string: str(string),
                             resource_link.stripped_strings)))
            else:
                # since the resource name was retrieved by traversing the parent,
                # add the resource name to the list
                traverse_parent_unique_resource_names.append(resource['name'])

            if resource_link.find_parent(name='p'):
                resource['description'] = str(
                    resource_link.find_parent(name='p').contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # set description to name of resource
                resource['description'] = resource['name']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        try:
            if soup_parser.head.find(name='meta',
                                     attrs={'name': 'DC.description'}) is None:
                dataset['notes'] = str(
                    soup_parser.body.find(class_='headersLevel1',
                                          recursive=True).string).strip()
            else:
                dataset['notes'] = soup_parser.head.find(
                    name='meta', attrs={'name': 'DC.description'})['content']
        except:
            dataset['notes'] = dataset['title']
        # if, despite best efforts, 'notes' is still empty or None
        if not dataset.get('notes', None):
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the page to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, remove any leading " - "
            # and trailing white space
            resource['description'] = re.sub(r'^\s+\-\s+', '',
                                             resource.get('description', ''))
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = soup_parser.select(
            'div.MainContent > div.IndicatorList > h4 > a')[0].string
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        dataset['notes'] = dataset['title']
        if soup_parser.find(name='div', attrs={'class': 'ReportSource'}):
            report_source = soup_parser.find(name='div',
                                             attrs={'class': 'ReportSource'})
            dataset['notes'] = f"{dataset['notes']}<br>\n{report_source.string}"
        if 'statedetail.aspx' in res.url:
            more_info_url = res.url.replace('statedetail', 'moreinfo')
            dataset['notes'] = f"{dataset['notes']}<br>\n<a href='{more_info_url}'>More Info</a>"

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # derive the resource name from the file name in the link
            file_name = resource_link['href'].split('.')[-2]
            resource['name'] = h.unslugify(file_name.split('/')[-1].strip())
            try:
                resource['description'] = report_source.string
            except:
                resource['description'] = dataset['title']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='page',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        try:
            dataset['title'] = str(container.find(class_='site-title').string).strip()
        except:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # publisher is passed in by the caller
        dataset['publisher'] = publisher

        try:
            dataset['notes'] = str(
                container.find(class_='site-description').string).strip()
        except:
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('div.MainContent')

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if (soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None) \
                or (soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.title'})['content'] is None
                    or soup_parser.head.find(name='meta',
                                             attrs={'name': 'DC.title'})['content'] == ""):
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if the created dataset has resources attached
        if len(dataset['resources']) == 0:
            # no resources, so don't yield it; skip this iteration
            continue

        yield dataset

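# The parsers above repeat the same pattern for every <meta> field: check whether
# the tag exists, then either fall back to a default or read its 'content'
# attribute. A helper like the sketch below could collapse each of those if/else
# blocks into one call; `get_meta_content` is a hypothetical name and is not part
# of the original code.
def get_meta_content(soup_parser, meta_name, default=''):
    """ return the 'content' of a named <meta> tag, or a default if absent """
    meta_tag = soup_parser.head.find(name='meta', attrs={'name': meta_name})
    if meta_tag is None:
        return default
    return meta_tag['content']


# example use inside a parser:
#   dataset['tags'] = get_meta_content(soup_parser, 'keywords')
#   dataset['date'] = get_meta_content(soup_parser, 'DC.date.valid')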