Example #1
def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add  resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='option',
                                             value=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url,
                            url=resource_link['value'])
        # get the resource name
        resource['name'] = str(resource_link.text).strip()

        # get the format of the resource from the file extension of the link
        resource_format = resource_link['value']\
                        [resource_link['value'].rfind('.') + 1:]
        resource['format'] = resource_format

        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url, resource_link['value'])

        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0: # if no resources were found
        return None

    yield dataset
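Note: every example in this section filters links with base_parser.resource_checker, which is not defined here. BeautifulSoup accepts a callable as the value of an attribute filter (the value=/href= arguments above), keeping only tags for which it returns True. The sketch below is an assumption of what such a checker could look like; the extension list is illustrative, not the project's actual list.

# Hypothetical sketch of an attribute filter usable as
# find_all(name='a', href=resource_checker); NOT the project's real implementation.
DATA_EXTENSIONS = ('.csv', '.xls', '.xlsx', '.json', '.xml', '.zip')  # assumed list

def resource_checker(attr_value) -> bool:
    """ return True if the attribute value looks like a link to a data file """
    if not attr_value:  # BeautifulSoup passes None when the attribute is absent
        return False
    return str(attr_value).lower().endswith(DATA_EXTENSIONS)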
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='accordiontitle', recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = str(container.find(class_='accordionheader').\
                            string).strip()
        if dataset['title'] in (None, '', 'None'):  # str(None) yields 'None'
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]

        dataset['notes'] = str(container.find(name='p').string).\
                                  strip()
        # if no notes/description available, default to dataset title
        if dataset['notes'] in (None, '', 'None'):  # str(None) yields 'None'
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url = res.url,
                                url = resource_link['href'],
                                name = str(resource_link.string).strip())

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield this dataset if at least one resource file was found
        if len(dataset['resources']) > 0:
            yield dataset
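The Dataset and Resource models used throughout are constructed with keyword arguments and then indexed like dicts (dataset['title'] = ...). Their definitions are outside this section; a minimal dict-based sketch that would satisfy that usage is shown below purely as an assumption about their shape (the real project may use Scrapy Items or another mapping type).

# Minimal sketch, assuming the models behave like dicts initialised from kwargs.
class Dataset(dict):
    """ dict-like container for one dataset record """

class Resource(dict):
    """ dict-like container for one downloadable resource file """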
Example #3
def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add  resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='a',
                                             href=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url, url=resource_link['href'])
        # get the resource name iteratively
        for child in resource_link.parent.children:
            if resource_link.parent.name == 'td':
                resource['name'] = str(
                    resource_link.find_parent(name='tr').contents[1]).strip()
            else:
                resource['name'] = str(child).strip()
            if re.sub(r'(<.+>)', '', re.sub(r'(</.+>)', '',
                                            resource['name'])) != "":
                break
        resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
        resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

        if resource_link.parent.parent.find(name=True):

            # build the description from the first named tag in the link's
            # grandparent, followed by the resource name
            resource['description'] = str(resource_link.parent.parent.find(name=True)).strip() +\
                                        " - " + str(resource['name']).strip()
            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])
        else:
            # no surrounding named tag was found, so fall back to the resource name
            resource['description'] = str(resource['name']).strip()
            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

        # get the format of the resource from the file extension of the link
        resource_format = resource_link['href']\
                        [resource_link['href'].rfind('.') + 1:]
        resource['format'] = resource_format

        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url,
                                                     resource_link['href'])

        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0:  # if no resources were found
        return None

    yield dataset
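h.get_resource_headers(res.url, href) is called in every example to attach header information to a resource, but the helper itself is not shown. The sketch below assumes it resolves the link against the page URL and issues an HTTP HEAD request; the real helper may work differently (for instance, by reusing the crawler's own downloader).

# Hypothetical sketch of a header helper; names and behaviour are assumptions.
from urllib.parse import urljoin

import requests

def get_resource_headers(source_url: str, href: str) -> dict:
    """ resolve `href` against the page URL and return the response headers """
    resource_url = urljoin(source_url, href)
    try:
        response = requests.head(resource_url, allow_redirects=True, timeout=10)
        return dict(response.headers)
    except requests.RequestException:
        return {}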
Example #4
def get_all_resources(res, dataset, extensions, deny_list=None):
    # avoid the mutable default argument; treat None as "deny nothing"
    deny_list = deny_list or []
    for link in LxmlLinkExtractor(deny_extensions=[],
                                  deny=deny_list).extract_links(res):
        for extension in extensions.keys():
            if link.url.endswith(extension):
                resource = Resource(
                    source_url=res.url,
                    url=link.url,
                    name=link.text,
                )
                dataset['resources'].append(resource)
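A usage sketch for get_all_resources: the extension map and deny list below are illustrative values only, and res and dataset are assumed to come from the surrounding parse() (with dataset['resources'] already initialised to a list).

# Illustrative call only; the extension map and deny patterns are assumptions.
extensions = {'.csv': 'csv', '.xls': 'excel', '.zip': 'archive'}
dataset['resources'] = list()
get_all_resources(res, dataset, extensions, deny_list=[r'/search/', r'/login/'])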
def parse(res, publisher):
    """ function parses content to create a dataset model """

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # name
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']
        # if, after searching, the description is still not set
        if dataset['notes'] is None or str(dataset['notes']).strip() in ("", "None"):
            # set description to document title
            dataset['notes'] = dataset['title']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']
        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta',
                                    attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            try:
                resource['name'] = str(resource_link.find_parent(name='ul').\
                                find_previous_sibling(name=True))
            except Exception:
                resource['name'] = str(resource_link.string).strip()
            resource['name'] += " " + str(
                resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.\
                find_parent(class_='contentText').contents[0].string).strip()

            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
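The source_url passed to h.extract_dataset_collection_from_url above is built by decoding the request's Referer header inline, which is hard to read. Assuming Scrapy-style headers (byte values, case-insensitive keys), the same value can be computed in two steps, as sketched below; this is a readability note, not a change to the parser.

# Equivalent, more readable form of the Referer lookup (assumes Scrapy-style headers).
referer = res.request.headers.get('Referer', b'')
source_url = referer.decode('utf-8') if isinstance(referer, bytes) else str(referer)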
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(id='maincontent',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url
        if dict(enumerate(
                container.find('div').find_all('div'))).get(0) is None:
            # get the first available div element
            dataset['title'] = str(container.find(name='div').\
                                string).strip()
        else:
            # get the 1st div element from the first available div
            dataset['title'] = str(container.find('div').find_all('div')[0].\
                                    string).strip()

        if dataset['title'] in (None, '', 'None'):  # str(None) yields 'None'
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]
        if container.select_one('p') is not None:
            # get the first available p element
            dataset['notes'] = str(container.select_one('p').string).\
                                  strip()
        elif dict(enumerate(container.find_all('div'))).get(1) is not None:
            # get the 2nd div element
            dataset['notes'] = str(container.find_all('div')[1].\
                                    string).strip()
        else:
            # get the 2nd div element from the 1st available div element
            dataset['notes'] = str(container.\
                                    find('div').find_all('div')[1].\
                                    string).strip()
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] in (None, '', 'None'):  # str(None) yields 'None'
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield this dataset if at least one resource file was found
        if len(dataset['resources']) > 0:
            yield dataset
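Every example derives the resource format by slicing the href after the last '.', which also captures any query string (e.g. 'report.csv?year=2020' yields 'csv?year=2020'). The sketch below shows a stricter variant built on the standard library; adopting it would change behaviour slightly, so it is offered only as an alternative, not as the project's current logic.

# Alternative format extraction (an assumption, not the parsers' current behaviour).
import posixpath
from urllib.parse import urlsplit

def format_from_href(href: str) -> str:
    """ return the lowercase file extension of a link, without the leading dot """
    path = urlsplit(href).path  # drop any query string or fragment first
    return posixpath.splitext(path)[1].lstrip('.').lower()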
Example #7
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        dataset['resources'] = list()

        # add resources found anywhere in the page body to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)

        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(
                            name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                find_previous_sibling(class_='headersLevel2'):

                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel2').\
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0].string or resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        yield dataset
Example #8
def parse(res):
    """ function parses content to create a dataset model
    or return None if no resource in content"""

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    # check if the content contains any of the data extensions
    if soup_parser.body.find(name='a',
                             href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='maincontent',
                                                   recursive=True)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()

        # dataset source url
        dataset['source_url'] = res.url

        # dataset title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # dataset name
        dataset['name'] = slugify(dataset['title'])

        # publisher
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta',
                                attrs={'name': 'ED.office'})['content']

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta',
                                attrs={'name': 'DC.description'})['content']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta',
                                attrs={'name': 'keywords'})['content']

        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta',
                                    attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])

            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])


            if resource_link.find_parent(class_='contentText').\
                find_previous_sibling(class_='headersLevel2'):

                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel2').\
                                                contents[1]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0].string or resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            # after getting the best description possible, remove any leading
            # " - " and surrounding white space
            resource['description'] = re.sub(r'^\s+\-\s+', '',
                                             resource['description'])
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
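The repeated pairs of re.sub(r'(</.+>)', ...) and re.sub(r'(<.+>)', ...) calls above strip markup left over from str(tag) conversions. A small helper consolidating that cleanup is sketched below as an assumption (the examples keep the calls inline); when a Tag object is still at hand, BeautifulSoup's get_text(strip=True) is the more idiomatic route.

import re

def strip_tags(text: str) -> str:
    """ remove closing then opening/self-closing tags, mirroring the inline cleanup """
    text = re.sub(r'(</.+>)', '', text)
    text = re.sub(r'(<.+>)', '', text)
    return text.strip()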
Example #9
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta',attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(name='meta',
                                           attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']
        
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(soup_parser.body.find(class_='headersLevel1',
                                                     recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']
        
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] in (None, '', 'None'):  # str(None) yields 'None'
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']
    
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']
        
        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            resource['name'] = str(resource_link.find_parent(name='ul').\
                                find_previous_sibling(name=True))
            resource['name'] +=  " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.\
                find_parent(class_='contentText').contents[0].string).strip()

            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
Example #10
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.select(
        '.container .content:not(.node-page)')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # build the description from the first child of the link's parent
            # and the second child of the dataset container
            resource['description'] = str(resource_link.\
                                           parent.contents[0]).strip() +\
                                        " - " + str(dict(enumerate(container.\
                                           contents)).get(1, '')).strip()

            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('table')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # look up the DC.title meta tag once; fall back to the <title> tag
        # when it is missing or has an empty 'content' attribute
        dc_title_meta = soup_parser.head.find(name='meta',
                                              attrs={'name': 'DC.title'})
        if dc_title_meta is None or not dc_title_meta.get('content'):
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = dc_title_meta['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            # Use nces by default since this parser is used only when there is an `nces` class in the page
            dataset['publisher'] = 'nces'
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name
            if soup_parser.find(name='th', class_='title',
                                recursive=True) is not None:
                resource['name'] = str(
                    soup_parser.find(name='th', class_='title',
                                     recursive=True))
            elif soup_parser.body.\
                                    find(name='div', class_='title') is not None:
                resource['name'] = str(soup_parser.body.\
                                    find(name='div', class_='title').string).strip()
            else:
                # get the resource name iteratively
                for child in resource_link.parent.children:
                    resource['name'] = str(child).strip()
                    if re.sub(r'(<.+>)', '',
                              re.sub(r'(</.+>)', '', resource['name'])) != "":
                        break
            # remove any html tags from the resource name
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<[a-z]+/>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])
            resource['name'] = resource['name'].strip()

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if created dataset has resources attached.
        if len(dataset['resources']) == 0:  # no resources so don't yield it
            continue  # skip this loop

        yield dataset
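Each parser repeats the same lookup for DC.title, DC.description, keywords, ED.office and DC.date.valid: find the named meta tag in the head and fall back to a default when it is missing or empty. A helper capturing that pattern is sketched below as an assumption; the examples themselves keep the inline form.

# Hypothetical consolidation of the repeated <meta> lookups; not used by the examples.
def get_meta_content(soup_parser, meta_name: str, default: str = '') -> str:
    """ return the 'content' attribute of <meta name=meta_name> in <head>, or `default` """
    tag = soup_parser.head.find(name='meta', attrs={'name': meta_name})
    if tag is None or not tag.get('content'):
        return default
    return str(tag['content']).strip()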
Example #12
def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(
                            name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.parent.parent.find(name=True):

                # build the description from the first named tag in the link's
                # grandparent, followed by the resource name
                resource['description'] = str(resource_link.parent.parent.find(name=True)).strip() +\
                                            " - " + str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'^\s+\-\s+', '',
                                                 resource['description'])
            else:
                # use the resource name for description
                resource['description'] = str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            # after getting the best description possible, strip any white space
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
Example #13
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   class_='container',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                        namespace="all",
                                        source_url=\
                                        str(res.request.headers.get(str(b'Referer',
                                                                    encoding='utf-8'), b''),
                                            encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if a collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)

        # track resource names already collected by traversing a resource's parent
        traverse_parent_unique_resource_names = list()

        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # avoid assigning the same parent-derived name to more than one resource:
            # check whether this name has already been used
            if resource['name'] in traverse_parent_unique_resource_names:
                # this name was already assigned to another resource,
                # so use the text content of the 'a' tag instead
                resource['name'] = " ".join(
                    list(
                        map(lambda string: str(string),
                            resource_link.stripped_strings)))
            else:
                # since resource name was retrieved by traversing parent,
                # add resource name to the list
                traverse_parent_unique_resource_names.append(resource['name'])

            if resource_link.find_parent(name='p'):
                # use the first piece of content in the enclosing paragraph
                resource['description'] = str(
                    resource_link.find_parent(name='p').contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            else:  # set description to name of resource
                resource['description'] = resource['name']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
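These parsers all filter candidate links through base_parser.resource_checker, passed to find_all as an attribute filter; BeautifulSoup calls such a filter once per tag with the attribute's value (or None when the attribute is absent) and keeps the tags for which it returns a truthy value. The helper's real implementation is not shown in these listings; a minimal sketch, assuming it simply whitelists downloadable file extensions, might look like:

import re

# hypothetical sketch only -- the real base_parser.resource_checker may use
# a different extension list or additional rules
def resource_checker(href) -> bool:
    """ return True if the href looks like a downloadable data file """
    if not href:
        return False
    return re.search(r'\.(csv|xls|xlsx|json|xml|zip|pdf|doc|docx)$',
                     href.lower()) is not None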
Example #14
0
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response body we received is actually a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:  # the response body could not be parsed as HTML
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        # the 'Referer' request header identifies the page that linked here
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                                           namespace="all",
                                                           source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # prefer the 'DC.title' meta tag; fall back to the page <title>
        title_meta = soup_parser.head.find(name='meta',
                                           attrs={'name': 'DC.title'})
        if title_meta is None:
            dataset['title'] = str(soup_parser.head.
                                   find(name='title').string).strip()
        else:
            dataset['title'] = title_meta['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        # get the publisher from the 'ED.office' meta tag,
        # falling back to the parent package name
        office_meta = soup_parser.head.find(name='meta',
                                            attrs={'name': 'ED.office'})
        if office_meta is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = office_meta['content']

        # get the description from the 'DC.description' meta tag, falling
        # back to the first 'headersLevel1' element in the page body
        try:
            description_meta = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})
            if description_meta is None:
                dataset['notes'] = str(
                    soup_parser.body.find(class_='headersLevel1',
                                          recursive=True).string).strip()
            else:
                dataset['notes'] = description_meta['content']
        except (AttributeError, KeyError):  # neither source is available
            dataset['notes'] = dataset['title']

        # if despite best efforts 'notes' is still empty or None,
        # default to the dataset title
        if not dataset.get('notes', None):
            dataset['notes'] = dataset['title']

        # get the tags from the 'keywords' meta tag
        keywords_meta = soup_parser.head.find(name='meta',
                                              attrs={'name': 'keywords'})
        dataset['tags'] = '' if keywords_meta is None else keywords_meta['content']

        # get the date from the 'DC.date.valid' meta tag
        date_meta = soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.date.valid'})
        dataset['date'] = '' if date_meta is None else date_meta['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if the collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)

        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # if the resource link sits in a table cell, use the second cell
            # of the enclosing row as the name
            if resource_link.parent.name == 'td':
                resource['name'] = str(
                    resource_link.find_parent(name='tr').contents[1]).strip()
            else:
                # otherwise walk the parent's children until one with
                # non-empty text (after stripping markup) is found
                for child in resource_link.parent.children:
                    resource['name'] = str(child).strip()
                    if re.sub(r'(<.+>)', '',
                              re.sub(r'(</.+>)', '', resource['name'])) != "":
                        break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            content_text = resource_link.find_parent(class_='contentText')
            headers_level1 = content_text.find_previous_sibling(class_='headersLevel1')
            headers_level2 = content_text.find_previous_sibling(class_='headersLevel2')

            if headers_level2:
                # concatenate the text content of the preceding
                # 'headersLevel1' & 'headersLevel2' siblings
                resource['description'] = str(headers_level1.contents[0]).strip() + \
                    " - " + str(headers_level2.contents[0]).strip()
            else:
                # concatenate the text content of the preceding 'headersLevel1'
                # sibling and the enclosing 'contentText' element
                resource['description'] = str(headers_level1.contents[0]).strip() + \
                    " - " + str(content_text.contents[0].string or
                                content_text.contents[0]).strip()

            # strip any leftover markup from the description
            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

            # after getting the best description possible, remove any leading
            # " - " and surrounding whitespace
            resource['description'] = re.sub(r'^\s+\-\s+', '',
                                             resource.get('description', ''))
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
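Because these parse functions use yield, calling one returns a generator rather than the dict the annotation suggests, and datasets are only produced as the caller iterates over it. A minimal consumption sketch (the surrounding spider or pipeline code is assumed, not shown in these listings):

# hypothetical driver loop; 'res' is assumed to be an already-fetched
# response object exposing .text, .url and .request
for dataset in parse(res):
    # each yielded dataset is a dict-like model with at least one resource
    print(dataset['title'], len(dataset['resources']))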
Example #15
0
def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        # the 'Referer' request header identifies the page that linked here
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                                           namespace="all",
                                                           source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = soup_parser.select(
            'div.MainContent > div.IndicatorList > h4 > a')[0].string

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        dataset['notes'] = dataset['title']

        # append the report source text to the notes when it is present
        report_source = soup_parser.find(name='div',
                                         attrs={'class': 'ReportSource'})
        if report_source:
            dataset['notes'] = f"{dataset['notes']}<br>\n{report_source.string}"

        # link back to the corresponding 'moreinfo' page from state detail pages
        if 'statedetail.aspx' in res.url:
            more_info_url = res.url.replace('statedetail', 'moreinfo')
            dataset['notes'] = f"{dataset['notes']}<br>\n<a href='{more_info_url}'>More Info</a>"

        # get the tags from the 'keywords' meta tag
        keywords_meta = soup_parser.head.find(name='meta',
                                              attrs={'name': 'keywords'})
        dataset['tags'] = '' if keywords_meta is None else keywords_meta['content']

        # get the date from the 'DC.date.valid' meta tag
        date_meta = soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.date.valid'})
        dataset['date'] = '' if date_meta is None else date_meta['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if the collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            file_name = resource_link['href'].split('.')[-2]
            resource['name'] = h.unslugify(file_name.split('/')[-1].strip())

            # use the report source text as the description when it was
            # found; otherwise fall back to the dataset title
            if report_source:
                resource['description'] = report_source.string
            else:
                resource['description'] = dataset['title']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='page',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        # the 'Referer' request header identifies the page that linked here
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(collection_url=res.url,
                                                           namespace="all",
                                                           source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # use the 'site-title' element when present; otherwise fall back
        # to the page <title>
        try:
            dataset['title'] = str(
                container.find(class_='site-title').string).strip()
        except AttributeError:  # no 'site-title' element in this container
            dataset['title'] = str(soup_parser.head.
                                   find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = publisher

        # use the 'site-description' element when present; otherwise fall
        # back to the dataset title
        try:
            dataset['notes'] = str(
                container.find(class_='site-description').string).strip()
        except AttributeError:  # no 'site-description' element
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if the collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if at least one resource was attached to it
        if len(dataset['resources']) > 0:
            yield dataset
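Throughout these examples the resource format is derived by slicing the href after its last dot, which misbehaves when a link carries a query string or fragment. A more defensive variant, sketched here with the standard library as an alternative rather than what the original code does:

from urllib.parse import urlparse
import posixpath

def guess_format(href: str) -> str:
    """ return the file extension of a link, ignoring query strings and fragments """
    path = urlparse(href).path               # drop any '?...' or '#...' part
    _, extension = posixpath.splitext(path)  # e.g. '.csv'
    return extension.lstrip('.').lower()

For example, guess_format('https://example.org/data/report.CSV?ver=2') returns 'csv', whereas the rfind('.') slice used above would yield 'CSV?ver=2'.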
Example #17
0
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('div.MainContent')
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # prefer the 'DC.title' meta tag when it exists and has non-empty
        # content; otherwise fall back to the page <title>
        title_meta = soup_parser.head.find(name='meta',
                                           attrs={'name': 'DC.title'})
        if title_meta is None or not title_meta.get('content'):
            dataset['title'] = str(soup_parser.head.
                                   find(name='title').string).strip()
        else:
            dataset['title'] = title_meta['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get the publisher from the 'ED.office' meta tag,
        # falling back to the parent package name
        office_meta = soup_parser.head.find(name='meta',
                                            attrs={'name': 'ED.office'})
        if office_meta is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = office_meta['content']

        # get the description from the 'DC.description' meta tag,
        # falling back to the dataset title
        description_meta = soup_parser.head.find(name='meta',
                                                 attrs={'name': 'DC.description'})
        if description_meta is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = description_meta['content']

        # get the tags from the 'keywords' meta tag
        keywords_meta = soup_parser.head.find(name='meta',
                                              attrs={'name': 'keywords'})
        dataset['tags'] = '' if keywords_meta is None else keywords_meta['content']

        # get the date from the 'DC.date.valid' meta tag
        date_meta = soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.date.valid'})
        dataset['date'] = '' if date_meta is None else date_meta['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name by iterating over the children of the
            # link's parent until one with non-empty text remains after
            # stripping markup
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])
            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if created dataset has resources attached.
        if len(dataset['resources']) == 0:  # no resources so don't yield it
            continue  # skip this loop

        yield dataset
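The Dataset and Resource models populated throughout these examples are used purely as dictionaries that also accept keyword arguments at construction time. Their real definitions are not part of these listings; a minimal stand-in that would satisfy the usage shown above, offered only as an assumption, is:

# hypothetical stand-ins inferred from how the parsers above use the models;
# the real Dataset and Resource classes may carry extra validation or fields
class Dataset(dict):
    """ dict-like container for a scraped dataset """

class Resource(dict):
    """ dict-like container for a single downloadable resource """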