def parse(res, publisher):
    """ function parses content to create a dataset model """

    # ensure that the response text is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # name
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']
        # if after searching, description is still not satisfactorily set
        if dataset['notes'] is None or str(dataset['notes']).strip() == "":
            # set description to document title
            dataset['notes'] = dataset['title']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']
        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta',
                                    attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            try:
                resource['name'] = str(resource_link.find_parent(name='ul').\
                                find_previous_sibling(name=True))
            except Exception:
                resource['name'] = str(resource_link.string).strip()
            resource['name'] += " " + str(
                resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.\
                find_parent(class_='contentText').contents[0].string).strip()

            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
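
# Every example here passes base_parser.resource_checker as the href filter to
# find_all(); BeautifulSoup calls such a callable with each tag's href value and
# keeps the tag only when it returns True. The helper itself is not shown in
# these examples, so the following is only a minimal sketch, assuming it matches
# links by data-file extension (the name and extension list are assumptions):
RESOURCE_EXTENSIONS = ('.csv', '.xls', '.xlsx', '.zip', '.txt', '.pdf', '.doc', '.docx')

def resource_checker(href):
    """Return True when href looks like a downloadable data/document file."""
    if not href:
        return False
    # drop any query string or fragment before checking the extension
    path = href.split('?')[0].split('#')[0]
    return path.lower().endswith(RESOURCE_EXTENSIONS)
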
Example #2
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.select(
        '.container .content:not(.node-page)')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # concatenate the first child of the resource link's parent with
            # the second child of the container
            resource['description'] = str(resource_link.\
                                           parent.contents[0]).strip() +\
                                        " - " + str(dict(enumerate(container.\
                                           contents)).get(1, '')).strip()

            resource['description'] = re.sub(r'(</.+>)', '',
                                             resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '',
                                             resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
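
# Both examples above repeat the same lookup for every <meta> tag in <head>
# (DC.title, DC.description, keywords, DC.date.valid): find the tag, check for
# None, and fall back to a default. A hypothetical helper (get_meta_content is
# not part of the original code) captures the pattern those if/else branches
# implement:
def get_meta_content(soup, meta_name, default=None):
    """Return the content attribute of <meta name=meta_name> in <head>, or default."""
    tag = soup.head.find(name='meta', attrs={'name': meta_name})
    if tag is None or not tag.get('content'):
        return default
    return tag['content']

# e.g. the DC.title branch above roughly corresponds to:
#   dataset['title'] = get_meta_content(soup_parser, 'DC.title') or \
#       str(soup_parser.head.find(name='title').string).strip()
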
Example #3
def parse(res):
    """ function parses content to create a dataset model
    or return None if no resource in content"""

    # ensure that the response text is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    # check if the content contains any of the extensions
    if soup_parser.body.find(name='a',
                             href=base_parser.resource_checker,
                             recursive=True):
        pass
    elif soup_parser.body.find(name='select', recursive=True):
        pass
    else:
        # no resource on this page, so return None
        return None
    # if code gets here, at least one resource was found

    # create dataset model dict
    dataset = Dataset()
    # create the collection (with a source)
    referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
    collection = h.extract_dataset_collection_from_url(
        collection_url=res.url, namespace="all", source_url=referer)
    # specify the collection which the dataset belongs to
    if collection:  # if collection exists
        dataset['collection'] = collection
    dataset['source_url'] = res.url

    dataset['title'] = h.get_meta_value(soup_parser, 'og:title') or \
        str(soup_parser.head.find(name='title').string).strip()

    # replace all non-word characters (e.g. ?/) with '-'
    # also remove site title from the page title
    dataset['name'] = slugify(dataset['title'].split('|')[0])

    dataset['publisher'] = h.get_meta_value(soup_parser, 'og:site_name') or \
        __package__.split('.')[-1]

    dataset['notes'] = h.get_meta_value(soup_parser, 'og:description') or \
        h.get_meta_value(soup_parser, 'description') or ''

    dataset['date'] = h.get_meta_value(soup_parser, 'article:published_time') or \
        h.get_meta_value(soup_parser, 'article:modified_time') or \
        h.get_meta_value(soup_parser, 'og:updated_time') or ''
    if dataset['date']:
        dataset['date'] = parser.parse(dataset['date']).strftime('%Y-%m-%d')

    dataset['contact_person_name'] = ''
    dataset['contact_person_email'] = ''

    dataset['resources'] = list()

    # get option tags that have 'value' attribute linking to a resource file
    option_tags = soup_parser.find_all(name='option',
                                       value=base_parser.resource_checker,
                                       recursive=True)

    for option_tag in option_tags:

        option_value = option_tag['value']
        # if it has options with URLs as values, then engage parser2
        if option_value.startswith(('http', '../', './', '/')):
            return parsers.parser2.parse(res, soup_parser, dataset)

    # run parser1, since parser2 was not activated until this point
    return parsers.parser1.parse(res, soup_parser, dataset)
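
# Example #3 reads OpenGraph and standard meta tags (og:title, og:description,
# article:published_time, ...) through h.get_meta_value(). That helper is not
# shown above; a plausible sketch, assuming it checks both the property= and
# name= attributes and returns the content value or None:
def get_meta_value(soup, key):
    """Return the content of a <meta property=key> or <meta name=key> tag, if any."""
    tag = soup.head.find(name='meta', attrs={'property': key}) or \
          soup.head.find(name='meta', attrs={'name': key})
    if tag is None:
        return None
    return tag.get('content') or None
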
Example #4
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('table')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dc_title = soup_parser.head.find(name='meta', attrs={'name': 'DC.title'})
        if dc_title is None or not dc_title.get('content'):
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = dc_title['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            # Use nces by default since this parser is used only when there is an `nces` class in the page
            dataset['publisher'] = 'nces'
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name
            if soup_parser.find(name='th', class_='title',
                                recursive=True) is not None:
                resource['name'] = str(
                    soup_parser.find(name='th', class_='title',
                                     recursive=True))
            elif soup_parser.body.\
                                    find(name='div', class_='title') is not None:
                resource['name'] = str(soup_parser.body.\
                                    find(name='div', class_='title').string).strip()
            else:
                # get the resource name iteratively
                for child in resource_link.parent.children:
                    resource['name'] = str(child).strip()
                    if re.sub(r'(<.+>)', '',
                              re.sub(r'(</.+>)', '', resource['name'])) != "":
                        break
            # remove any html tags from the resource name
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<[a-z]+/>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])
            resource['name'] = resource['name'].strip()

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if created dataset has resources attached.
        if len(dataset['resources']) == 0:  # no resources so don't yield it
            continue  # skip this loop

        yield dataset
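
# The recurring comment "replace all non-word characters (e.g. ?/) with '-'"
# describes what slugify() does when dataset['name'] is built from the title.
# A quick illustration, assuming the python-slugify package (the actual import
# used by these parsers is not shown):
from slugify import slugify

print(slugify("Digest of Education Statistics, 2019: Table 204.10?"))
# -> roughly 'digest-of-education-statistics-2019-table-204-10'
# (the exact output depends on the slugify implementation in use)
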
Example #5
def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(
                            name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.parent.parent.find(name=True):

                # concatenate the text content of parents with
                # resource name
                resource['description'] = str(resource_link.parent.parent.find(name=True)).strip() +\
                                            " - " + str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'^\s+\-\s+', '',
                                                 resource['description'])
            else:
                # use the resource name for description
                resource['description'] = str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            # after getting the best description possible, strip any white space
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
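
# Each example derives resource['format'] with the slice
# href[href.rfind('.') + 1:]. When the href contains no '.', rfind() returns -1,
# the slice starts at index 0, and the whole URL comes back as the "format".
# A hypothetical guarded equivalent (format_from_href is not in the original):
def format_from_href(href):
    """Return the file extension of href, or '' when there is none."""
    dot = href.rfind('.')
    if dot == -1:
        return ''
    return href[dot + 1:].lower()

# format_from_href('https://example.gov/files/table01.xls')  ->  'xls'
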
Example #6
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        try:
            if soup_parser.head.find(name='meta',
                                     attrs={'name': 'DC.description'}) is None:
                dataset['notes'] = str(
                    soup_parser.body.find(class_='headersLevel1',
                                          recursive=True).string).strip()
            else:
                dataset['notes'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.description'})['content']
        except Exception:
            dataset['notes'] = dataset['title']

        # if despite best efforts 'notes' is still empty or None
        if not dataset.get('notes', None):
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)

        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(
                            name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                find_previous_sibling(class_='headersLevel2'):

                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel2').\
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = str(resource_link.\
                                        find_parent(class_='contentText').\
                                            find_previous_sibling(class_='headersLevel1').\
                                                contents[0]).strip() +\
                                            " - " + str(resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0].string or resource_link.\
                                        find_parent(class_='contentText').\
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])

            # after getting the best description possible, remove any " - "
            # and trailing white space
            resource['description'] = re.sub(r'^\s+\-\s+', '',
                                             resource.get('description', ''))
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
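
# Resource names and descriptions above are cleaned with two substitutions,
# re.sub(r'(</.+>)', '', ...) followed by re.sub(r'(<.+>)', '', ...), applied to
# a stringified node. A small illustration of what that pair actually does (the
# sample strings are made up):
import re

def strip_markup(text):
    text = re.sub(r'(</.+>)', '', text)
    return re.sub(r'(<.+>)', '', text)

print(strip_markup('<a href="data.csv">Enrollment data (CSV)</a>'))
# -> 'Enrollment data (CSV)'
print(strip_markup('<b>Totals</b> and <i>subtotals</i>'))
# -> 'Totals'   (the greedy patterns also swallow text between two tags, which
#                is why bs4's .get_text() is often a safer alternative)
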
Example #7
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0: # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta',attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(name='meta',
                                           attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']
        
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(soup_parser.body.find(class_='headersLevel1',
                                                     recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']
        
        # if notes/description is still not available (after best efforts),
        # default to the dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']
    
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']
        
        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection: # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            resource['name'] = str(resource_link.find_parent(name='ul').\
                                find_previous_sibling(name=True))
            resource['name'] +=  " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.\
                find_parent(class_='contentText').contents[0].string).strip()

            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
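
# Most of the parse() functions in these examples contain `yield`, so calling
# one returns a generator of Dataset objects; the early `return None` statements
# inside them simply end that generator without yielding anything. (Example #3
# is the exception: it has no yield, so it really does return None or delegate
# to a sub-parser.) A sketch of how a caller might consume a generator variant;
# the name `response` is an assumption, since the crawler driving these parsers
# is not shown:
for dataset in parse(response):  # some variants also take a publisher argument
    print(dataset['name'], len(dataset['resources']))
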
Example #8
def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = soup_parser.select(
            'div.MainContent > div.IndicatorList > h4 > a')[0].string

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        dataset['notes'] = dataset['title']

        if soup_parser.find(name='div', attrs={'class': 'ReportSource'}):
            report_source = soup_parser.find(name='div',
                                             attrs={'class': 'ReportSource'})
            dataset['notes'] = f"{dataset['notes']}<br>\n{report_source.string}"

        if 'statedetail.aspx' in res.url:
            more_info_url = res.url.replace('statedetail', 'moreinfo')
            dataset['notes'] = f"{dataset['notes']}<br>\n<a href='{more_info_url}'>More Info</a>"

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            file_name = resource_link['href'].split('.')[-2]
            resource['name'] = h.unslugify(file_name.split('/')[-1].strip())

            try:
                resource['description'] = report_source.string
            except Exception:
                resource['description'] = dataset['title']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)
        if len(dataset['resources']) == 0:
            continue

        yield dataset
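
# Example #8 turns a file name back into a readable resource name with
# h.unslugify(). That helper is not shown here; a plausible sketch, assuming it
# simply undoes the separators that slugify() introduces:
def unslugify(slug):
    """Rough inverse of slugify: 'college-costs_2019' -> 'college costs 2019'."""
    return slug.replace('-', ' ').replace('_', ' ').strip()
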
Example #9
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='accordiontitle', recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0: # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = str(container.find(class_='accordionheader').\
                            string).strip()
        if dataset['title'] is None or dataset['title'] == '':
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]

        dataset['notes'] = str(container.find(name='p').string).\
                                  strip()
        # if no notes/description available, default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection: # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url = res.url,
                                url = resource_link['href'],
                                name = str(resource_link.string).strip())

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource attached
        if len(dataset['resources']) > 0:
            yield dataset
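
# Every example attaches resource['headers'] via h.get_resource_headers(). That
# helper is not included above; a rough sketch of what it might do, assuming it
# resolves the (possibly relative) link against the page URL and issues a HEAD
# request with the `requests` library; the real implementation may differ:
from urllib.parse import urljoin
import requests

def get_resource_headers(page_url, href):
    """Return the HTTP response headers for a resource link as a plain dict."""
    resource_url = urljoin(page_url, href)
    try:
        response = requests.head(resource_url, allow_redirects=True, timeout=30)
        return dict(response.headers)
    except requests.RequestException:
        return {}
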
Example #10
def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   class_='container',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        referer = str(res.request.headers.get('Referer', b''), encoding='utf-8')
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url, namespace="all", source_url=referer)

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'
                                                     }) is None:
            dataset['title'] = str(soup_parser.head.\
                                find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'
                                                     }) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'
                                                     }) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'
                                                     }) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.\
                                find(name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'
                                                     }) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.\
                                    find(name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""

        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add  resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)

        # hold the list of resource names collected strictly by traversing resource parent
        traverse_parent_unique_resource_names = list()

        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url, url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # to ensure that the same name is not repeated for a resource when using parental traversal,
            # check if the retrieved name has been collected and assigned before
            if resource['name'] in traverse_parent_unique_resource_names:
                # the retrieved resource name has already been assigned to another resource
                # then retrieve the content of the 'a' tag as the name
                resource['name'] = " ".join(
                    list(
                        map(lambda string: str(string),
                            resource_link.stripped_strings)))
            else:
                # since resource name was retrieved by traversing parent,
                # add resource name to the list
                traverse_parent_unique_resource_names.append(resource['name'])

            if resource_link.find_parent(name='p'):

                resource['description'] = str(resource_link.\
                                        find_parent(name='p').
                                                contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '',
                                                 resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '',
                                                 resource['description'])
            else:  # set description to name of resource
                resource['description'] = resource['name']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href']\
                            [resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format

            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(
                res.url, resource_link['href'])

            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset
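
# Example #10 falls back to " ".join(... resource_link.stripped_strings) when a
# parent-derived name has already been used for another resource.
# stripped_strings is standard BeautifulSoup: a generator over a tag's text
# fragments with surrounding whitespace removed. A quick illustration with a
# made-up snippet:
import bs4

link = bs4.BeautifulSoup('<a href="t.xls"> Table 1 <span>(Excel)</span> </a>',
                         'html.parser').a
print(" ".join(link.stripped_strings))
# -> 'Table 1 (Excel)'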