def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='option',
                                             value=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url,
                            url=resource_link['value'])
        # get the resource name
        resource['name'] = str(resource_link.text).strip()
        # get the format of the resource from the file extension of the link
        resource_format = resource_link['value'][resource_link['value'].rfind('.') + 1:]
        resource['format'] = resource_format
        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url, resource_link['value'])
        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0:
        # if no resources were found
        return None

    yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='accordiontitle',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url
        dataset['title'] = str(container.find(class_='accordionheader').string).strip()
        if dataset['title'] is None or dataset['title'] == '':
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]
        dataset['notes'] = str(container.find(name='p').string).strip()
        # if no notes/description available, default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']
        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res, container, dataset) -> dict:
    """ function parses content to create a dataset model """

    # add resources from the 'container' to the dataset
    page_resource_links = container.find_all(name='a',
                                             href=base_parser.resource_checker,
                                             recursive=True)
    for resource_link in page_resource_links:
        resource = Resource(source_url=res.url,
                            url=resource_link['href'])
        # get the resource name iteratively
        for child in resource_link.parent.children:
            if resource_link.parent.name == 'td':
                resource['name'] = str(
                    resource_link.find_parent(name='tr').contents[1]).strip()
            else:
                resource['name'] = str(child).strip()
            if re.sub(r'(<.+>)', '',
                      re.sub(r'(</.+>)', '', resource['name'])) != "":
                break
        resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
        resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

        if resource_link.parent.parent.find(name=True):
            # concatenate the text of the first named element in the link's
            # grandparent with the resource name
            resource['description'] = (
                str(resource_link.parent.parent.find(name=True)).strip()
                + " - " + str(resource['name']).strip())
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
        else:
            # fall back to the resource name
            resource['description'] = str(resource['name']).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

        # get the format of the resource from the file extension of the link
        resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
        resource['format'] = resource_format
        # Add header information to resource object
        resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
        # add the resource to collection of resources
        dataset['resources'].append(resource)

    if len(dataset['resources']) == 0:
        # if no resources were found
        return None

    yield dataset

def get_all_resources(res, dataset, extensions, deny_list=[]):
    for link in LxmlLinkExtractor(deny_extensions=[],
                                  deny=deny_list).extract_links(res):
        for extension in extensions.keys():
            if link.url.endswith(extension):
                resource = Resource(
                    source_url=res.url,
                    url=link.url,
                    name=link.text,
                )
                dataset['resources'].append(resource)

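# A minimal usage sketch for get_all_resources (a hypothetical caller, not part of
# the original parsers). It assumes `res` is a Scrapy Response, that `Dataset` and
# `Resource` are the dict-like models used throughout these parsers, and that
# LxmlLinkExtractor comes from Scrapy's link extractors as imported below.
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


def parse_with_link_extractor(res):
    # map of file extensions to a format label (illustrative values)
    extensions = {'.csv': 'csv', '.xls': 'xls', '.xlsx': 'xlsx', '.zip': 'zip'}

    dataset = Dataset()
    dataset['source_url'] = res.url
    dataset['resources'] = list()

    # collect every link whose URL ends with one of the known extensions,
    # skipping any URL matched by the deny pattern
    get_all_resources(res, dataset, extensions, deny_list=[r'/print/'])

    if len(dataset['resources']) > 0:
        yield dataset
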
def parse(res, publisher):
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        # title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # name
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']
        # if, after searching, the description is still not satisfactorily set
        if dataset['notes'] is None or str(dataset['notes']).strip() == "":
            # set description to document title
            dataset['notes'] = dataset['title']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            try:
                resource['name'] = str(resource_link.find_parent(name='ul')
                                       .find_previous_sibling(name=True))
            except:
                resource['name'] = str(resource_link.string).strip()
            resource['name'] += " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.find_parent(
                class_='contentText').contents[0].string).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(id='maincontent',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if dict(enumerate(container.find('div').find_all('div'))).get(0) is None:
            # get the first available div element
            dataset['title'] = str(container.find(name='div').string).strip()
        else:
            # get the 1st div element from the first available div
            dataset['title'] = str(
                container.find('div').find_all('div')[0].string).strip()
        if dataset['title'] is None or dataset['title'] == '':
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # get publisher from parent package name
        dataset['publisher'] = __package__.split('.')[-2]

        if container.select_one('p') is not None:
            # get the first available p element
            dataset['notes'] = str(container.select_one('p').string).strip()
        elif dict(enumerate(container.find_all('div'))).get(1) is not None:
            # get the 2nd div element
            dataset['notes'] = str(container.find_all('div')[1].string).strip()
        else:
            # get the 2nd div element from the 1st available div element
            dataset['notes'] = str(
                container.find('div').find_all('div')[1].string).strip()
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(
                soup_parser.body.find(class_='headersLevel1',
                                      recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        yield dataset

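# The parsers above and below repeatedly strip residual HTML tags from scraped
# names and descriptions with two chained re.sub calls. A small helper like the
# sketch below could factor that pattern out; `strip_tags` is a hypothetical
# name and is not part of the original code.
def strip_tags(text: str) -> str:
    """ remove closing, then opening/self-closing, HTML tags from a string """
    text = re.sub(r'(</.+>)', '', text)
    text = re.sub(r'(<.+>)', '', text)
    return text.strip()
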
def parse(res):
    """ function parses content to create a dataset model or
    return None if no resource in content """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    # check if the content contains any of the data extensions
    if soup_parser.body.find(name='a',
                             href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found
    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='maincontent',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()

        # dataset source url
        dataset['source_url'] = res.url

        # dataset title
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # dataset name
        dataset['name'] = slugify(dataset['title'])

        # publisher
        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        # description
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        # tags
        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        # date
        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[1]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, remove any leading " - "
            # and trailing white space
            resource['description'] = re.sub(r'^\s+\-\s+', '', resource['description'])
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)
    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = str(soup_parser.body.find(class_='headersLevel1',
                                                         recursive=True).string).strip()
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']
        # if notes/description is still not available (after best efforts),
        # default to dataset title
        if dataset['notes'] is None or dataset['notes'] == '':
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(name='a',
                                                 href=base_parser.resource_checker,
                                                 recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            resource['name'] = str(resource_link.find_parent(name='ul')
                                   .find_previous_sibling(name=True))
            resource['name'] += " " + str(resource_link.parent.contents[0]).strip()
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            resource['description'] = str(resource_link.find_parent(
                class_='contentText').contents[0].string).strip()
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.select(
        '.container .content:not(.node-page)')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # concatenate the first child of the resource link's parent with
            # the 2nd child of the container
            resource['description'] = (
                str(resource_link.parent.contents[0]).strip()
                + " - "
                + str(dict(enumerate(container.contents)).get(1, '')).strip())
            resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
            resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('table')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if (soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None) \
                or (soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.title'})['content'] is None
                    or soup_parser.head.find(name='meta',
                                             attrs={'name': 'DC.title'})['content'] == ""):
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            # use 'nces' by default since this parser is used only when there is
            # an `nces` class in the page
            dataset['publisher'] = 'nces'
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name
            if soup_parser.find(name='th', class_='title', recursive=True) is not None:
                resource['name'] = str(
                    soup_parser.find(name='th', class_='title', recursive=True))
            elif soup_parser.body.find(name='div', class_='title') is not None:
                resource['name'] = str(soup_parser.body.find(
                    name='div', class_='title').string).strip()
            else:
                # get the resource name iteratively
                for child in resource_link.parent.children:
                    resource['name'] = str(child).strip()
                    if re.sub(r'(<.+>)', '',
                              re.sub(r'(</.+>)', '', resource['name'])) != "":
                        break
            # remove any html tags from the resource name
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<[a-z]+/>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])
            resource['name'] = resource['name'].strip()

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if the created dataset has resources attached
        if len(dataset['resources']) == 0:
            # no resources, so don't yield it; skip this iteration
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.parent.parent.find(name=True):
                # concatenate the text of the first named element in the link's
                # grandparent with the resource name
                resource['description'] = (
                    str(resource_link.parent.parent.find(name=True)).strip()
                    + " - " + str(resource['name']).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
                resource['description'] = re.sub(r'^\s+\-\s+', '', resource['description'])
            else:
                # use the resource name for the description
                resource['description'] = str(resource['name']).strip()
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, strip any white space
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   class_='container',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        # hold the list of resource names collected strictly by traversing the
        # resource's parent
        traverse_parent_unique_resource_names = list()
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # to ensure that the same name is not repeated for a resource when using
            # parental traversal, check if the retrieved name has already been assigned
            if resource['name'] in traverse_parent_unique_resource_names:
                # the retrieved resource name has already been assigned to another
                # resource, so retrieve the content of the 'a' tag as the name
                resource['name'] = " ".join(
                    list(map(lambda string: str(string),
                             resource_link.stripped_strings)))
            else:
                # since the resource name was retrieved by traversing the parent,
                # add the resource name to the list
                traverse_parent_unique_resource_names.append(resource['name'])

            if resource_link.find_parent(name='p'):
                resource['description'] = str(
                    resource_link.find_parent(name='p').contents[0]).strip()
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # set description to name of resource
                resource['description'] = resource['name']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # ensure that the response text received is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    # create parser object
    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except:
        return None

    dataset_containers = soup_parser.body.find_all(class_='contentText',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        try:
            if soup_parser.head.find(name='meta',
                                     attrs={'name': 'DC.description'}) is None:
                dataset['notes'] = str(
                    soup_parser.body.find(class_='headersLevel1',
                                          recursive=True).string).strip()
            else:
                dataset['notes'] = soup_parser.head.find(
                    name='meta', attrs={'name': 'DC.description'})['content']
        except:
            dataset['notes'] = dataset['title']
        # if, despite best efforts, 'notes' is still empty or None
        if not dataset.get('notes', None):
            dataset['notes'] = dataset['title']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the page to the dataset
        page_resource_links = soup_parser.body.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            for child in resource_link.parent.children:
                # check if the resource is contained in a table
                if resource_link.parent.name == 'td':
                    resource['name'] = str(
                        resource_link.find_parent(name='tr').contents[1]).strip()
                else:
                    resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            if resource_link.find_parent(class_='contentText').\
                    find_previous_sibling(class_='headersLevel2'):
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'headersLevel2'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText')
                          .find_previous_sibling(class_='headersLevel2')
                          .contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])
            else:
                # concatenate the text content of parents with
                # class 'headersLevel1' & 'contentText'
                resource['description'] = (
                    str(resource_link.find_parent(class_='contentText')
                        .find_previous_sibling(class_='headersLevel1')
                        .contents[0]).strip()
                    + " - "
                    + str(resource_link.find_parent(class_='contentText').contents[0].string
                          or resource_link.find_parent(class_='contentText').contents[0]).strip())
                resource['description'] = re.sub(r'(</.+>)', '', resource['description'])
                resource['description'] = re.sub(r'(<.+>)', '', resource['description'])

            # after getting the best description possible, remove any leading " - "
            # and trailing white space
            resource['description'] = re.sub(r'^\s+\-\s+', '',
                                             resource.get('description', ''))
            resource['description'] = resource['description'].strip()

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.find_all(name='body')

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        dataset['title'] = soup_parser.select(
            'div.MainContent > div.IndicatorList > h4 > a')[0].string
        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        dataset['publisher'] = publisher

        dataset['notes'] = dataset['title']
        if soup_parser.find(name='div', attrs={'class': 'ReportSource'}):
            report_source = soup_parser.find(name='div',
                                             attrs={'class': 'ReportSource'})
            dataset['notes'] = f"{dataset['notes']}<br>\n{report_source.string}"
        if 'statedetail.aspx' in res.url:
            more_info_url = res.url.replace('statedetail', 'moreinfo')
            dataset['notes'] = f"{dataset['notes']}<br>\n<a href='{more_info_url}'>More Info</a>"

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # derive the resource name from the file name in the link
            file_name = resource_link['href'].split('.')[-2]
            resource['name'] = h.unslugify(file_name.split('/')[-1].strip())
            try:
                resource['description'] = report_source.string
            except:
                resource['description'] = dataset['title']

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        if len(dataset['resources']) == 0:
            continue

        yield dataset

def parse(res, publisher) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.find_all(name='div',
                                                   id='page',
                                                   recursive=True)

    # check if this page is a collection (i.e. collection of datasets)
    if len(dataset_containers) > 0:  # this is a collection
        # create the collection (with a source)
        collection = h.extract_dataset_collection_from_url(
            collection_url=res.url,
            namespace="all",
            source_url=str(res.request.headers.get(
                str(b'Referer', encoding='utf-8'), b''), encoding='utf-8'))

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        try:
            dataset['title'] = str(container.find(class_='site-title').string).strip()
        except:
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])
        # publisher is passed in by the caller
        dataset['publisher'] = publisher

        try:
            dataset['notes'] = str(
                container.find(class_='site-description').string).strip()
        except:
            dataset['notes'] = dataset['title']

        dataset['tags'] = ''
        dataset['date'] = ''
        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""

        # specify the collection which the dataset belongs to
        if collection:  # if collection exists
            dataset['collection'] = collection

        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'],
                                name=str(resource_link.string).strip())
            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # only yield the dataset if it has at least one resource
        if len(dataset['resources']) > 0:
            yield dataset

def parse(res) -> dict:
    """ function parses content to create a dataset model """

    # create parser object
    soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')

    dataset_containers = soup_parser.body.select('div.MainContent')

    for container in dataset_containers:
        # create dataset model dict
        dataset = Dataset()
        dataset['source_url'] = res.url

        if (soup_parser.head.find(name='meta', attrs={'name': 'DC.title'}) is None) \
                or (soup_parser.head.find(name='meta',
                                          attrs={'name': 'DC.title'})['content'] is None
                    or soup_parser.head.find(name='meta',
                                             attrs={'name': 'DC.title'})['content'] == ""):
            dataset['title'] = str(soup_parser.head.find(name='title').string).strip()
        else:
            dataset['title'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.title'})['content']

        # replace all non-word characters (e.g. ?/) with '-'
        dataset['name'] = slugify(dataset['title'])

        if soup_parser.head.find(name='meta', attrs={'name': 'ED.office'}) is None:
            dataset['publisher'] = __package__.split('.')[-2]
        else:
            dataset['publisher'] = soup_parser.head.find(
                name='meta', attrs={'name': 'ED.office'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.description'}) is None:
            dataset['notes'] = dataset['title']
        else:
            dataset['notes'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.description'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'keywords'}) is None:
            dataset['tags'] = ''
        else:
            dataset['tags'] = soup_parser.head.find(
                name='meta', attrs={'name': 'keywords'})['content']

        if soup_parser.head.find(name='meta', attrs={'name': 'DC.date.valid'}) is None:
            dataset['date'] = ''
        else:
            dataset['date'] = soup_parser.head.find(
                name='meta', attrs={'name': 'DC.date.valid'})['content']

        dataset['contact_person_name'] = ""
        dataset['contact_person_email'] = ""
        dataset['resources'] = list()

        # add resources from the 'container' to the dataset
        page_resource_links = container.find_all(
            name='a', href=base_parser.resource_checker, recursive=True)
        for resource_link in page_resource_links:
            resource = Resource(source_url=res.url,
                                url=resource_link['href'])
            # get the resource name iteratively
            for child in resource_link.parent.children:
                resource['name'] = str(child).strip()
                if re.sub(r'(<.+>)', '',
                          re.sub(r'(</.+>)', '', resource['name'])) != "":
                    break
            resource['name'] = re.sub(r'(</.+>)', '', resource['name'])
            resource['name'] = re.sub(r'(<.+>)', '', resource['name'])

            # the page structure has NO description available for resources
            resource['description'] = ''

            # get the format of the resource from the file extension of the link
            resource_format = resource_link['href'][resource_link['href'].rfind('.') + 1:]
            resource['format'] = resource_format
            # Add header information to resource object
            resource['headers'] = h.get_resource_headers(res.url, resource_link['href'])
            # add the resource to collection of resources
            dataset['resources'].append(resource)

        # check if the created dataset has resources attached
        if len(dataset['resources']) == 0:
            # no resources, so don't yield it; skip this iteration
            continue

        yield dataset

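# The parsers above repeat the same pattern for every <meta> field: check whether
# the tag exists, then either fall back to a default or read its 'content'
# attribute. A helper like the sketch below could collapse each of those if/else
# blocks into one call; `get_meta_content` is a hypothetical name and is not part
# of the original code.
def get_meta_content(soup_parser, meta_name, default=''):
    """ return the 'content' of a named <meta> tag, or a default if absent """
    meta_tag = soup_parser.head.find(name='meta', attrs={'name': meta_name})
    if meta_tag is None:
        return default
    return meta_tag['content']


# example use inside a parser:
#   dataset['tags'] = get_meta_content(soup_parser, 'keywords')
#   dataset['date'] = get_meta_content(soup_parser, 'DC.date.valid')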