Example #1
0
def scrape(entrance,
           limit=None,
           ignore=(
               '', 'iso region', 'capital', 'population', 'area in km²',
               'lang', 'continent', 'from', 'till'
           )):
    """Collect region records for the countries listed on the page at *entrance*.

    The rows of ``table#countries`` are read from the fetched page. The first
    row seeds a ``Row`` object (presumably the header row -- confirm against
    ``Row``); each following row is loaded into that object, its country link
    is resolved against *entrance*, passed through ``getRegionUrl`` and the
    result handed to ``getRegionData`` together with the text of the row's
    ``ISO-3166alpha2`` cell.

    While the rows are processed, a SIGINFO handler (where the platform
    defines that signal) prints the country currently being handled to
    stderr.

    Args:
        entrance: URL of the page containing the countries table.
        limit: Maximum number of country rows to process. ``None`` processes
            every row and ``0`` processes none.
        ignore: Field names to drop from the returned field mapping. Only
            iterated, never modified.

    Returns:
        A ``(regions, fields)`` tuple: ``regions`` is the concatenation of
        the data returned by ``getRegionData`` for each processed country;
        ``fields`` is an ``OrderedDict`` merging the field mappings returned
        by ``getRegionData``, minus the *ignore* entries, plus
        ``'name of subdivision'`` mapped to ``'name'``.

    Raises:
        IndexError: If the page yields no ``table#countries`` rows.
    """
    def report(signum, frame):
        # `country` is assigned before the handler is installed, so the
        # closure always sees a bound name.
        print("co:", country['Country'].text, file=sys.stderr)

    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(entrance) as response:
        body = response.read()
    doc = bs4.BeautifulSoup(body)
    countries = doc.select('table#countries tr')
    country = Row(countries.pop(0))
    regions = []
    fields = OrderedDict()
    # Count down on a local so the caller-visible parameter is not mutated.
    remaining = limit

    # SIGINFO only exists on some platforms (e.g. BSD/macOS); skip the
    # progress handler elsewhere rather than failing with AttributeError.
    siginfo = getattr(signal, 'SIGINFO', None)
    previous = None
    if siginfo is not None:
        previous = signal.signal(siginfo, report)
    try:
        for row in countries:
            if remaining:
                remaining -= 1
            elif 0 == remaining:
                break
            country.data = row
            coUrl = parse.urljoin(entrance, country['Country'].a['href'])
            url = getRegionUrl(coUrl)
            (data, rgnFields) = getRegionData(
                country['ISO-3166alpha2'].text, url)
            regions += data
            fields.update(rgnFields)
    finally:
        # Always put the earlier handler back, even when a request or parse
        # step raises. signal.signal() returns None when the previous
        # handler was not installed from Python; fall back to the default.
        if siginfo is not None:
            signal.signal(
                siginfo,
                previous if previous is not None else signal.SIG_DFL)

    for field in ignore:
        if field in fields:
            del fields[field]
    fields['name of subdivision'] = 'name'
    return (regions, fields)
Example #2
0
def scrape(entrance, limit=None, ignore=('', 'iso region', 'capital', 'population', 'area in km²', 'lang', 'continent', 'from', 'till')):
    """Scrape region data for each country row found at *entrance*.

    Fetches *entrance*, selects the rows of ``table#countries`` and uses the
    first one to build a ``Row`` (presumably the header row -- confirm
    against ``Row``). Every later row is assigned to that ``Row``; the link
    in its ``Country`` cell is joined with *entrance*, mapped through
    ``getRegionUrl`` and fetched via ``getRegionData`` along with the text of
    the ``ISO-3166alpha2`` cell.

    On platforms that define SIGINFO, a handler is installed for the duration
    of the loop that prints the current country to stderr.

    Args:
        entrance: URL of the page containing the countries table.
        limit: Maximum number of country rows to process; ``None`` means all
            rows, ``0`` means none.
        ignore: Field names removed from the returned field mapping. Only
            iterated, never modified.

    Returns:
        ``(regions, fields)`` where ``regions`` accumulates the data returned
        by each ``getRegionData`` call and ``fields`` is an ``OrderedDict`` of
        the merged field mappings without the *ignore* entries and with
        ``'name of subdivision'`` set to ``'name'``.

    Raises:
        IndexError: If no ``table#countries`` rows are found.
    """
    def report(signum, frame):
        # `country` is bound before the handler can ever fire.
        print("co:", country['Country'].text, file=sys.stderr)

    # Use the response as a context manager so the connection is closed.
    with request.urlopen(entrance) as response:
        body = response.read()
    doc = bs4.BeautifulSoup(body)
    countries = doc.select('table#countries tr')
    country = Row(countries.pop(0))
    regions = []
    fields = OrderedDict()
    # Work on a copy of the counter; the parameter itself stays untouched.
    remaining = limit

    # Not every platform defines SIGINFO; without it, run with no progress
    # handler instead of raising AttributeError.
    siginfo = getattr(signal, 'SIGINFO', None)
    previous = signal.signal(siginfo, report) if siginfo is not None else None
    try:
        for row in countries:
            if remaining:
                remaining -= 1
            elif 0 == remaining:
                break
            country.data = row
            coUrl = parse.urljoin(entrance, country['Country'].a['href'])
            url = getRegionUrl(coUrl)
            (data, rgnFields) = getRegionData(country['ISO-3166alpha2'].text, url)
            regions += data
            fields.update(rgnFields)
    finally:
        # Restore the handler that was active before the call, including on
        # errors. A None return from signal.signal() means the old handler
        # was not set from Python, so reset to the default in that case.
        if siginfo is not None:
            signal.signal(siginfo, signal.SIG_DFL if previous is None else previous)

    for field in ignore:
        if field in fields:
            del fields[field]
    fields['name of subdivision'] = 'name'
    return (regions, fields)
Example #3
0
def getRegionData(co, url):
    """Extract subdivision rows for country *co* from the page at *url*.

    Every ``<table>`` with class ``restable`` whose ``id`` contains
    ``'subdivtable'`` is processed: its first ``<tr>`` builds a template
    ``Row`` (presumably the header row -- confirm against ``Row``), and each
    remaining ``<tr>`` becomes a shallow copy of that template with the row
    assigned to ``data`` and ``data['country']`` set to *co*.

    Args:
        co: Value stored under ``'country'`` on every region's data.
        url: URL of the page to fetch and parse.

    Returns:
        A ``(data, fields)`` tuple: ``data`` is the list of region ``Row``
        copies; ``fields`` is an ``OrderedDict`` that starts with the key
        ``'country'`` and is updated with ``template.fields`` of each
        matching table.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        body = response.read()
    doc = bs4.BeautifulSoup(body)
    data = []
    fields = OrderedDict.fromkeys(['country'])
    for table in doc('table', 'restable'):
        # .get() so a restable table without an id attribute is skipped
        # rather than aborting the whole scrape with KeyError.
        if 'subdivtable' not in table.get('id', ''):
            continue
        rows = table('tr')
        if not rows:
            # No rows means no template row and no data; nothing to extract.
            continue
        template = Row(rows.pop(0))
        template.default = None
        template.stringify = True
        fields.update(template.fields)
        for row in rows:
            region = copy(template)
            region.data = row
            region.data['country'] = co
            data.append(region)
    return (data, fields)
Example #4
0
def getRegionData(co, url):
    """Fetch *url* and return the subdivision records it lists for *co*.

    Looks at each ``<table>`` of class ``restable`` whose ``id`` contains
    ``'subdivtable'``. The first ``<tr>`` of such a table is wrapped in a
    template ``Row`` (presumably the header row -- confirm against ``Row``);
    every other ``<tr>`` yields a shallow copy of the template whose ``data``
    is that row, with ``data['country']`` set to *co*.

    Args:
        co: Value written to ``data['country']`` of every region.
        url: Address of the page to download and parse.

    Returns:
        ``(data, fields)``: the list of region ``Row`` copies, and an
        ``OrderedDict`` seeded with the key ``'country'`` and updated with
        ``template.fields`` from every matching table.
    """
    # Context manager guarantees the connection is released after reading.
    with request.urlopen(url) as response:
        body = response.read()
    doc = bs4.BeautifulSoup(body)
    data = []
    fields = OrderedDict.fromkeys(['country'])
    subdiv_tables = [
        # A missing id attribute counts as "no match" instead of KeyError.
        x for x in doc('table', 'restable')
        if 'subdivtable' in x.get('id', '')
    ]
    for table in subdiv_tables:
        rows = table('tr')
        if not rows:
            # A table with no rows has neither a template row nor data.
            continue
        template = Row(rows.pop(0))
        template.default = None
        template.stringify = True
        fields.update(template.fields)
        for row in rows:
            region = copy(template)
            region.data = row
            region.data['country'] = co
            data.append(region)
    return (data, fields)