def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec&PoslanecID=%s&CisObdobia=%s' % (id, term)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)
def current_term():
    """Return the current term of office (the term pre-selected on the MPs page)."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)
    option = html.find('.//select[@id="_sectionLayoutContainer_ctl01__currentTerm"]/option[@selected]')
    return option.get('value')
def debate_of_term1(id):
    """Parse a debate transcript in term 1 format and return list of its
    paragraphs' text content."""
    # download the debate transcript or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.html' % id)
    if os.path.exists(filename):
        with open(filename, 'r') as f:
            content = f.read()
    else:
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        content = scrapeutils.download(url)
        if 'Unexpected error!' in content:
            raise RuntimeError("Debate with id '%s' does not exist" % id)

    # fix markup and parse to HTML tree
    content = content.replace('12. 9. 1995<o:p></o:p>', '12. septembra 1995')
    content = content.replace('<o:p></o:p>', '')
    html = lxml.html.fromstring(content)

    # extract paragraph texts, use blank line as paragraph separator
    result = []
    text = ''
    for par in html.findall('.//p'):
        line = scrapeutils.plaintext(par.text_content())
        if len(line) > 0 and not re.match(r'\w+ deň rokovania', line):
            text += '\n%s' % line
        else:
            if text:
                result.append(scrapeutils.clear_hyphens(text, '\n'))
            text = line
    result.append(scrapeutils.clear_hyphens(text, '\n'))

    return scrapeutils.plaintext(result)
def speaker():
    """Parse current speaker (predseda) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=predseda'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    div = html.find(".//div[@id='_sectionLayoutContainer__panelContent']")
    result = {
        'url': url,
        'meno': div.find(".//h1").text_content(),
    }
    image = div.find('.//img')
    if image is not None:
        result['fotka'] = 'http://www.nrsr.sk/web/' + image.get('src')
    born = div.find("div[@class='article']")
    if born is not None:
        result['narodený'] = re.search(r'Narodený: (.*)', born.text_content()).group(1)
    bio = div.find('table')
    if bio is not None:
        result['životopis'] = lxml.html.tostring(bio, encoding='unicode', with_tail=False)

    return scrapeutils.plaintext(result)
def session_list(term=None):
    """Parse list of sessions in one term of office of the parliament."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/default.aspx?sid=schodze/hlasovanie/schodze'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            '_sectionLayoutContainer$ctl01$_termsCombo': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': url,
        '_items': []
    }
    for li in html.findall('.//div[@id="_sectionLayoutContainer__panelContent"]//ul//li'):
        a = li.find('a')
        link = a.get('href')
        session = {
            'číslo': re.search(r'CisSchodze=(\d+)', link).group(1),
            'názov': a.text,
            'trvanie': re.search(r'\((.+?)\)', li.text_content()).group(1),
            'url': 'http://www.nrsr.sk/web/' + link,
        }
        result['_items'].append(session)

    return scrapeutils.plaintext(result)
def mp_list(term=None):
    """Parse list of MPs."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=%s' % term
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': [{
            'id': re.search(r'PoslanecID=(\d+)', mp.get('href')).group(1),
            'meno': mp.text,
        } for mp in html.findall('.//div[@class="mps_list"]//li/a')]
    }
    return scrapeutils.plaintext(result)
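# A minimal usage sketch, not part of the scraper itself: it only calls functions
# defined in this module and assumes the module-level `terms` dict and imports used
# above are available, and that current_term() returns a key of `terms`. Profile
# fields are read with .get() because their presence depends on the scraped page.
def _example_scrape_mps():
    term = current_term()
    for item in mp_list(term)['_items']:
        profile = mp(item['id'], term)
        print(item['meno'], profile.get('fotka', ''))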
def debate_of_terms56(id):
    """Parse a debate transcript in terms 5-6 format and return its structure."""
    # download the debate transcript
    url = 'http://tv.nrsr.sk/transcript?id=%s' % id
    content = scrapeutils.download(url)

    # parse to HTML tree
    html = lxml.html.fromstring(content)
    main_block = html.find('body/div')
    # guard against a main block with no text at all
    if not len(main_block) and not (main_block.text or '').strip():
        result = {'riadky': []}
    else:
        # parse headings and individual lines used as paragraphs
        main_content = lxml.html.tostring(main_block, encoding='unicode', with_tail=False)
        main_content = main_content[len('<div>'):-len('</div>')]
        main_content = main_content.replace('<p>', '').replace('</p>', '')
        result = {'riadky': re.split(r'<br\s*/?>', main_content)}

    return scrapeutils.plaintext(result)
def debate_of_terms234(id):
    """Parse a debate transcript in terms 2-4 format and return list of its
    paragraphs' text content."""
    # download RTF file or use a local fixed debate if there is one
    filename = os.path.join('fixed_debates', 'debate_%s.rtf' % id)
    if not os.path.exists(filename):
        url = 'http://www.nrsr.sk/dl/Browser/Document?documentId=%s' % id
        rtf = scrapeutils.download(url)
        filename = os.path.join(scrapeutils.WEBCACHE_PATH, 'debate_%s.rtf' % id)
        with open(filename, 'w') as f:
            f.write(rtf)

    # convert from RTF to HTML with unoconv (uses LibreOffice)
    content = subprocess.check_output(['unoconv', '-f', 'html', '--stdout', filename])
    html = lxml.html.fromstring(content)

    result = []
    for par in html.findall('./body/p'):
        result.append(par.text_content())
    return scrapeutils.plaintext(result)
def old_debates_list(term):
    """Parse list of debates for the given term of office from NRSR Digital Library.
    Appropriate for older terms (1.-4.) where debates are not split by speaker."""
    if term not in ['1', '2', '3', '4']:
        raise ValueError("Old style transcripts are not available for term '%s'" % term)

    base_url = 'http://www.nrsr.sk/dl/Browser/Grid?nodeType=DocType&legId=13&chamberId=0' + \
        '&categoryId=1&committeeId=0&documentTypeId=5&folderId=0&meetingNr=' + \
        '&termNr=%s' % term
    result = {
        'url': base_url,
        '_items': []
    }
    page = 0
    while True:
        url = base_url + '&pageIndex=%s' % page
        content = scrapeutils.download(url)
        html = lxml.html.fromstring(content)

        # extract all debates from the current page
        for tr in html.findall('.//table[@class="resultTable"]//tr'):
            sequence_number = tr.findtext('td[1]/a')
            title = tr.find('td[2]/a')
            doc_id = re.search(r'documentId=(\d+)', title.get('href'))
            debate = {
                'časť': sequence_number,
                'názov': title.text,
                'url': 'http://www.nrsr.sk' + title.get('href'),
                'id': doc_id.group(1)
            }
            result['_items'].append(debate)

        page += 1
        pages = html.findtext('.//div[@class="pager"]/span[last()]')
        if page >= int(pages):
            break

    return scrapeutils.plaintext(result)
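# An illustrative sketch combining the debate-list and transcript parsers above for the
# old-format terms: debate_of_term1() for term 1 and debate_of_terms234() for terms 2-4,
# following their docstrings. The default term value is an arbitrary example.
def _example_old_debates(term='2'):
    parse = debate_of_term1 if term == '1' else debate_of_terms234
    for item in old_debates_list(term)['_items']:
        paragraphs = parse(item['id'])
        print(item['časť'], item['názov'], len(paragraphs))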
def deputy_speakers():
    """Parse current deputy speakers (podpredsedovia) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=podpredsedovia'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    result = []
    for div in html.findall(".//div[@class='vicechairman_bigbox']"):
        name = div.find('.//a')
        link = name.get('href')
        id = re.search(r'PoslanecID=(\d+)', link)
        description = div.find(".//div[@class='vicechairman_description']")
        result.append({
            'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
            'meno': name.text,
            'url': 'http://www.nrsr.sk/web/' + link,
            'id': id.group(1),
            'kandidoval(a) za': description.find('div[1]/strong').tail,
            'narodený(á):': description.find('div[2]/strong').tail,
            'národnosť': description.find('div[3]/strong').tail,
        })

    return scrapeutils.plaintext(result)
def new_debates_list(term, since_date=None, until_date=None):
    """Parse list of debate parts for the given term of office from NRSR web.
    Appropriate for newer terms (since 5th) where split debates are available.
    If `since_date` or `until_date` is given in ISO format only the debate parts
    since/until that date are returned.
    """
    if term not in ['5', '6', '7']:
        raise ValueError("Parsed transcripts are not available for term '%s'" % term)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/rozprava'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # a POST request to emulate choice of term in second selectbox and pressing the button
    data = {
        '_sectionLayoutContainer$ctl01$_termNr': term,
        '_sectionLayoutContainer$ctl01$_search': 'Vyhľadať',
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    base_ext = '|new|%s' % term
    if since_date:
        data['_sectionLayoutContainer$ctl01$_dateFrom$dateInput'] = since_date + '-00-00-00'
        base_ext += '|s%s' % since_date
    if until_date:
        data['_sectionLayoutContainer$ctl01$_dateTo$dateInput'] = until_date + '-00-00-00'
        base_ext += '|u%s' % until_date
    content = scrapeutils.download(url, 'POST', data, base_ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all debate parts from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            session_number = tr.find('td[1]')
            date = tr.find('td[2]')
            time_interval = tr.find('td[3]')
            time = re.search(r'(.*?) - (.*)', time_interval.text)
            part_type = time_interval.find('em')
            speaker = tr.find('td[4]')
            speaker_label = speaker.find('br').tail.strip('( ')
            debate_part = {
                'schôdza': session_number.text.replace('.', ''),
                'dátum': date.text,
                'trvanie': {'od': time.group(1), 'do': time.group(2)},
                'druh': part_type.text or '',
                'osoba': {'meno': speaker.findtext('strong'), 'funkcia': speaker_label}
            }
            speaker_link = speaker.find('a')
            if speaker_link is not None:
                speaker_url = speaker_link.get('href')
                id = re.search(r'PoslanecID=(\d+)', speaker_url)
                debate_part['osoba']['url'] = speaker_url
                debate_part['osoba']['id'] = id.group(1)

            for a in tr.findall('td[5]/a'):
                link = a.get('href')
                src = a.find('img').get('src')
                if 'speak' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['video'] = {'url': link, 'id': id.group(1)}
                elif 'all' in src:
                    debate_part['video_rokovania'] = {'url': link}
                elif 'rewrite' in src:
                    id = re.search(r'id=(\d+)', link)
                    debate_part['prepis'] = {'url': link, 'id': id.group(1)}
                else:
                    raise RuntimeError('Unrecognized link in section %s/%s/%s' %
                        (session_number.text, date.text, time_interval.text))

            result['_items'].append(debate_part)

        # test if there is a link to next page
        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__newDebate"]//tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        page += 1
        # a POST request to emulate pager click
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_newDebate',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '_sectionLayoutContainer$ctl01$_termNr': term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = base_ext + '|%s' % page
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
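# An illustrative sketch tying new_debates_list() to debate_of_terms56(): for every
# debate part that links a transcript ('prepis'), fetch its lines. The term and the
# date range are arbitrary example values, not anything mandated by the scraper.
def _example_new_debates():
    parts = new_debates_list('6', since_date='2013-01-01', until_date='2013-01-31')
    for part in parts['_items']:
        if 'prepis' in part:
            transcript = debate_of_terms56(part['prepis']['id'])
            print(part['dátum'], part['osoba']['meno'], len(transcript['riadky']))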
try:
    # get all senators by districts and all political groups
    baseurl = 'http://senat.cz/'
    people = []
    groups = {}
    iid = 100001

    ## people
    for i in range(1, 82):
        print(i)
        url = ("http://senat.cz/senat/volby/hledani/o_obvodu.php?ke_dni=" +
            time.strftime("%d") + "." + time.strftime("%m") + "." + time.strftime("%Y") +
            "&O=10&kod=" + str(i))
        # note: senat.cz incorrectly sends headers as iso/latin-1, so requests decodes
        # the content incorrectly - fixing it here
        domtree = html.fromstring(
            bytes(scrapeutils.download(url), 'iso-8859-1').decode('utf-8')
        )
        tables = domtree.xpath('//table[@class="tHistory"]')
        for table in tables:
            image = baseurl[:-1] + table.xpath('tr/td/img/@src')[0].replace('_110', '')
            name = full_name2name(table.xpath('tr/td/img/@alt')[0])
            ident = re.search(
                'par_3=(\d{1,})', table.xpath('tr/td/a/@href')[0]).group(1).strip()
            people.append({
                'given_name': name['given_name'],
                'family_name': name['family_name'],
                'name':
    p2id[p['name']] = p['id']

def pp2id(name, date, p2id):
    if name == 'Jiří Dienstbier':
        if date < '2011-01-08':
            return '218'
        else:
            return '253'
    else:
        return p2id[name]

scrapeutils.USE_WEBCACHE = False
url = "http://senat.cz/xqw/xervlet/pssenat/hlasa?S=&T=&H=&N=&K=&ID=275&Str=1&Poc=20000"
domtree = html.fromstring(scrapeutils.download(url))
scrapeutils.USE_WEBCACHE = True

def result2result(r):
    if r == 'přijato':
        return 'pass'
    else:
        return 'fail'

# motions, vote-events, votes:
def guess_majority(quorum, present):
    if int(quorum) == 49:
        return 'two-thirds representatives majority'
    if int(quorum) == 41 and int(present) < 81:
def motion(id):
    """Parse a motion/vote-event with individual votes cast by MPs."""
    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % id
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("Motion with id '%s' does not exist" % id)
    html = lxml.html.fromstring(content)

    panel = html.find('.//div[@id="_sectionLayoutContainer__panelContent"]')
    motion = panel.find('.//div[@class="voting_stats_summary_full"]')
    session_link = motion.find('div[1]//a').get('href')
    counts = panel.find('.//div[@id="_sectionLayoutContainer_ctl01_ctl00__resultsTablePanel"]/div')

    result = {
        'url': url,
        'schôdza': {
            'číslo': re.search(r'CisSchodze=(\d+)', session_link).group(1),
            'obdobie': re.search(r'CisObdobia=(\d+)', session_link).group(1),
            'url': 'http://www.nrsr.sk/web/' + session_link,
        },
        'dátum': motion.findtext('div[2]/span'),
        'číslo': motion.findtext('div[3]/span'),
        'názov': motion.findtext('div[4]/span'),
    }
    res = motion.findtext('div[5]/span')
    if res:
        result['výsledok'] = res
    if counts is not None:
        result['súčty'] = {
            'prítomní': counts.findtext('div[1]/span'),
            'hlasujúcich': counts.findtext('div[2]/span'),
            '[z] za': counts.findtext('div[3]/span'),
            '[p] proti': counts.findtext('div[4]/span'),
            '[?] zdržalo sa': counts.findtext('div[5]/span'),
            '[n] nehlasovalo': counts.findtext('div[6]/span'),
            '[0] neprítomní': counts.findtext('div[7]/span'),
        }

    mps = panel.find('.//div[@id="_sectionLayoutContainer_ctl01__bodyPanel"]')
    if mps is not None:
        result['hlasy'] = []
        for td in mps.findall('.//td'):
            if td.get('class') == 'hpo_result_block_title':
                parl_group = td.text.strip()
            else:
                if not td.text:
                    continue
                vote = td.text[1].lower()
                a = td.find('a')
                family_name, _, given_name = a.text.partition(',')
                link = a.get('href')
                id = re.search(r'PoslanecID=(\d+)', link)
                mp = {
                    'meno': given_name.strip() + ' ' + family_name.strip(),
                    'klub': parl_group,
                    'hlas': vote,
                    'id': id.group(1),
                    'url': 'http://www.nrsr.sk/web/' + link
                }
                result['hlasy'].append(mp)

    related_docs = panel.findall('./ul/li[img]/a')
    if related_docs:
        result['dokumenty'] = [{
            'názov': a.text.strip(),
            'url': 'http://www.nrsr.sk/web/' + a.get('href')
        } for a in related_docs]

    return scrapeutils.plaintext(result)
def session(session_number, term=None):
    """Parse a session, i.e. the list of voted motions."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    term = term or max(terms.keys())
    if not session_number.isdigit() or int(session_number) == 0:
        raise ValueError("Invalid session number '%s'" % session_number)

    url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/vyhladavanie_vysledok' + \
        '&ZakZborID=13&CisObdobia=%s&CisSchodze=%s&ShowCisloSchodze=False' % \
        (term, session_number)
    content = scrapeutils.download(url)

    result = {
        'url': url,
        '_items': []
    }
    if 'V systéme nie sú evidované žiadne hlasovania vyhovujúce zadanej požiadavke.' in content:
        return result

    html = lxml.html.fromstring(content)
    page = 1
    while True:
        # extract all motions from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__resultGrid2"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            date = tr.find('td[1]')
            vote_event = tr.find('td[2]/a')
            vote_event_link = vote_event.get('href')
            id = re.search(r'ID=(\d+)', vote_event_link)
            motion = {
                'dátum': date.text_content(),
                'číslo': vote_event.text_content(),
                'názov': tr.findtext('td[4]'),
                'id': id.group(1),
                'url': {
                    'výsledok': 'http://www.nrsr.sk/web/' + vote_event_link,
                }
            }
            object = tr.find('td[3]/a')
            if object is not None:
                motion['čpt'] = {
                    'číslo': object.text_content(),
                    'url': 'http://www.nrsr.sk/web/' + object.get('href')
                }
            vote_link2 = tr.find('td[5]/a').get('href')
            if vote_link2:
                motion['url']['kluby'] = 'http://www.nrsr.sk/web/' + vote_link2
            result['_items'].append(motion)

        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__resultGrid2"]/tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        # POST request to emulate pager click
        page += 1
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_resultGrid2',
            '__EVENTARGUMENT': 'Page$%s' % page,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s|%s' % (term, page)
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
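# A small usage sketch for the voting functions above: list the sessions of a term,
# fetch the motions voted in the first listed session and then the individual votes
# of the first motion. The default term is a hypothetical example value.
def _example_votes(term='6'):
    sessions = session_list(term)['_items']
    motions = session(sessions[0]['číslo'], term)['_items']
    if motions:
        detail = motion(motions[0]['id'])
        print(detail['názov'], len(detail.get('hlasy', [])))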
                update = False
                print("not updating: " + r['_items'][0]['id'])
        except:
            nothing = 0
        if update:
            vpapi.delete("memberships", r['_items'][0]['id'])
            self['id'] = r['_items'][0]['id']
            r = vpapi.post('memberships', self)
            print("updating: " + self['id'])
            # r = vpapi.put('memberships/%s' % r['_items'][0]['id'], self)
        if r['_status'] != 'OK':
            raise Exception(self.name, r)

zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/poslanci.zip', zipped=True)
zarazeni = scrapeutils.zipfile2rows(zfile, 'zarazeni.unl')

from datetime import datetime

i = 0
for row in zarazeni:
    r_org = vpapi.get('organizations', where={'identifiers': {'$elemMatch': {"identifier": row[1].strip(), "scheme": "psp.cz/organy"}}})
    if len(r_org["_items"]) > 0:
        r_pers = vpapi.get('people', where={'identifiers': {'$elemMatch': {"identifier": row[0].strip(), "scheme": "psp.cz/osoby"}}})
        if len(r_pers["_items"]) > 0:
            membership = {
                "label": "Člen",
                "role": "member",
                "person_id": r_pers["_items"][0]['id'],
                "organization_id": r_org["_items"][0]['id'],
                # "id": str(i),
"vote_event_id": r_voteevent["_items"][0]["id"], } try: votes[r_voteevent["_items"][0]["id"]] except: votes[r_voteevent["_items"][0]["id"]] = [] votes[r_voteevent["_items"][0]["id"]].append(vote.copy()) # for k in votes: # vpapi.post("votes",votes[k]) vpapi.post("votes", votes) j = 0 for term in terms: print(term) zfile = scrapeutils.download("http://www.psp.cz/eknih/cdrom/opendata/hl-" + str(term) + "ps.zip", zipped=True) # hl_hlasovani = scrapeutils.zipfile2rows(zfile,'hl'+str(term)+'s.unl') for i in range(1, 4): print(i) try: hl_poslanec = scrapeutils.zipfile2rows(zfile, "hl" + str(term) + "h" + str(i) + ".unl") # savevotes(hl_poslanec) # savevotes(hl_poslanec) votes = {} votesli = [] voteevents = {} people = {} organizations = {} terms = {} for rowp in hl_poslanec: if rowp[1] in rosnicka_vote_events:
logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG, format='%(message)s',
    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)
logging.info('Started')

db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': []})

terms = [1993, 1996, 1998, 2002, 2006, 2010, 2013]
terms = [2013]
test = {}
#terms = [2010]
for term in terms:
    zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip', zipped=True)
    hl_hlasovani = scrapeutils.zipfile2rows(zfile, 'hl'+str(term)+'s.unl')
    saveallmotionsandvoteevents(hl_hlasovani)

#j = 0
#last_ve_id = 0
#voteevents = {}
#people = {}
#organizations = {}
#for term in terms:
#    logging.info('Started year ' + str(term))
#    print('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip')
#    zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip', zipped=True)
#    #hl_hlasovani = scrapeutils.zipfile2rows(zfile,'hl'+str(term)+'s.unl')
def change_list(term=None):
    """Parse list of chamber membership changes."""
    term = term or max(terms.keys())
    if term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci/zmeny'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    # POST request to emulate term selection
    data = {
        '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_currentTerm',
        '_sectionLayoutContainer$ctl01$_currentTerm': term,
        '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
        '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
    }
    ext = '|%s|1' % term
    content = scrapeutils.download(url, 'POST', data, ext)
    html = lxml.html.fromstring(content)

    result = {
        'url': url,
        '_items': []
    }
    page = 1
    while True:
        # extract all changes from the current page
        for tr in html.findall('.//table[@id="_sectionLayoutContainer_ctl01__ResultGrid2"]/tr'):
            if tr.get('class') in ('pager', 'tab_zoznam_header'):
                continue
            date = tr.findtext('td[1]')
            poslanec = tr.find('td[2]')
            text = re.search(r'(\S.*?)\s*\((.*?)\)', poslanec.text_content())
            link = poslanec.find('a').get('href')
            id = re.search(r'PoslanecID=(\d+)', link)
            result['_items'].append({
                'dátum': date,
                'poslanec': {
                    'meno': text.group(1),
                    'url': 'http://www.nrsr.sk/web/' + link,
                    'id': id.group(1),
                    'klub': text.group(2),
                },
                'zmena': tr.findtext('td[3]'),
                'dôvod': tr.findtext('td[4]'),
            })

        current_page = html.find('.//table[@id="_sectionLayoutContainer_ctl01__ResultGrid2"]/tr[1]//span')
        if current_page is None:
            break
        next_page = current_page.getparent().getnext()
        if next_page is None:
            break

        # POST request to emulate pager click
        page += 1
        data = {
            '__EVENTTARGET': '_sectionLayoutContainer$ctl01$_ResultGrid2',
            '_sectionLayoutContainer$ctl01$_currentTerm': term,
            '__EVENTARGUMENT': 'Page$%s' % page,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s|%s' % (term, page)
        content = scrapeutils.download(url, 'POST', data, ext)
        html = lxml.html.fromstring(content)

    return scrapeutils.plaintext(result)
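# A brief sketch of iterating the membership changes parsed above; it relies only on
# the structure built by change_list() itself. The default term is an example value.
def _example_changes(term='6'):
    for change in change_list(term)['_items']:
        print(change['dátum'], change['poslanec']['meno'], change['zmena'])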
def group(type, id):
    """Parse group of a given type (committee, parliamentary group, delegation,
    friendship group) from its profile webpage."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=vybory/vybor&ID=',
            'members_xpath': './/table[@class="tab_zoznam"]//tr',
            'name_xpath': 'td[1]/a/strong',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/kluby/klub&ID=',
            'members_xpath': './/table[@class="tab_zoznam"]//tr',
            'name_xpath': 'td[1]/a/strong',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=eu/delegacie/delegacia&ID=',
            'members_xpath': './/table[@class="tab_details"]//tr',
            'name_xpath': 'td[1]/strong/a',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/Default.aspx?sid=eu/sp/sp&SkupinaId=',
            'members_xpath': './/table[@class="tab_details"]//tr',
            'name_xpath': 'td[1]/strong/a',
        },
    }
    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)

    url = types[type]['url'] + str(id)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("group of type '%s' with id '%s' not found" % (type, id))
    content = content.replace('member_vez', 'member')  # exception in committee with id=119
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    result['názov'] = html.findtext('.//h1')
    podnadpis = html.find('.//h2/span')
    if podnadpis is not None:
        p = podnadpis.text_content()
        if p not in ('', 'Zoznam členov'):
            result['podnadpis'] = p
    opis = html.find('.//span[@id="_sectionLayoutContainer_ctl01__basicInfoText"]')
    if opis is not None:
        result['opis'] = lxml.html.tostring(opis, encoding='unicode', with_tail=False)

    # current term and older terms are displayed differently
    if 'Zoznam členov' in content:
        # scraping current term - contact information and member cards
        for tr in html.findall('.//table[@class="tab_details"]//tr'):
            label = tr.findtext('td[1]/span')
            if not label:
                continue
            value = tr.find('td[2]/span')
            result[label.lower().rstrip('.:')] = value.text_content() if value is not None else ''
        other_docs = html.find('.//a[@id="_sectionLayoutContainer_ctl01__otherDocumentsLink"]')
        if other_docs is not None:
            result['ďalšie dokumenty'] = other_docs.get('href')

        result['členovia'] = []
        for div in html.findall('.//div[@class="member"]'):
            member = {
                'id': re.search(r'PoslanecID=(\d+)', div.find('.//a').get('href')).group(1),
                'fotka': 'http://www.nrsr.sk/web/' + div.find('.//img').get('src'),
                'meno': div.findtext('.//a/strong'),
                'obdobia': [{'rola': div.findtext('.//span[1]').lower()}],
            }
            if type != 'parliamentary group':
                member['klub'] = div.findtext('.//em')[1:-1]
                if member['klub'] in ('-', 'nie je členom poslaneckého klubu'):
                    member['klub'] = None
                elif not member['klub'].startswith('Klub '):
                    member['klub'] = 'Klub ' + member['klub']
            result['členovia'].append(member)
    else:
        # scraping older terms - list of members with membership roles and durations
        result['členovia'] = []
        for i, tr in enumerate(html.findall(types[type]['members_xpath'])):
            if type in ('parliamentary group', 'committee') and i < 2:
                continue
            member = {
                'id': re.search(r'PoslanecID=(\d+)', tr.find('td[1]//a').get('href')).group(1),
                'meno': tr.findtext(types[type]['name_xpath']),
                'obdobia': [],
            }
            for period in tr.findtext('td[2]').split(', '):
                membership = re.search(r'([^\(]*)\((.+?) - (.+?)\)', period, re.DOTALL)
                if membership:
                    member['obdobia'].append({
                        'rola': membership.group(1),
                        'od': membership.group(2),
                        'do': membership.group(3),
                    })
            result['členovia'].append(member)

    return scrapeutils.plaintext(result, ['opis'])
# scrape current representatives from praha.eu and save them in temp datafiles
import scrapeutils
from lxml import html, etree
import csv
import re

outfile = open('tempdata/current_people.csv', 'w')
outwriter = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC)

url = "http://www.praha.eu/jnp/cz/o_meste/primator_a_volene_organy/zastupitelstvo/seznam_zastupitelu/index.html?size=100"
domtree = html.fromstring(scrapeutils.download(url))
trs = domtree.xpath('//tbody/tr')

# for each person
for tr in trs:
    row = []
    tds = tr.xpath('td')
    row.append(re.search('memberId=(\d{1,})', tds[0].xpath('a/@href')[0]).group(1).strip())
    row.append(tds[0].xpath('a')[0].text.strip())
    try:
        row.append(tds[1].text.strip())
    except:
        row.append('')
    row.append(tds[2].xpath('a')[0].text.strip())
    outwriter.writerow(row)

outfile.close()
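# A minimal read-back sketch for the temp datafile written above; the columns are
# exactly those the writer produces (member id, name and two further text columns
# from the listing table), nothing more is assumed about their meaning.
def read_current_people(path='tempdata/current_people.csv'):
    with open(path) as f:
        return list(csv.reader(f))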
def group_list(type, term=None):
    """Parse list of groups of a given type (committee, parliamentary group,
    delegation, friendship group)."""
    types = {
        'committee': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=77',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'parliamentary group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?SectionId=69',
            'term_param_name': '_sectionLayoutContainer$ctl02$_currentTerm',
        },
        'delegation': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/delegacie/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
        'friendship group': {
            'url': 'http://www.nrsr.sk/web/default.aspx?sid=eu/sp/zoznam',
            'term_param_name': '_sectionLayoutContainer$ctl01$_currentTerm',
        },
    }
    if type not in types:
        raise ValueError("unknown type of group '%s'" % type)
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)

    content = scrapeutils.download(types[type]['url'])
    html = lxml.html.fromstring(content)

    # scraping for older terms requires another POST request to emulate selectbox choice
    if term:
        data = {
            types[type]['term_param_name']: term,
            '__VIEWSTATE': html.find('.//input[@id="__VIEWSTATE"]').get('value'),
            '__EVENTVALIDATION': html.find('.//input[@id="__EVENTVALIDATION"]').get('value'),
        }
        ext = '|%s' % term
        content = scrapeutils.download(types[type]['url'], 'POST', data, ext)
        html = lxml.html.fromstring(content)

    # pick list items
    result = {
        'url': types[type]['url'],
        '_items': []
    }
    for li in html.findall('.//ul[@class="longlist"]//li'):
        a = li.find('a')
        group = {
            'id': re.search(r'(ID|SkupinaId)=(\d+)', a.get('href')).group(2),
            'názov': a.text,
        }
        line = li.text_content()
        info = re.search(group['názov'] + r'\s*(\((.+?) - (.+?)\))?\s*(\S.*)?$', line, re.DOTALL)
        if info:
            if info.group(2):
                group['od'] = info.group(2)
                group['do'] = info.group(3)
            if info.group(4):
                group['poznámka'] = info.group(4)
        result['_items'].append(group)

    return scrapeutils.plaintext(result)
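# A hedged sketch combining group_list() and group(): list all committees of the
# current term and fetch the members of each; 'committee' is one of the group types
# handled above, no other values are assumed.
def _example_committees():
    for item in group_list('committee')['_items']:
        detail = group('committee', item['id'])
        print(detail['názov'], len(detail['členovia']))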
    }
    return name

# get all senators by districts and all political groups
baseurl = 'http://senat.cz/'
people = []
groups = {}
iid = 100001
for i in range(1, 82):
    print(i)
    url = "http://senat.cz/senat/volby/hledani/o_obvodu.php?ke_dni=23.02.2015&O=10&kod=" + str(i)
    # note: senat.cz incorrectly sends headers as iso/latin-1, so requests decodes
    # the content incorrectly - fixing it here
    domtree = html.fromstring(
        bytes(scrapeutils.download(url), 'iso-8859-1').decode('utf-8')
    )
    tables = domtree.xpath('//table[@class="tHistory"]')
    for table in tables:
        image = baseurl[:-1] + table.xpath('tr/td/img/@src')[0].replace('_110', '')
        name = full_name2name(table.xpath('tr/td/img/@alt')[0])
        ident = re.search('par_3=(\d{1,})', table.xpath('tr/td/a/@href')[0]).group(1).strip()
        people.append({
            'given_name': name['given_name'],
            'family_name': name['family_name'],
            'name': name['name'],