Example #1
0
def scrape_agenda(engine, wp, session):
    """Scrape the agenda (TOPs) of one plenary session and trigger
    speech scraping for every agenda item found.

    :param engine: database engine, passed through to ``scrape_speeches``.
    :param wp: legislative period (Wahlperiode) identifier.
    :param session: session number within the legislative period.
    :returns: ``True`` when the agenda table was found and processed,
        ``False`` when the page or its table could not be retrieved.
    """
    url = WEBTV_BASE % (session, wp)
    response, doc = _html(url, timeout=4.0)
    if doc is None:
        return False
    table = doc.find('//div[@class="meetingTable"]/table')
    if table is None:
        return False
    data = {'wp': wp, 'session': session}
    rows = table.findall('.//tr')
    for i, row in enumerate(rows):
        tds = row.findall('td')
        if not tds:
            # Rows without <td> cells (e.g. <th> header rows) would
            # previously raise IndexError on tds[0]; skip them.
            continue
        session_name = tds[0].xpath('string()').strip()
        if session_name:
            data['session_name'] = session_name
            # The date is the last whitespace-separated token of the
            # session title, e.g. "... 24.03.2011".
            _, date = session_name.rsplit(' ', 1)
            data['session_url'] = url
            data['session_date'] = datetime.strptime(
                date, "%d.%m.%Y").isoformat()
        anchor = tds[0].find('a')
        if anchor is not None and len(tds) > 1 and i + 1 < len(rows):
            data['item_id'] = anchor.get('name')
            key, label = tds[1].xpath('string()').strip().split('\n', 1)
            data['item_key'] = key.strip().replace('TOP:', '').strip()
            data['item_label'] = label.strip()

            # The item description is hidden in the row following the
            # agenda item; it may legitimately be absent.  Assign None
            # explicitly so a stale value from a previous iteration of
            # the shared ``data`` dict is never carried over.
            text = rows[i + 1].find('.//span[@class="hiddenTopText"]')
            if text is not None:
                data['item_description'] = text.xpath('string()').strip()
            else:
                data['item_description'] = None
            scrape_speeches(engine, data.copy())
    return True
def scrape_index():
    """Yield the URL of every PDF document linked from the yearly
    index pages.

    NOTE(review): ``range(2009, datetime.now().year)`` stops *before*
    the current year — confirm that excluding the running year's index
    is intended.
    """
    current_year = datetime.now().year
    for year in range(2009, current_year):
        index_url = INDEX % year
        response, doc = _html(index_url)
        for anchor in doc.findall('//a'):
            target = urlparse.urljoin(index_url, anchor.get('href'))
            if target.endswith('.pdf'):
                yield target
def scrape_index():
    """Generator over all PDF links found on the yearly index pages
    (years 2009 up to, but excluding, the current one)."""
    for year in range(2009, datetime.now().year):
        index_url = INDEX % year
        response, doc = _html(index_url)
        links = (urlparse.urljoin(index_url, a.get('href'))
                 for a in doc.findall('//a'))
        for link in links:
            if link.endswith('.pdf'):
                yield link
Example #4
0
def scrape_speeches(engine, data):
    """Scrape the speech listing of one agenda item and upsert a row per
    speech into the ``webtv`` table.

    Each speech spans four table rows on the page: the speaker name sits
    in the first row of the group and the speech link in the third.
    """
    url = WEBTV_SPEECHES % (data['wp'], data['session'], data['item_id'])
    response, doc = _html(url)
    rows = doc.findall('//tr')
    table = sl.get_table(engine, 'webtv')
    # Visit only every fourth row — the first of each speech group.
    for idx in range(0, len(rows), 4):
        data['speaker'] = rows[idx].xpath('string()').strip()
        if isinstance(data['speaker'], str):
            # Presumably repairs mis-decoded UTF-8 served as latin-1
            # (mojibake) — confirm against the live page encoding.
            data['speaker'] = data['speaker'].encode('latin-1').decode('utf-8')
        href = rows[idx + 2].find('.//a').get('href')
        data['speech_id'] = href.split('=')[-1]
        sl.upsert(engine, table, data, ['speech_id'])
Example #5
0
def scrape_index():
    """Yield the absolute URL of every internal link on the EXTRAKT
    index page."""
    response, doc = _html(EXTRAKT_INDEX, timeout=120.0)
    anchors = doc.findall("//a[@class='linkIntern']")
    for anchor in anchors:
        yield urljoin(EXTRAKT_INDEX, anchor.get('href'))
Example #6
0
def scrape_index():
    """Yield internal-link URLs from the EXTRAKT index page of every
    configured legislative period (``WAHLPERIODEN``)."""
    for wp in app.config.get('WAHLPERIODEN'):
        index_url = EXTRAKT_INDEX % wp
        response, doc = _html(index_url, timeout=120.0)
        for link in doc.findall("//a[@class='linkIntern']"):
            yield urljoin(index_url, link.get('href'))
def scrape_index():
    """Iterate the configured legislative periods and yield each
    internal link found on that period's EXTRAKT index page."""
    periods = app.config.get('WAHLPERIODEN')
    for period in periods:
        page_url = EXTRAKT_INDEX % period
        response, doc = _html(page_url, timeout=120.0)
        links = [urljoin(page_url, a.get('href'))
                 for a in doc.findall("//a[@class='linkIntern']")]
        for link in links:
            yield link