def add_presentation(url, category):
    """Scrape a PyCon schedule listing page and persist each talk.

    Follows each presentation link on *url*, extracts the title, speakers,
    abstract and audience level, then stores the talk via ``db.add_talk``.
    Relies on module-level ``requests``, ``html`` (lxml), ``Talk``, ``db``
    and ``conference``.

    :param url: schedule listing page to scrape
    :param category: category constant stored on each ``Talk``
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    # Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:
            # BUG FIX: indexing an empty XPath result raises IndexError,
            # not ValueError -- the original handler could never trigger,
            # so a detail page without a level crashed the whole run.
            continue
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        # Truncate to 10000 chars -- presumably the DB column limit.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape a table-layout PyCon schedule page and persist each talk.

    Each ``<td class="slot">`` cell carries the title link (whose ``title``
    attribute holds the abstract), a speaker span, and the audience level
    inside an HTML comment.  One cell is special-cased ('Jasmine Hsu')
    because that page swaps the title and speaker text.

    :param url: schedule table page to scrape
    :param category: category constant stored on each ``Talk``
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    # Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            print('skipping...')
            continue
        title = a.text
        abstract = a.get('title')
        # Guard clause instead of nesting the whole body under the check.
        if 'canceled' in title.lower():
            continue
        if 'Jasmine Hsu' in title:
            # Page quirk: the "title" text is actually the speaker list.
            speakers = title.split(',')
            title = 'Fire, bullets, and productivity'
            level = 'Beginner'
        else:
            speakers = td.findtext('./span[@class="speaker"]').strip()
            speakers = speakers.split(
                ',') if ',' in speakers else speakers.split('&')
            # Drop blank and placeholder "?" entries.  ('?' not in s is the
            # idiomatic form of the original `not '?' in s`.)
            speakers = [s for s in speakers if s.strip() and '?' not in s]
            # The audience level is the second line of an HTML comment
            # embedded in the cell.
            level = td.xpath('./comment()')[0].text.splitlines()[1].strip()
            level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape a table-layout schedule page and persist each talk.

    Variant of the table scraper for pages that expose the audience level
    in a plain ``<span class="audience_level">`` element.

    NOTE(review): this redefines ``add_presentation_from_table`` declared
    earlier in the file; only this later definition is visible to callers.
    Rename one of the two if both variants are needed simultaneously.

    :param url: schedule table page to scrape
    :param category: category constant stored on each ``Talk``
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    # Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            continue
        title = a.text
        abstract = a.get('title')
        # Guard clause instead of nesting the body under the check.
        if 'canceled' in title.lower():
            continue
        speakers = td.findtext('./span[@class="speaker"]').strip()
        speakers = speakers.split(
            ',') if ',' in speakers else speakers.split('&')
        level = td.findtext('./span[@class="audience_level"]').strip()
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
# Scrape the PyCon 2009 talk list through the Wayback Machine.
# Each proposal summary is a flat run of h2/span/div siblings:
#   rows[0]  -> talk number + title
#   rows[1:] -> speaker spans until a duration line, then level,
#               keywords, and finally the abstract.
url = wayback + 'http://us.pycon.org:80/2009/conference/talks/'
xpath = '//*[contains(@class, "proposal_list_summary")]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    rows = e.xpath('./*[self::h2 or self::span or self::div]')
    talk = Talk(category=Talk.TALK, conference_id=conference.id)
    data = db.TalkData([], [], [])
    talk.title = rows[0].text_content().split('\n')[1].strip()
    i = 1  # skip talk number (i=0)
    while i < len(rows):
        # people: collect speaker names until the duration row marks
        # the end of the speaker list
        txt = rows[i].text_content().strip(';')
        if duration.match(txt):
            break
        else:
            # " bio" suffix and "(org)" parentheticals are stripped off
            data.speaker_names.append(txt.split('(')[0].split(' bio')[0])
            # BUG FIX: raw string -- '\(' in a normal string literal is an
            # invalid escape sequence (SyntaxWarning on modern Python).
            data.organization_names.extend(
                re.findall(r'(?<=\()[^\)]+(?=\))', txt))
            i += 1
    # the duration row also carries the audience level as its last word
    talk.level = txt.strip().split()[-1].lower()
    # keywords
    i += 2
    txt = rows[i].text_content().strip()
    data.topic_names.extend(txt.split(','))
    # abstract
    i += 1
    talk.abstract = rows[i].text_content().strip()
    db.add_talk(talk, **data._asdict())
# Example #5
# Scrape the tutorial listing (reuses the `url`/`xpath` bound above) and
# persist one Talk per tutorial entry.
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('tutorials')
print(url)
# Walk each listing entry, follow its detail page, and store the result.
for entry in entries:
    link = entry.find('./h3/a')
    detail_page = html.fromstring(
        requests.get('https://us.pycon.org' + link.get('href')).text)
    # Detail page layout: abstract div, speaker h4 links, and a dl whose
    # two dd values are the audience level and the category.
    abstract = detail_page.xpath('//div[@class="abstract"]')[0].text_content()
    speakers = detail_page.xpath('//h4/a/text()')
    level, category = detail_page.xpath('//dl/dd/text()')
    if level == 'Novice':
        level = 'Beginner'
    talk = Talk(category=Talk.TUTORIAL, conference_id=conference.id,
                title=link.text)
    talk.abstract = abstract[:10000]
    talk.level = level
    data = db.TalkData(speakers, [category], [])
    db.add_talk(talk, **data._asdict())


# Scrape the PyCon 2013 talk list.
# NOTE(review): this loop appears truncated -- `speakers` is bound but
# never stored and no Talk is created or persisted; compare the complete
# `add_presentation` function earlier in this file.  Confirm whether the
# rest of the body was lost, or replace this fragment with a call to
# add_presentation(url, Talk.TALK).
url = 'https://us.pycon.org/2013/schedule/talks/list/'
xpath = '//div[contains(@class,"presentation")]/h3/a'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# Iterate through and extract the relevant content
for a in entries:
    title = a.text
    if 'canceled' in title.lower():
        continue
    # Fetch the talk's detail page to read the speaker list.
    root = html.fromstring(requests.get('https://us.pycon.org'+a.get('href')).text)
    speakers = root.xpath('//h4/a/text()')
# Scrape the PyCon 2007 talk list (via the Wayback Machine) as a flat
# stream of h2/div/span elements, accumulating one Talk at a time:
#   h2   -> a new talk begins; flush the previous one if it has a title
#   div  -> the current talk's abstract
#   span -> either the level line, the categories line, or speaker names
# NOTE(review): `talk` and `data` are read at the top of the loop before
# any visible assignment -- they must be initialized (e.g. an empty Talk
# and TalkData) somewhere above this chunk; confirm before moving this
# code.
wayback = 'https://web.archive.org/web/20070213073856/'
url = wayback + 'http://us.pycon.org:80/apps07/talks/'
xpath = '//*[contains(@class, "proposal_list_summary")]/*[not(self::br)]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one: persist it and start accumulating the next.
            db.add_talk(talk, **data._asdict())
            talk = Talk(category=Talk.TALK, conference_id=conference.id)
            data = db.TalkData([], [], [])
        # Strip the leading "<number>." prefix from the heading text.
        talk.title = e.text_content().split('.', 1)[-1].strip()
    elif e.tag == 'div':
        talk.abstract = e.text_content().strip()
    else:  # span...
        tc = e.text_content()
        if tc.endswith('audio and materials)'):
            # Level line, e.g. "(Intermediate talk; audio and materials)".
            talk.level = tc.split()[1]
        elif tc.startswith('categories'):
            data.topic_names.extend(tc.split(':')[-1].split(','))
        else:  # Speaker names
            # `separators` and `org_matcher` are compiled regexes defined
            # elsewhere in the file -- presumably name-separator and
            # "(organization)" patterns; verify against their definitions.
            speaker = tc.strip('; ').split('(', 1)[0]
            data.speaker_names.extend(separators.split(speaker))
            data.organization_names.extend(org_matcher.findall(tc))

# don't forget the last one..
if talk.title is not None:
    db.add_talk(talk, **data._asdict())
# Scrape another proposal-list page (reuses the `url`/`xpath` bound above).
# Each summary is a run of h2/span/div siblings: title row first, then
# speaker rows until a duration line, then keywords and the abstract.
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    rows = e.xpath('./*[self::h2 or self::span or self::div]')
    talk = Talk(category=Talk.TALK, conference_id=conference.id)
    data = db.TalkData([], [], [])
    talk.title = rows[0].text_content().split('\n')[1].strip()
    i = 1  # skip talk number (i=0)
    while i < len(rows):
        # people: collect speakers until the duration row ends the list
        txt = rows[i].text_content().strip(';')
        if duration.match(txt):
            break
        else:
            # strip " bio" suffix and "(org)" parentheticals from names
            data.speaker_names.append(txt.split('(')[0].split(' bio')[0])
            # BUG FIX: raw string -- '\(' in a normal string literal is an
            # invalid escape sequence (SyntaxWarning on modern Python).
            data.organization_names.extend(
                re.findall(r'(?<=\()[^\)]+(?=\))', txt))
            i += 1
    # the duration row doubles as the level line once the duration text
    # itself is removed
    talk.level = duration.sub('', txt).lower()
    # keywords
    i += 1
    txt = rows[i].text_content().strip()
    data.topic_names.extend(txt.split(','))
    # abstract
    i += 1
    talk.abstract = rows[i].text_content().strip()
    db.add_talk(talk, **data._asdict())
    # debug output left in place to preserve observable behavior
    print(talk)
    print(data)