def add_presentation(url, category):
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:  # no audience-level entry on the detail page
            continue
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category, conference_id=conference.id, title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
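# The `db` module itself is not part of this excerpt. Judging from the calls
# here (TalkData(...)._asdict(), data.speaker_names.append(...),
# data.organization_names.append(...)), TalkData is assumed to be a namedtuple
# of three list fields. The following is only a sketch of that assumed shape;
# the middle field's real name is a guess.
from collections import namedtuple

TalkData = namedtuple(
    'TalkData', ['speaker_names', 'topic_names', 'organization_names'])

# Example: build the record, then expand it into keyword arguments, mirroring
# the db.add_talk(talk, **data._asdict()) calls in the scrapers.
_example = TalkData(['Guido van Rossum'], ['core'], [])
assert _example._asdict()['speaker_names'] == ['Guido van Rossum']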
def add_presentation_from_table(url, category):
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            print('skipping...')
            continue
        title = a.text
        abstract = a.get('title')
        if 'canceled' not in title.lower():
            if 'Jasmine Hsu' in title:
                speakers = title.split(',')
                title = 'Fire, bullets, and productivity'
                level = 'Beginner'
            else:
                speakers = td.findtext('./span[@class="speaker"]').strip()
                speakers = speakers.split(
                    ',') if ',' in speakers else speakers.split('&')
                speakers = [s for s in speakers if s.strip() and '?' not in s]
                level = td.xpath('./comment()')[0].text.splitlines()[1].strip()
                level = 'Beginner' if level == 'Novice' else level
            talk = Talk(category=category, conference_id=conference.id, title=title)
            data = db.TalkData(speakers, [], [])
            talk.abstract = abstract[:10000]
            talk.level = level
            db.add_talk(talk, **data._asdict())
def add_presentation(url, category):
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"box-content")]/*'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    first = next(i for i, e in enumerate(entries) if e.tag == 'h2')

    ## Iterate through and extract the relevant content
    for i in range(int((len(entries) - first) / 3)):
        h2, p, div = entries[first + 3 * i:first + 3 * (1 + i)]
        title = h2.text_content()
        if 'canceled' in title.lower():
            continue
        speakers = p.text_content().strip('\n ').split('\n', 1)[0].split(',')
        speakers = [s for s in speakers if s.strip() and '?' not in s]
        abstract = div.text_content().strip()
        talk = Talk(category=category, conference_id=conference.id, title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        db.add_talk(talk, **data._asdict())
def add_new_talk(title, abstract, speaker, topic):
    # Strip the Wayback Machine script boilerplate that sometimes precedes
    # the abstract text.
    if abstract and "__wbhack.init('https://web.archive.org/web');" in abstract:
        abstract = abstract.split(
            "__wbhack.init('https://web.archive.org/web');", 1)[-1]
    db.add_talk(
        Talk(category=Talk.TALK, conference_id=conference.id,
             title=title, abstract=abstract),
        **db.TalkData([speaker], [topic], [])._asdict())
    print("adding new one not in list:", speaker, title, "\n***\n")
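# Hypothetical usage of add_new_talk for a presentation that is missing from
# the scraped schedule (all argument values below are placeholders, not data
# from the original script):
#add_new_talk('Example talk title', 'Example abstract', 'Jane Doe', 'testing')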
def add_presentation_from_table(url, category):
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            continue
        title = a.text
        abstract = a.get('title')
        if 'canceled' not in title.lower():
            speakers = td.findtext('./span[@class="speaker"]').strip()
            speakers = speakers.split(
                ',') if ',' in speakers else speakers.split('&')
            level = td.findtext('./span[@class="audience_level"]').strip()
            level = 'Beginner' if level == 'Novice' else level
            talk = Talk(category=category, conference_id=conference.id, title=title)
            data = db.TalkData(speakers, [], [])
            talk.abstract = abstract[:10000]
            talk.level = level
            db.add_talk(talk, **data._asdict())
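# Hypothetical example of how the collectors above are presumably driven for a
# given year's schedule pages (this URL is a placeholder, not taken from the
# original script):
#add_presentation_from_table('https://us.pycon.org/2012/schedule/talks/', Talk.TALK)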
new_name = " ".join(name.strip().split()[:3]) volunteer = db.fetch_or_add(Human(name=new_name)) if conference not in volunteer.volunteering: volunteer.volunteering.append(conference) db.session.commit() ## Talks ## ~~~~~~ ## ## Keynotes keynotes = ( #(['Kelsey Hightower'], 'Kubernetes for Pythonistas', ['Google'], 'http://pyvideo.org/pycon-us-2017/keynote-kubernetes-for-pythonistas.html', ['voice', 'kubernetes', 'containers']), ) for speaker_names, title, org, url, topics in keynotes: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) talk.title = title talk.video_url = url data = db.TalkData(speaker_names, topics, org) db.add_talk(talk, **data._asdict()) ## Tutorials, talks, and posters def add_presentation(url, category): print("Collecting from {}".format(url)) xpath = '//div[contains(@class,"box-content")]/*' entries = html.fromstring(requests.get(url).text).xpath(xpath) first = next(i for i, e in enumerate(entries) if e.tag == 'h2') ## Iterate through and extract the relevant content for i in range(int((len(entries) - first) / 3)): h2, p, div = entries[first + 3 * i:first + 3 * (1 + i)]
name = " ".join(name.strip().split()[:2]) if name and len(name) > 1: volunteer = db.fetch_or_add(Human(name=name)) if conference not in volunteer.volunteering: volunteer.volunteering.append(conference) db.session.commit() ## Talks keynotes = ( (['Guido van Rossum'], 'Update on the state of Python', None), (['Steve Huffman', 'Alexis Ohanian'], 'Reddit', "Reddit's origin and the switch to Python") ) for speaker_names, title, abstract in keynotes: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) talk.title = title if title == 'Reddit': data.organization_names.append('Reddit') if abstract: talk.abstract = abstract data = db.TalkData(speaker_names, [], []) db.add_talk(talk, **data._asdict()) ## Tutorials ## ==> Ignore these...the links are broken and only the presenters' ## last names are given, so it is hard to create an entry. ## #wayback = 'https://web.archive.org/web/20090518174359/' #url = wayback + 'http://us.pycon.org:80/2009/tutorials/schedule'
db.session.commit()

## Talks
## ~~~~~~
##
## Keynotes
keynotes = (
    (['Eben Upton'],
     'The Raspberry Pi: providing children around the world the opportunity to learn programming',
     ['Raspberry Pi Foundation'],
     'http://pyvideo.org/pycon-us-2013/keynote-2.html', ['education']),
    (['Raymond Hettinger'], 'What makes Python Awesome', [],
     'http://pyvideo.org/pycon-us-2013/keynote-3.html', ['core']),
    #(['Jessica McKellar'], 'How the Internet works', [],
    # 'http://pyvideo.org/pycon-us-2013/how-the-internet-works.html',
    # ['web', 'twisted', 'scapy']),
    (['Guido van Rossum'],
     'Announcing asyncio for the standard library (PEP 3156)', [],
     'http://pyvideo.org/pycon-us-2013/keynote-1.html',
     ['concurrency', 'standard library'])
)

for speaker_names, title, org, url, topics in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    talk.video_url = url
    data = db.TalkData(speaker_names, topics, org)
    db.add_talk(talk, **data._asdict())

## Tutorials
## ==> Ignore these...the links are broken and only the presenters'
##     last names are given, so it is hard to create an entry.
##
url = 'https://us.pycon.org/2013/schedule/tutorials/list/'
xpath = '//div[contains(@class,"presentation")]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('tutorials')
print('volunteers')
print(url)
for volunteer_name in html.fromstring(requests.get(url).text).xpath(xpath):
    volunteer_name = volunteer_name.text_content().strip()
    if len(volunteer_name) == 0:
        continue
    # There can be multiple comma-separated names.
    for name in volunteer_name.split(','):
        volunteer = db.fetch_or_add(Human(name=name))
        if conference not in volunteer.volunteering:
            volunteer.volunteering.append(conference)
db.session.commit()

## Talks
talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
data = db.TalkData([], [], [])
wayback = 'https://web.archive.org/web/20070207091801/'
url = wayback + 'http://us.pycon.org:80/TX2007/Keynotes'
xpath = '//div[@id="wikitext"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
first_talk = next(i for i, e in enumerate(entries) if e.tag == 'h2')
entries = entries[first_talk:-1]
print('talks')
print(url)
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one.
            db.add_talk(talk, **data._asdict())
            data = db.TalkData([], [], [])
if '.' in new_name:
    new_name = " ".join(name.strip().split()[:3])
volunteer = db.fetch_or_add(Human(name=new_name))
if conference not in volunteer.volunteering:
    volunteer.volunteering.append(conference)
db.session.commit()

## Talks
## ~~~~~~
##
## Keynotes
keynotes = ((['Hilary Mason'], 'Hello, PyCon'),
            (['Guido van Rossum'], 'A Fireside Chat with Guido van Rossum'))

for speaker_names, title in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    data = db.TalkData(speaker_names, [], [])
    db.add_talk(talk, **data._asdict())

## Startup Series
talk = Talk(category=Talk.PLENARY, conference_id=conference.id)
data = db.TalkData([], ['startup'], [])
wayback = 'https://web.archive.org/web/20110316093256/'
url = wayback + 'http://us.pycon.org:80/2011/home/keynotes/'
xpath = '//div[@class="page"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
first_talk = next(i for i, e in enumerate(entries)
                  if e.tag == 'h1' and e.text.startswith('Startup'))
entries = entries[first_talk + 1:]
first_talk = next(i for i, e in enumerate(entries) if e.tag == 'h2')
xpath = '//td[@class="body"]/table/tr/td[1]/text()' entries = html.fromstring(requests.get(url).text).xpath(xpath) print('organizers') print(url) for name in entries: volunteer = db.fetch_or_add(Human(name=name)) if conference not in volunteer.volunteering: volunteer.volunteering.append(conference) print(volunteer) ## Talks # - Keynotes entries = (('Python Features', 'Guido van Rossum', 'Zope Corporation'), ('The Hundred Year Language', 'Paul Graham', None)) for title, speaker, org in entries: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) data = db.TalkData([], [], []) talk.title = title data.speaker_names.append(speaker) if org is not None: data.organization_names.append(org) db.add_talk(talk, **data._asdict()) # - Tutorials # None? -- False; there are some intermixed with the talks; # search for 'tutorial' in the title/abstract # - Regular talks url = 'https://wiki.python.org/moin/PyConDC2003/Speakers' xpath = '//div[@id="content"]/*[not(self::span)]' entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('volunteers')
print(url)
for volunteer_name in html.fromstring(requests.get(url).text).xpath(xpath):
    volunteer_name = volunteer_name.text_content().strip()
    if len(volunteer_name) == 0:
        continue
    # There can be multiple comma-separated names.
    for name in volunteer_name.split(','):
        volunteer = db.fetch_or_add(Human(name=name))
        if conference not in volunteer.volunteering:
            volunteer.volunteering.append(conference)
db.session.commit()

## Talks
talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
data = db.TalkData([], [], [])
wayback = 'https://web.archive.org/web/20070207091801/'
url = wayback + 'http://us.pycon.org:80/TX2007/Keynotes'
xpath = '//div[@id="wikitext"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
first_talk = find_first(entries, lambda e: e.tag == 'h2')
#first_talk = next(i for i, e in enumerate(entries) if e.tag == 'h2')
entries = entries[first_talk:-1]
print('talks')
print(url)
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one.
            db.add_talk(talk, **data._asdict())
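# `find_first` is not defined in this excerpt; the commented-out `next(...)`
# line above shows the intended behavior. A minimal sketch under that
# assumption (return the index of the first element satisfying a predicate):
def find_first(items, predicate):
    """Return the index of the first item for which predicate(item) is true."""
    return next(i for i, item in enumerate(items) if predicate(item))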
name = " ".join(name.strip().split()[:2]) volunteer = db.fetch_or_add(Human(name=name)) if conference not in volunteer.volunteering: volunteer.volunteering.append(conference) db.session.commit() ## Talks wayback = 'https://web.archive.org/web/20080907065646/' url = wayback + 'http://us.pycon.org/2008/conference/keynotes/' xpath = '//div[@id="keynote-talks"]/div[@class="section"]' entries = html.fromstring(requests.get(url).text).xpath(xpath) print('talks') print(url) for e in entries: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) data = db.TalkData([], [], []) data.speaker_names.append(e.findtext('h1')) # Split off the abstract, and remove the 'Topic:' prefix tmp = e.xpath('*[text()[contains(.,"Topic")]]') if len(tmp) == 0: talk.title = "Keynote" else: tmp = re.split('[(:]', tmp[0].text_content()[7:].strip(')')) talk.title = tmp[0].strip() talk.abstract = ' '.join(tt for t in tmp[1:] for tt in t.split('\n')) db.add_talk(talk, **data._asdict()) # Tutorials wayback = 'https://web.archive.org/web/20090202113211/' url = wayback + 'http://us.pycon.org:80/2008/tutorials/schedule/'
print('organizers')
print(url)
for name in entries:
    volunteer = db.fetch_or_add(Human(name=name))
    if conference not in volunteer.volunteering:
        volunteer.volunteering.append(conference)

## Talks
# - Keynotes
entries = (('The virtues of Open Source', 'Mitch Kapor',
            'Open Source Applications Foundation (OSAF)'),
           ('Python State of the Union', 'Guido van Rossum', 'Zope Corporation'),
           ('How to argue about typing', 'Bruce Eckel', None))

for title, speaker, org in entries:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    data = db.TalkData([], [], [])
    talk.title = title
    data.speaker_names.append(speaker)
    if org is not None:
        data.organization_names.append(org)
    db.add_talk(talk, **data._asdict())

# - Tutorials
# None? -- True. (PostMortem recommends tutorials for 2005)

# - Regular talks
#wayback = 'https://web.archive.org/web/20050206212138/'
#url = wayback + 'http://www.python.org:80/pycon/dc2004/schedule.html'
url = 'https://wiki.python.org/moin/PyConDC2004/TalksByCategory'
#xpath = '//td[@class="body"]/table/tbody/tr'