## Record the volunteer (name normalized by the caller) against this
## conference, avoiding duplicate membership rows.
volunteer = db.fetch_or_add(Human(name=new_name))
if conference not in volunteer.volunteering:
    volunteer.volunteering.append(conference)
db.session.commit()

## Talks
## ~~~~~~
##
## Keynotes
keynotes = (
    #(['Kelsey Hightower'], 'Kubernetes for Pythonistas', ['Google'], 'http://pyvideo.org/pycon-us-2017/keynote-kubernetes-for-pythonistas.html', ['voice', 'kubernetes', 'containers']),
)

for speaker_names, title, org, url, topics in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    talk.video_url = url
    data = db.TalkData(speaker_names, topics, org)
    db.add_talk(talk, **data._asdict())

## Tutorials, talks, and posters
def add_presentation(url, category):
    # Scrape one schedule page and add each presentation under `category`.
    # NOTE(review): entries come in (h2, p, div) triples after the first h2 —
    # presumably title / speakers / description; confirm against the page.
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"box-content")]/*'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    first = next(i for i, e in enumerate(entries) if e.tag == 'h2')
    ## Iterate through the triples and extract the relevant content
    for i in range(int((len(entries) - first) / 3)):
        h2, p, div = entries[first + 3 * i:first + 3 * (1 + i)]
        title = h2.text_content()
## Skip blank / single-character names, then register the volunteer.
if name and len(name) > 1:
    volunteer = db.fetch_or_add(Human(name=name))
    if conference not in volunteer.volunteering:
        volunteer.volunteering.append(conference)
db.session.commit()

## Talks
keynotes = (
    (['Guido van Rossum'], 'Update on the state of Python', None),
    (['Steve Huffman', 'Alexis Ohanian'], 'Reddit', "Reddit's origin and the switch to Python")
)

for speaker_names, title, abstract in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    if abstract:
        talk.abstract = abstract
    # BUG FIX: build this talk's TalkData *before* appending to it.
    # Previously the 'Reddit' organization was appended to the previous
    # iteration's data object (already consumed by add_talk) and then
    # immediately discarded when `data` was reassigned below — so the
    # organization never reached the Reddit talk.
    data = db.TalkData(speaker_names, [], [])
    if title == 'Reddit':
        data.organization_names.append('Reddit')
    db.add_talk(talk, **data._asdict())

## Tutorials
## ==> Ignore these...the links are broken and only the presenters'
## last names are given, so it is hard to create an entry.
##
#wayback = 'https://web.archive.org/web/20090518174359/'
#url = wayback + 'http://us.pycon.org:80/2009/tutorials/schedule'
#xpath = '//div[@id="tutorials"]//li'
## Normalize the scraped name to at most its first three words, then
## register the volunteer for this conference (no duplicates).
new_name = " ".join(name.strip().split()[:3])
volunteer = db.fetch_or_add(Human(name=new_name))
if conference not in volunteer.volunteering:
    volunteer.volunteering.append(conference)
db.session.commit()

## Talks
## ~~~~~~
##
## Keynotes
keynotes = (
    (['Hilary Mason'], 'Hello, PyCon'),
    (['Guido van Rossum'], 'A Fireside Chat with Guido van Rossum'),
)

for speaker_names, title in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    data = db.TalkData(speaker_names, [], [])
    db.add_talk(talk, **data._asdict())

## Startup Series
talk = Talk(category=Talk.PLENARY, conference_id=conference.id)
data = db.TalkData([], ['startup'], [])

## Pull the archived keynotes page and position ourselves on the first
## <h2> after the "Startup..." <h1> heading.
wayback = 'https://web.archive.org/web/20110316093256/'
url = wayback + 'http://us.pycon.org:80/2011/home/keynotes/'
xpath = '//div[@class="page"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
first_talk = next(idx for idx, elem in enumerate(entries)
                  if elem.tag == 'h1' and elem.text.startswith('Startup'))
entries = entries[first_talk + 1:]
first_talk = next(idx for idx, elem in enumerate(entries) if elem.tag == 'h2')
i = first_talk
entries = html.fromstring(requests.get(url).text).xpath(xpath)
first_talk = next(idx for idx, elem in enumerate(entries) if elem.tag == 'h2')
entries = entries[first_talk:-1]
print('talks')
print(url)

## Walk the page as a small state machine: each <h2> names a speaker
## (flushing the previous talk if one was completed), and a following
## <p> starting with 'Topic' supplies the talk title.
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one — persist it and start a fresh record.
            db.add_talk(talk, **data._asdict())
            data = db.TalkData([], [], [])
            talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
        # Drop any parenthesized affiliation after the speaker name.
        speaker = e.text_content().split('(')[0].strip()
        data.speaker_names.extend(separators.split(speaker))
    elif e.tag == 'p' and e.text_content().startswith('Topic'):
        talk.title = e.text_content().split(' ', 1)[-1].strip().strip('"')

# don't forget the last one..
if talk.title is not None:
    db.add_talk(talk, **data._asdict())

# Tutorials
talk = Talk(category=Talk.TUTORIAL, conference_id=conference.id)
data = db.TalkData([], [], [])

## Archived TX2007 tutorials page; skip everything up to and including
## the first <h2> header, and drop the trailing element.
wayback = 'https://web.archive.org/web/20070205022526/'
url = wayback + 'http://us.pycon.org:80/TX2007/Tutorials'
xpath = '//div[@id="wikitext"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
header = next(idx for idx, elem in enumerate(entries) if elem.tag == 'h2')
entries = entries[header + 1:-1]
## Talks wayback = 'https://web.archive.org/web/20080907065646/' url = wayback + 'http://us.pycon.org/2008/conference/keynotes/' xpath = '//div[@id="keynote-talks"]/div[@class="section"]' entries = html.fromstring(requests.get(url).text).xpath(xpath) print('talks') print(url) for e in entries: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) data = db.TalkData([], [], []) data.speaker_names.append(e.findtext('h1')) # Split off the abstract, and remove the 'Topic:' prefix tmp = e.xpath('*[text()[contains(.,"Topic")]]') if len(tmp) == 0: talk.title = "Keynote" else: tmp = re.split('[(:]', tmp[0].text_content()[7:].strip(')')) talk.title = tmp[0].strip() talk.abstract = ' '.join(tt for t in tmp[1:] for tt in t.split('\n')) db.add_talk(talk, **data._asdict()) # Tutorials wayback = 'https://web.archive.org/web/20090202113211/' url = wayback + 'http://us.pycon.org:80/2008/tutorials/schedule/' xpath = '//div[@id="content"]//li' entries = html.fromstring(requests.get(url).text).xpath(xpath) # Iterate through and extract the relevant content print('tutorials') print(url) for e in entries: