def add_presentation_from_table(url, category):
    """Scrape a schedule table page and store every presentation it lists.

    Fetches ``url``, selects each ``<td>`` whose class contains ``slot`` and,
    for every cell that holds a title link, creates a ``Talk`` of the given
    ``category`` for ``conference`` and hands it to ``db.add_talk``.

    Cells without a title link are reported with ``skipping...``; titles
    containing "canceled" (case-insensitive) are ignored.
    """
    print("Collecting from {}".format(url))
    page = html.fromstring(requests.get(url).text)
    ## Walk the schedule cells and pull out the relevant content
    for cell in page.xpath('//td[contains(@class,"slot")]'):
        link = cell.find('./span[@class="title"]/a')
        if link is None:
            print('skipping...')
            continue
        title = link.text
        # The abstract is read from the link's ``title`` attribute.
        abstract = link.get('title')
        if 'canceled' in title.lower():
            continue
        if 'Jasmine Hsu' in title:
            # Special case: for this entry the speaker list is taken from the
            # link text itself, and title and level are hard-coded.
            names = title.split(',')
            title = 'Fire, bullets, and productivity'
            level = 'Beginner'
        else:
            raw_names = cell.findtext('./span[@class="speaker"]').strip()
            # Speakers are comma-separated when a comma is present,
            # otherwise '&'-separated.
            delimiter = ',' if ',' in raw_names else '&'
            names = [
                name for name in raw_names.split(delimiter)
                if name.strip() and '?' not in name
            ]
            # The audience level is the second line of the cell's first
            # HTML comment.
            level = cell.xpath('./comment()')[0].text.splitlines()[1].strip()
        if level == 'Novice':
            level = 'Beginner'
        talk = Talk(category=category, conference_id=conference.id,
                    title=title)
        data = db.TalkData(names, [], [])
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_presentation(url, category):
    """Scrape a presentation listing and store one talk per linked entry.

    Fetches ``url``, follows every ``<a>`` found under
    ``div.presentation > h3`` to its detail page on ``https://us.pycon.org``
    and reads speakers, abstract and audience level from that page. Each
    talk is created in the given ``category`` for ``conference`` and passed
    to ``db.add_talk``.

    Entries whose title contains "canceled" (case-insensitive) and entries
    whose detail page lists no audience level are skipped.
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    ## Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:
            # xpath() returns a list; indexing an empty one raises IndexError
            # (not ValueError), which is what "no level on the page" looks
            # like. Skip such entries instead of aborting the whole scrape.
            continue
        # Normalise the site's "Novice" label to "Beginner".
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category, conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_new_talk(title, abstract, speaker, topic):
    """Store a single regular talk with one speaker and one topic.

    If ``abstract`` contains the ``__wbhack.init(...)`` script marker, only
    the text after its first occurrence is kept. The talk is created with
    category ``Talk.TALK`` for ``conference``, passed to ``db.add_talk``,
    and a notice is printed afterwards.
    """
    marker = "__wbhack.init('https://web.archive.org/web');"
    if abstract and marker in abstract:
        # Drop everything up to and including the first marker.
        abstract = abstract.partition(marker)[2]
    talk = Talk(category=Talk.TALK, conference_id=conference.id,
                title=title, abstract=abstract)
    data = db.TalkData([speaker], [topic], [])
    db.add_talk(talk, **data._asdict())
    print("adding new one not in list:", speaker, title, "\n***\n")
def add_presentation(url, category):
    """Scrape a page laid out as repeated (h2, p, div) groups of talks.

    Fetches ``url`` and takes the direct children of every element whose
    class contains ``box-content``. Starting at the first ``<h2>``, the
    children are consumed three at a time: heading (title), paragraph
    (speakers) and div (abstract). Each talk is created in the given
    ``category`` for ``conference`` and passed to ``db.add_talk``. Titles
    containing "canceled" (case-insensitive) are skipped, and trailing
    children that do not fill a complete group of three are ignored.

    Raises ``StopIteration`` if the page contains no ``<h2>`` child.
    """
    print("Collecting from {}".format(url))
    document = html.fromstring(requests.get(url).text)
    nodes = document.xpath('//div[contains(@class,"box-content")]/*')
    start = next(pos for pos, node in enumerate(nodes) if node.tag == 'h2')
    group_count = (len(nodes) - start) // 3
    ## Step through the groups and extract the relevant content
    for offset in range(start, start + 3 * group_count, 3):
        heading, byline, body = nodes[offset:offset + 3]
        title = heading.text_content()
        if 'canceled' in title.lower():
            continue
        # Speakers are the comma-separated names on the first line of the
        # paragraph; blank names and names containing '?' are dropped.
        first_line = byline.text_content().strip('\n ').split('\n', 1)[0]
        speakers = [
            name for name in first_line.split(',')
            if name.strip() and '?' not in name
        ]
        abstract = body.text_content().strip()
        talk = Talk(category=category, conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape a schedule table page and store every presentation it lists.

    Fetches ``url``, selects each ``<td>`` whose class contains ``slot`` and,
    for every cell that holds a title link, creates a ``Talk`` of the given
    ``category`` for ``conference`` and hands it to ``db.add_talk``. Cells
    without a title link and titles containing "canceled"
    (case-insensitive) are skipped silently.
    """
    print("Collecting from {}".format(url))
    page = html.fromstring(requests.get(url).text)
    ## Walk the schedule cells and pull out the relevant content
    for cell in page.xpath('//td[contains(@class,"slot")]'):
        link = cell.find('./span[@class="title"]/a')
        if link is None:
            continue
        title = link.text
        # The abstract is read from the link's ``title`` attribute.
        abstract = link.get('title')
        if 'canceled' in title.lower():
            continue
        raw_names = cell.findtext('./span[@class="speaker"]').strip()
        # Speakers are comma-separated when a comma is present, otherwise
        # '&'-separated.
        delimiter = ',' if ',' in raw_names else '&'
        names = raw_names.split(delimiter)
        level = cell.findtext('./span[@class="audience_level"]').strip()
        if level == 'Novice':
            level = 'Beginner'
        talk = Talk(category=category, conference_id=conference.id,
                    title=title)
        data = db.TalkData(names, [], [])
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
db.session.commit() ## Talks ## ~~~~~~ ## ## Keynotes keynotes = ( #(['Kelsey Hightower'], 'Kubernetes for Pythonistas', ['Google'], 'http://pyvideo.org/pycon-us-2017/keynote-kubernetes-for-pythonistas.html', ['voice', 'kubernetes', 'containers']), ) for speaker_names, title, org, url, topics in keynotes: talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id) talk.title = title talk.video_url = url data = db.TalkData(speaker_names, topics, org) db.add_talk(talk, **data._asdict()) ## Tutorials, talks, and posters def add_presentation(url, category): print("Collecting from {}".format(url)) xpath = '//div[contains(@class,"box-content")]/*' entries = html.fromstring(requests.get(url).text).xpath(xpath) first = next(i for i, e in enumerate(entries) if e.tag == 'h2') ## Iterate through and extract the relevant content for i in range(int((len(entries) - first) / 3)): h2, p, div = entries[first + 3 * i:first + 3 * (1 + i)] title = h2.text_content() if 'canceled' in title.lower(): continue speakers = p.text_content().strip('\n ').split('\n', 1)[0].split(',')
## Talks
# - Keynotes
# Each row is (title, speaker, organization); organization may be None.
keynote_rows = (
    ('The virtues of Open Source', 'Mitch Kapor',
     'Open Source Applications Foundation (OSAF)'),
    ('Python State of the Union', 'Guido van Rossum', 'Zope corporation'),
    ('How to argue about typing', 'Bruce Eckel', None),
)
for title, speaker, org in keynote_rows:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    data = db.TalkData([], [], [])
    data.speaker_names.append(speaker)
    # Only record an organization when one is given.
    if org is not None:
        data.organization_names.append(org)
    db.add_talk(talk, **data._asdict())

# - Tutorials
# None? -- True. (PostMortem recommends tutorials for 2005)

# - Regular talks
# An earlier approach read the archived schedule page instead:
#wayback = 'https://web.archive.org/web/20050206212138/'
#url = wayback + 'http://www.python.org:80/pycon/dc2004/schedule.html
#xpath = '//td[@class="body"]/table/tbody/tr'
url = 'https://wiki.python.org/moin/PyConDC2004/TalksByCategory'
xpath = '//div[@id="content"]/*'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
# Discard everything before the first <h1>; next() raises StopIteration
# if the page has none.
header = next(pos for pos, node in enumerate(entries) if node.tag == 'h1')
entries = entries[header:]
speaker_lookup = {}
topic = None