def add_presentation(url, category):
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:  # no audience-level entry on the detail page
            continue
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category, conference_id=conference.id, title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
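# Example of how this helper might be invoked (a hypothetical call; the URL and
# the Talk.TALK category value are taken from the other blocks in this script):
#   add_presentation('https://us.pycon.org/2013/schedule/talks/list/', Talk.TALK)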
def add_presentation_from_table(url, category):
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            print('skipping...')
            continue
        title = a.text
        abstract = a.get('title')
        if 'canceled' not in title.lower():
            if 'Jasmine Hsu' in title:
                # Schedule quirk: this cell apparently lists the speakers in the title span.
                speakers = title.split(',')
                title = 'Fire, bullets, and productivity'
                level = 'Beginner'
            else:
                speakers = td.findtext('./span[@class="speaker"]').strip()
                speakers = speakers.split(',') if ',' in speakers else speakers.split('&')
                speakers = [s for s in speakers if s.strip() and '?' not in s]
                level = td.xpath('./comment()')[0].text.splitlines()[1].strip()
            level = 'Beginner' if level == 'Novice' else level
            talk = Talk(category=category, conference_id=conference.id, title=title)
            data = db.TalkData(speakers, [], [])
            talk.abstract = abstract[:10000]
            talk.level = level
            db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)

    ## Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            continue
        title = a.text
        abstract = a.get('title')
        if 'canceled' not in title.lower():
            speakers = td.findtext('./span[@class="speaker"]').strip()
            speakers = speakers.split(',') if ',' in speakers else speakers.split('&')
            level = td.findtext('./span[@class="audience_level"]').strip()
            level = 'Beginner' if level == 'Novice' else level
            talk = Talk(category=category, conference_id=conference.id, title=title)
            data = db.TalkData(speakers, [], [])
            talk.abstract = abstract[:10000]
            talk.level = level
            db.add_talk(talk, **data._asdict())
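# A minimal sketch of the db.TalkData container that the blocks above and below
# hand to db.add_talk(). The field names come from how `data` is used later in
# this script (speaker_names / topic_names / organization_names); the namedtuple
# form and the field order are assumptions, not the db module's real definition.
from collections import namedtuple

TalkData = namedtuple('TalkData', ['speaker_names', 'topic_names', 'organization_names'])

# e.g. db.TalkData(['Jane Doe'], ['Testing'], [])._asdict() would yield the
# keyword arguments that db.add_talk() consumes alongside the Talk instance.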
url = wayback + 'http://us.pycon.org:80/2009/conference/talks/'
#xpath = '//div[@class="proposal-list-summary"]/*'
xpath = '//*[contains(@class, "proposal_list_summary")]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    rows = e.xpath('./*[self::h2 or self::span or self::div]')
    talk = Talk(category=Talk.TALK, conference_id=conference.id)
    data = db.TalkData([], [], [])
    talk.title = rows[0].text_content().split('\n')[1].strip()

    i = 1  # skip talk number (i=0)
    while i < len(rows):
        # people
        txt = rows[i].text_content().strip(';')
        if duration.match(txt):
            break
        else:
            data.speaker_names.append(txt.split('(')[0].split(' bio')[0])
            data.organization_names.extend(re.findall(r'(?<=\()[^\)]+(?=\))', txt))
        i += 1
    talk.level = txt.strip().split()[-1].lower()

    # keywords
    i += 2
    txt = rows[i].text_content().strip()
    data.topic_names.extend(txt.split(','))

    # abstract
    i += 1
    talk.abstract = rows[i].text_content().strip()
    db.add_talk(talk, **data._asdict())
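# The block above (and the similar one further down) relies on a module-level
# `duration` regex to spot the "NN min" row that ends the speaker list. A
# plausible sketch; the exact pattern in the original script is an assumption:
import re

duration = re.compile(r'\d+\s*min')  # matches e.g. "30 min" or "45 minutes"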
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('tutorials')
print(url)

## Iterate through and extract the relevant content
for e in entries:
    a = e.find('./h3/a')
    title = a.text
    root = html.fromstring(requests.get('https://us.pycon.org' + a.get('href')).text)
    abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
    speakers = root.xpath('//h4/a/text()')
    level, category = root.xpath('//dl/dd/text()')
    level = 'Beginner' if level == 'Novice' else level
    talk = Talk(category=Talk.TUTORIAL, conference_id=conference.id, title=title)
    data = db.TalkData(speakers, [category], [])
    talk.abstract = abstract[:10000]
    talk.level = level
    db.add_talk(talk, **data._asdict())

url = 'https://us.pycon.org/2013/schedule/talks/list/'
xpath = '//div[contains(@class,"presentation")]/h3/a'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)

# Iterate through and extract the relevant content
for a in entries:
    title = a.text
    if 'canceled' in title.lower():
        continue
    root = html.fromstring(requests.get('https://us.pycon.org' + a.get('href')).text)
    speakers = root.xpath('//h4/a/text()')
wayback = 'https://web.archive.org/web/20070213073856/'
url = wayback + 'http://us.pycon.org:80/apps07/talks/'
xpath = '//*[contains(@class, "proposal_list_summary")]/*[not(self::br)]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# Start with an empty talk so the first <h2> has something to check against.
talk = Talk(category=Talk.TALK, conference_id=conference.id)
data = db.TalkData([], [], [])
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one
            db.add_talk(talk, **data._asdict())
            talk = Talk(category=Talk.TALK, conference_id=conference.id)
            data = db.TalkData([], [], [])
        talk.title = e.text_content().split('.', 1)[-1].strip()
    elif e.tag == 'div':
        talk.abstract = e.text_content().strip()
    else:
        # span...
        tc = e.text_content()
        if tc.endswith('audio and materials)'):
            talk.level = tc.split()[1]
        elif tc.startswith('categories'):
            data.topic_names.extend(tc.split(':')[-1].split(','))
        else:
            # Speaker names
            speaker = tc.strip('; ').split('(', 1)[0]
            data.speaker_names.extend(separators.split(speaker))
            data.organization_names.extend(org_matcher.findall(tc))

# don't forget the last one..
if talk.title is not None:
    db.add_talk(talk, **data._asdict())
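# The 2007 block above assumes two module-level regexes that are not shown in
# this excerpt: `separators`, used to split a string of speaker names, and
# `org_matcher`, used to pull affiliations out of parentheses. Plausible
# sketches (assumptions, not the script's actual definitions):
import re

separators = re.compile(r'\s*(?:,|&|\band\b)\s*')  # "A, B & C" -> ['A', 'B', 'C']
org_matcher = re.compile(r'(?<=\()[^)]+(?=\))')    # text inside (...) as the organization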
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    rows = e.xpath('./*[self::h2 or self::span or self::div]')
    talk = Talk(category=Talk.TALK, conference_id=conference.id)
    data = db.TalkData([], [], [])
    talk.title = rows[0].text_content().split('\n')[1].strip()

    i = 1  # skip talk number (i=0)
    while i < len(rows):
        # people
        txt = rows[i].text_content().strip(';')
        if duration.match(txt):
            break
        else:
            data.speaker_names.append(txt.split('(')[0].split(' bio')[0])
            data.organization_names.extend(re.findall(r'(?<=\()[^\)]+(?=\))', txt))
        i += 1
    talk.level = duration.sub('', txt).lower()

    # keywords
    i += 1
    txt = rows[i].text_content().strip()
    data.topic_names.extend(txt.split(','))

    # abstract
    i += 1
    talk.abstract = rows[i].text_content().strip()
    db.add_talk(talk, **data._asdict())
    print(talk)
    print(data)