def scrape(html, filter=r".*"):
    """Turn the table rows of *html* into "cornell-scan" talk records.

    Every ``<tr>`` after the first one (presumably a header row -- confirm
    against the page layout) is converted into a ``TalkRecord`` and written
    out with ``TalkRecord.write()``.

    Args:
        html: Markup handed to ``BeautifulSoup``.
        filter: Regular expression applied with ``re.match`` to the value
            returned by the record's ``date()``; rows whose date does not
            match are skipped.

    Returns:
        None. Results are emitted through ``TalkRecord.write()``.
    """
    rows = BeautifulSoup(html).find_all('tr')
    for row in rows[1:]:
        talk = TalkRecord()
        talk.series = "cornell-scan"
        talk.datetime = scrape_date(row)
        # The date is filled in first so that filtered-out rows never reach
        # the remaining scrape_* calls.
        if re.match(filter, talk.date()) is None:
            continue
        talk.speaker = scrape_speaker(row)
        talk.title = scrape_title(row)
        talk.url = scrape_url(row)
        talk.blurb = scrape_blurb(talk.url)
        talk.write()
def scrape(html, filter=r".*"):
    """Turn the ``<h4>``/``<h5>`` pairs of *html* into "cornell-cam" records.

    Each ``<h4>`` is passed to ``scrape_date``; the next sibling ``<h5>``
    supplies the speaker, the title and the link of the talk. Entries are
    skipped (nothing is written) when the date does not match *filter*, when
    there is no following ``<h5>``, when its text does not match the expected
    "<label>: <speaker (...)> - <title>" layout, or when it has no ``<a>``.

    Args:
        html: Markup handed to ``BeautifulSoup``.
        filter: Regular expression applied with ``re.match`` to the value
            returned by the record's ``date()``.

    Returns:
        None. Results are emitted through ``TalkRecord.write()``.
    """
    soup = BeautifulSoup(html)
    dates = soup.find_all('h4')
    # Raw string: "\s" and "\(" are not valid string escapes, so the non-raw
    # form triggers invalid-escape warnings on recent Python versions.
    # Group 1 is the text after the first colon up to a closing parenthesis,
    # group 2 is the rest of the line after the " - " separator.
    patt1 = re.compile(r"\s*[^:]*:(.*\(.*\))\s*-\s*([^\n]*)")
    for date in dates:
        rec = TalkRecord()
        rec.series = "cornell-cam"
        rec.datetime = scrape_date(date)
        if not re.match(filter, rec.date()):
            continue
        infos = date.find_next_sibling('h5')
        if infos is None:
            continue
        # NOTE(review): on Python 3 `.encode` yields bytes, which a str
        # pattern cannot match (TypeError). Kept unchanged because dropping it
        # would change the type of speaker/title under Python 2 -- confirm the
        # target interpreter before removing.
        m = patt1.match(infos.get_text().encode("utf8"))
        if not m:
            continue
        link = infos.find('a')
        if link is None:
            # No anchor in the <h5>: treat it like the other incomplete
            # entries instead of failing on `None.get`.
            continue
        rec.speaker = m.group(1)
        rec.title = m.group(2)
        rec.url = link.get('href')
        rec.blurb = scrape_blurb(rec.url)
        rec.write()