def scrape(html, filter=r".*"):
    """Scrape talk records from the Cornell SCAN seminar listing.

    Parameters:
        html: raw HTML of the listing page; one talk per <tr> row,
            with the first row assumed to be a table header.
        filter: regex pattern matched against each record's date string
            (``rec.date()``); non-matching talks are skipped.  The name
            shadows the builtin ``filter`` but is kept so existing
            keyword callers (``scrape(html, filter=...)``) still work.

    Side effects: writes every matching record via ``rec.write()``.
    Returns None.
    """
    soup = BeautifulSoup(html)
    # Compile the date filter once instead of re-matching the raw
    # pattern string on every row.
    date_filter = re.compile(filter)
    for tr in soup.find_all('tr')[1:]:  # [1:] skips the header row
        rec = TalkRecord()
        rec.series = "cornell-scan"
        rec.datetime = scrape_date(tr)
        if not date_filter.match(rec.date()):
            continue
        rec.speaker = scrape_speaker(tr)
        rec.title = scrape_title(tr)
        rec.url = scrape_url(tr)
        rec.blurb = scrape_blurb(rec.url)
        rec.write()
def scrape(html, filter=r".*"):
    """Scrape talk records from the Cornell CAM colloquium page.

    Each talk appears as an <h4> date heading followed by an <h5>
    sibling of the shape ``"...: Speaker (Affiliation) - Title"``.

    Parameters:
        html: raw HTML of the colloquium page.
        filter: regex pattern matched against each record's date string;
            non-matching talks are skipped.  Name shadows the builtin
            ``filter`` but is kept for caller compatibility.

    Side effects: writes every matching record via ``rec.write()``.
    Returns None.
    """
    soup = BeautifulSoup(html)
    # Compile the date filter once, outside the loop.
    date_filter = re.compile(filter)
    # "prefix: Speaker (Affil) - Title" -> group(1) = speaker w/ affiliation,
    # group(2) = title.  Raw string avoids the invalid-escape warning the
    # original plain string produced for "\s" (value is identical).
    patt1 = re.compile(r"\s*[^:]*:(.*\(.*\))\s*-\s*([^\n]*)")
    for date in soup.find_all('h4'):
        rec = TalkRecord()
        rec.series = "cornell-cam"
        rec.datetime = scrape_date(date)
        if not date_filter.match(rec.date()):
            continue
        infos = date.find_next_sibling('h5')
        if not infos:
            continue
        # NOTE(review): matching encoded bytes against a str pattern only
        # works on Python 2 -- confirm target interpreter before porting.
        m = patt1.match(infos.get_text().encode("utf8"))
        if not m:
            continue
        rec.speaker = m.group(1)
        rec.title = m.group(2)
        rec.url = infos.find('a').get('href')
        rec.blurb = scrape_blurb(rec.url)
        rec.write()
def scrape(html, filter=r".*"):
    """Scrape talk records from the NYU CS seminar listing table.

    Talks live in a <table class="listing">, one talk per row:
    column 0 = date, column 1 = speaker, column 2 = title (optionally
    containing a relative link to the talk page).

    Parameters:
        html: raw HTML of the listing page.
        filter: regex pattern matched against each record's date string;
            non-matching talks are skipped.  Name shadows the builtin
            ``filter`` but is kept for caller compatibility.

    Side effects: writes every matching record via ``rec.write()``.
    Returns None.
    """
    soup = BeautifulSoup(html)
    tbl = soup.find('table', class_='listing')
    if tbl is None:
        # Layout changed or empty page: nothing to scrape.  The original
        # raised AttributeError on tbl.find_all here.
        return
    # Compile the date filter once, outside the loop.
    date_filter = re.compile(filter)
    for tr in tbl.find_all('tr')[1:]:  # [1:] skips the header row
        tds = tr.find_all('td')
        rec = TalkRecord()
        rec.series = "nyu-nasc"
        rec.datetime = scrape_date(tds[0])
        if not date_filter.match(rec.date()):
            continue
        rec.speaker = pack_ws(tds[1].get_text().encode("utf8"))
        rec.title = pack_ws(tds[2].get_text().encode("utf8"))
        link = tds[2].find('a')
        if link:
            # Site links are relative; anchor them to the NYU CS host.
            rec.url = "http://www.cs.nyu.edu/{0}".format(
                pack_ws(link.get('href')))
        rec.write()
def scrape(html, filter=r".*"):
    """Scrape talk records from the MIT applied-math colloquium table.

    Talks live in a <table class="data"> where rows come in pairs after
    a header row: row j holds (date, speaker) and row j+1 holds
    (blank, title/abstract with an optional link).

    Parameters:
        html: raw HTML of the colloquium page.
        filter: regex pattern matched against each record's date string;
            non-matching talks are skipped.  Name shadows the builtin
            ``filter`` but is kept for caller compatibility.

    Side effects: writes every matching record via ``rec.write()``.
    Returns None.
    """
    soup = BeautifulSoup(html)
    tbl = soup.find('table', class_='data')
    # Compile the date filter once, outside the loop.
    date_filter = re.compile(filter)
    trs = tbl.find_all('tr')
    # Step 2 over pairs; range stops at len-2 so trs[j+1] always exists.
    for j in range(1, len(trs) - 1, 2):
        tds1 = trs[j].find_all('td')      # date and speaker
        tds2 = trs[j + 1].find_all('td')  # blank and abstract
        # Guard BEFORE indexing: the original read tds1[0] first and
        # raised IndexError on a malformed (empty) row; it also never
        # checked that tds2 had any cells at all.
        if len(tds1) < 2 or not tds2:
            continue
        rec = TalkRecord()
        rec.series = "mit-amc"
        rec.datetime = scrape_date(tds1[0])
        rec.speaker = tds1[1].get_text().encode("utf8")
        if not date_filter.match(rec.date()):
            continue
        rec.title = pack_ws(tds2[0].get_text().encode("utf8"))
        if rec.title != '':
            link = tds2[0].find('a')
            # A non-empty title may still lack a link; the original
            # crashed with AttributeError on .get in that case.
            if link is not None:
                rec.url = "{0}{1}".format(URL, link.get('href'))
        rec.write()