def scrape(html, filter=r".*"):
    soup = BeautifulSoup(html)
    tbl = soup.find_all('tr')
    for tr in tbl[1:]:   # skip the header row
        rec = TalkRecord()
        rec.series = "cornell-scan"
        rec.datetime = scrape_date(tr)
        if not re.match(filter, rec.date()):
            continue
        
        rec.speaker = scrape_speaker(tr)
        rec.title = scrape_title(tr)
        rec.url = scrape_url(tr)
        rec.blurb = scrape_blurb(rec.url)
        rec.write()
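
The filter argument is matched against rec.date() before any further parsing. A minimal usage sketch, assuming rec.date() returns ISO-style "YYYY-MM-DD" strings and that the listing page has already been saved locally (the file name and date pattern are illustrative, not taken from this page):

# Hypothetical usage; "cornell-scan.html" and the "YYYY-MM-DD" date format
# assumed for rec.date() are illustrative assumptions.
with open("cornell-scan.html") as f:
    scrape(f.read(), filter=r"2015-(09|10)")  # keep only Sep/Oct 2015 talks
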
def scrape(html, filter=r".*"):
    soup = BeautifulSoup(html)
    dates = soup.find_all('h4')
    patt1 = re.compile(r"\s*[^:]*:(.*\(.*\))\s*-\s*([^\n]*)")
    for date in dates:
        rec = TalkRecord()
        rec.series = "cornell-cam"
        rec.datetime = scrape_date(date)
        if not re.match(filter, rec.date()):
            continue
        infos = date.find_next_sibling('h5')
        if not infos:
            continue
        m = patt1.match(infos.get_text().encode("utf8"))
        if not m:
            continue
        rec.speaker = m.group(1)
        rec.title = m.group(2)
        rec.url = infos.find('a').get('href')
        rec.blurb = scrape_blurb(rec.url)
        rec.write()
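
The patt1 pattern splits the h5 text into a speaker (with an affiliation in parentheses) followed by the talk title. A small illustration on a made-up input line of the expected "<label>: Speaker (Affiliation) - Title" shape (the label, name and title are invented):

import re

# Hypothetical h5 text; only the shape matters, the content is illustrative.
patt1 = re.compile(r"\s*[^:]*:(.*\(.*\))\s*-\s*([^\n]*)")
m = patt1.match("Seminar: Jane Doe (Cornell) - Fast solvers for elliptic PDEs")
if m:
    print(m.group(1).strip())  # Jane Doe (Cornell)
    print(m.group(2))          # Fast solvers for elliptic PDEs
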
def scrape(html, filter=r".*"):
    soup = BeautifulSoup(html)
    tbl = soup.find('table', class_='listing')
    trs = tbl.find_all('tr')
    for tr in trs[1:]:   # skip the header row
        tds = tr.find_all('td')
        rec = TalkRecord()
        rec.series = "nyu-nasc"
        rec.datetime = scrape_date(tds[0])
        if not re.match(filter, rec.date()):
            continue
        rec.speaker = pack_ws(tds[1].get_text().encode("utf8"))
        rec.title = pack_ws(tds[2].get_text().encode("utf8"))
        link = tds[2].find('a')
        if link:
            rec.url = "http://www.cs.nyu.edu/{0}".format(
                pack_ws(link.get('href')))
        rec.write()
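
pack_ws is used above to normalize cell text but is not defined on this page. A plausible sketch of such a whitespace-collapsing helper (an assumption, not the project's actual implementation):

import re

def pack_ws(s):
    # Collapse runs of whitespace (including newlines left over from the HTML
    # source) into single spaces and trim both ends.
    return re.sub(r"\s+", " ", s).strip()
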
def scrape(html, filter=r".*"):
    soup = BeautifulSoup(html)
    tbl = soup.find('table', class_='data')
    trs = tbl.find_all('tr')
    for j in range(1, len(trs)-1, 2):   # each talk spans two consecutive rows; skip the header
        tds1 = trs[j].find_all('td')    # Date and speaker
        tds2 = trs[j+1].find_all('td')  # Blank and abstract
        rec = TalkRecord()
        rec.series = "mit-amc"
        rec.datetime = scrape_date(tds1[0])
        if len(tds1) < 2:
            continue
        rec.speaker = tds1[1].get_text().encode("utf8")
        if not re.match(filter, rec.date()):
            continue
        rec.title = pack_ws(tds2[0].get_text().encode("utf8"))
        if rec.title != '':
            rec.url = "{0}{1}".format(URL, tds2[0].find('a').get('href'))
            rec.write()
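
scrape_blurb(rec.url) is called in the first two examples but not shown here. A hedged sketch of what such a helper might do, assuming the abstract sits in the first paragraph of the linked talk page and that the requests library is available (both are assumptions):

import requests
from bs4 import BeautifulSoup

def scrape_blurb(url):
    # Fetch the talk page and return a short blurb; taking the first <p> is a
    # guess at where the abstract lives, not the project's actual logic.
    if not url:
        return ""
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    para = soup.find('p')
    return para.get_text().strip() if para else ""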