Exemple #1
0
def scrape(html, filter=r".*"):
    """Scrape talk records from an HTML listing and write each one out.

    Looks for a ``<table class="data">`` in *html*.  The first row is
    skipped; the remaining rows are consumed in pairs: one row holding the
    date and speaker cells, followed by one row holding a blank cell and
    the abstract cell.  For every pair that passes the checks below a
    ``TalkRecord`` is filled in and ``rec.writes()`` is called.

    Args:
        html: Markup handed to ``BeautifulSoup``.
        filter: Regular expression applied with ``match`` (anchored at the
            start) to ``rec.date()``; pairs whose date does not match are
            skipped.  The default matches everything.

    Returns:
        None.  If the page has no ``data`` table, nothing is written.

    Pairs are skipped (not treated as errors) when either row has fewer
    than two ``<td>`` cells or when the packed title is empty.  If the
    abstract cell has no ``<a>`` element, or the anchor has no ``href``,
    ``rec.url`` is not assigned and keeps whatever ``TalkRecord()`` set it
    to -- NOTE(review): confirm ``writes()`` tolerates that.
    """
    soup = BeautifulSoup(html)
    table = soup.find('table', class_='data')
    if table is None:
        # find() returns None when nothing matches; there is no listing.
        return

    rows = table.find_all('tr')
    # Compile once instead of re-resolving the pattern for every row.
    date_pattern = re.compile(filter)

    for j in range(1, len(rows) - 1, 2):
        head_cells = rows[j].find_all('td')      # Date and speaker
        body_cells = rows[j + 1].find_all('td')  # Blank and abstract
        # Validate both rows before indexing into either of them.
        if len(head_cells) < 2 or len(body_cells) < 2:
            continue

        rec = TalkRecord()
        rec.series = "mit-amc"
        rec.datetime = scrape_date(head_cells[0])
        rec.speaker = head_cells[1].get_text().encode("utf8")
        if not date_pattern.match(rec.date()):
            continue

        abstract_cell = body_cells[1]
        rec.title = pack_ws(abstract_cell.get_text().encode("utf8"))
        # NOTE(review): on Python 3 .encode() yields bytes; whether this
        # comparison can ever be true then depends on pack_ws -- confirm.
        if rec.title == '':
            continue

        # Only build the URL when there really is a link target; otherwise
        # the record would get a URL ending in the text "None" (or crash).
        link = abstract_cell.find('a')
        href = link.get('href') if link is not None else None
        if href is not None:
            rec.url = "{0}{1}".format(URL, href)
        rec.writes()