Example #1
import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup


# url_frb_2016, url_domain_frb, base_dir, News, and get_content are assumed
# to be defined elsewhere in the project (a sketch of News and get_content
# follows this example).
def crawler_FRB():
    # Fetch the 2016 press-release index and drill down to the event list.
    html = urlopen(url_frb_2016)
    bsObj = BeautifulSoup(html, "html.parser")
    events_list_obj = bsObj.find("div", {
        "class": "row eventlist"
    }).find("div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    event_rows_obj = events_list_obj.findAll("div", {"class": "row"})

    # newline="" keeps csv.writer from emitting blank rows on Windows.
    with open(base_dir + "csv_frb.csv", "a", newline="") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "type", "content"])
        for event_row_obj in event_rows_obj:
            try:
                news = News()
                date_obj = event_row_obj.find(
                    "div", {"class": "col-xs-3 col-md-2 eventlist__time"})
                news.date = date_obj.find("time").text
                event_obj = event_row_obj.find(
                    "div", {"class": "col-xs-9 col-md-10 eventlist__event"})
                news.href = url_domain_frb + event_obj.find("a").attrs['href']
                news.title = event_obj.find("p").find("a").find("em").text
                news.type = event_obj.find("p", {
                    "class": "eventlist__press"
                }).find("em").find("strong").text
                news.content = get_content(news.href)
                r = [news.title, news.href, news.date, news.type, news.content]
                csvwriter.writerow(r)
            except Exception as e:
                # Rows that don't match the expected markup are skipped.
                print("skipping row:", e)
Example #2
def crawler_PBOC():
    # news_list_indexes_file is expected to hold one listing-page URL per line.
    with open(news_list_indexes_file, "r") as fr, \
            open(out_file, "w", newline="") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "content"])
        for index_url in fr:
            # strip() drops the trailing newline, which would break urlopen.
            html = urlopen(index_url.strip())
            bsObj = BeautifulSoup(html, "lxml")  # requires the lxml package

            # Walk the deeply nested layout tables down to the per-article
            # <table> blocks.
            news_objs = bsObj.find("div", {"class": "mainw950"})\
                .find("div", {"opentype": "page"}).find("td", {"colspan": "2"})\
                .find("div", {"id": "r_con"}).find("div", {"class": "portlet"})\
                .find("div", {"style": "height:480px"}).find("table")\
                .find("td").findAll("table")
            for news_obj in news_objs:
                try:
                    news = News()
                    # .text extracts the date string; without it the Tag
                    # object itself would be written to the CSV.
                    news.date = news_obj.find("span", {"class": "hui12"}).text
                    news.href = url_domain_pboc + news_obj.find(
                        "a").attrs['href']
                    news.title = news_obj.find("a").text
                    news.content = get_content(news.href)
                    r = [news.title, news.href, news.date, news.content]
                    csvwriter.writerow(r)
                except Exception as e:
                    # Skip article blocks that don't match the expected layout.
                    print("skipping item:", e)