Example no. 1
def crawler_FRB():
    """Crawl the FRB 2016 event-list page and append one CSV row per item."""
    html = urlopen(url_frb_2016)
    bsObj = BeautifulSoup(html, "html.parser")
    events_list_obj = bsObj.find("div", {
        "class": "row eventlist"
    }).find("div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    event_rows_obj = events_list_obj.findAll("div", {"class": "row"})

    # news_list = list()

    with open(base_dir + "csv_frb.csv", "a") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "type", "content"])
        for event_row_obj in event_rows_obj:
            try:
                news = News()
                date_obj = event_row_obj.find(
                    "div", {"class": "col-xs-3 col-md-2 eventlist__time"})
                news.date = date_obj.find("time").text
                event_obj = event_row_obj.find(
                    "div", {"class": "col-xs-9 col-md-10 eventlist__event"})
                news.href = url_domain_frb + event_obj.find("a").attrs['href']
                news.title = event_obj.find("p").find("a").find("em").text
                news.type = event_obj.find("p", {
                    "class": "eventlist__press"
                }).find("em").find("strong").text
                news.content = get_content(news.href)
                r = [news.title, news.href, news.date, news.type, news.content]
                csvwriter.writerow(r)
                # news_list.append(news)
            except Exception as e:
                # rows without the expected structure (e.g. spacer rows) are skipped
                print("except..", e)
Example no. 2
def storyteller():
    """Flask view: validate the form, log in against Firebase and push the news item via FCM."""
    form = ReusableForm(request.form)

    if request.method == "POST":
        if form.validate():
            email = request.form["email"]
            password = str(request.form["password"])
            login = firebase.login(email, password)

            if login == 0:
                news = News()
                news.title = request.form["title"]
                news.message = message_with_signature(request.form["message"],
                                                      email)
                news.url = request.form["url"]
                news.date = time.strftime("%Y-%m-%d")
                news.is_private = False

                firebase.fcm(news, True)
                print(news.message)
                flash("Messaggio inviato con successo")
            if login == 1:
                flash("Errore: nome utente o password errata")
            elif login == 2:
                flash("Errore: chiave API non definita")
            elif login == 3:
                flash("Errore: account non valido")
        else:
            flash("Compila tutti i campi")

    return render_template("storyteller.html", form=form)
Example no. 3
def crawler_PBOC():
    """Crawl each PBOC listing page named in news_list_indexes_file and write one CSV row per news item."""

    with open(news_list_indexes_file, "r") as fr:
        with open(out_file, "w") as fw:
            csvwriter = csv.writer(fw)
            csvwriter.writerow(["title", "href", "date", "content"])
            for index_url in fr.readlines():
                index_url = index_url.strip()  # drop the trailing newline before opening the URL
                # print(index_url)
                html = urlopen(index_url)
                # print(html)
                bsObj = BeautifulSoup(html, "lxml")
                # print(bsObj)

                news_objs = bsObj.find("div", {"class":"mainw950"})\
                    .find("div", {"opentype":"page"}).find("td", {"colspan":"2"})\
                    .find("div", {"id":"r_con"}).find("div", {"class":"portlet"})\
                    .find("div", {"style":"height:480px"}).find("table").find("td").findAll("table")
                # print(news_objs)
                # return
                for news_obj in news_objs:
                    try:
                        news = News()
                        news.date = news_obj.find("span", {"class": "hui12"})
                        news.href = url_domain_pboc + news_obj.find(
                            "a").attrs['href']
                        news.title = news_obj.find("a").text
                        news.content = get_content(news.href)
                        r = [news.title, news.href, news.date, news.content]
                        csvwriter.writerow(r)
                    except Exception as e:
                        print("except..", e)
Example no. 4
def getDB():
    """Return a list of lists: one list of News objects per MongoDB collection."""
    news_list = []
    for i in mydb.list_collection_names():
        mycol = mydb.get_collection(i)
        nested_news_list = []
        for item in mycol.find():
            newNewsObject = News()
            newNewsObject.author = itemgetter("author")(item)
            newNewsObject.title = itemgetter("title")(item)
            newNewsObject.description = itemgetter("description")(item)
            newNewsObject.url = itemgetter("url")(item)
            newNewsObject.url_to_image = itemgetter("url_to_image")(item)
            newNewsObject.date_time_of_publishing = itemgetter(
                "date_time_of_publishing")(item)
            newNewsObject.id = itemgetter("id")(item)
            newNewsObject.name = itemgetter("name")(item)
            nested_news_list.append(newNewsObject)
        news_list.append(nested_news_list)
    return news_list
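
# getDB() above assumes a pymongo connection created elsewhere in the module.
# A minimal sketch of that setup (client URI and database name are assumptions):
from operator import itemgetter
from pymongo import MongoClient

myclient = MongoClient("mongodb://localhost:27017/")  # assumed local MongoDB instance
mydb = myclient["news_db"]                            # assumed database name
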
def keyword_search(keyword):
    """Query the Toutiao search API for `keyword` and return the News items not stored yet."""

    source_url_list = select_source_url_returnset()

    # note: no space after "keyword=", so the query parameter is not corrupted
    url = 'http://www.toutiao.com/search_content/?offset=0&format=json&keyword=' + keyword + '&autoload=true&count=200&cur_tab=1'

    toutiao_data = requests.get(url).text

    data = json.loads(toutiao_data)
    items = data['data']

    news_list = []
    link_head = 'http://toutiao.com'

    for n in items:
        if 'title' in n:
            news = News()
            news.title = n['title']
            news.tag = n['tag']
            news.source = n['source']
            news.source_url = link_head + n['source_url']
            # the "Two Sessions" (两会) keyword passed in by the caller
            news.keyword = keyword
            # keywords supplied by Toutiao itself
            news.keywords = n['keywords']

            # skip the item if its source_url is already in the database
            if news.source_url in source_url_list:
                print('数据库已有该记录!')  # "this record is already in the database"
                continue

            print('新添加记录:', news.title)  # "newly added record"
            news_list.append(news)
            # print(news.title, news.source_url, news.source, news.keyword, news.keywords)

    return news_list
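
keyword_search() additionally assumes a select_source_url_returnset() helper that returns the source_url values already stored, plus the requests and json imports. A minimal sketch, where the helper body is only a stand-in for the real database query:

import json
import requests

def select_source_url_returnset():
    """Assumed helper: return the set of source_url values already saved,
    so previously collected items are skipped."""
    return set()

# Example call with the "Two Sessions" keyword mentioned in the comments:
# fresh_items = keyword_search('两会')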