def test_news_perm():
    n1 = News("asdf", 0)
    n2 = News("asdf_23", 0)
    n3 = News("asdf_", 1)
    n4 = News("asdf_23", 1)

    nv1 = NewsVector()
    nv1.add(n1)
    nv1.add(n2)
    nv1.label = n1.label
    nv2 = NewsVector()
    nv2.add(n3)
    nv2.add(n4)
    nv2.label = n3.label

    news_vecs = [[nv1, nv1], [nv2, nv2]] * 10
    num_agents = 2

    X, y, union = helpers.get_feature_vectors(news_vecs, num_agents)
    classifier, y_pred, y_true = ml.train_and_test(X, y, verbose=True)

    test_accuracy = (y_pred == y_true).sum() / sum(map(len, y_pred))
    print(f"Test acc: {test_accuracy}")
    test_stat = correctly_classified
    p_value = permutation_test.blocked_sampled_test(y_pred, y_true, test_stat)
    return p_value
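
A side note on the snippet above: correctly_classified is the project's own test statistic and is not shown here. Purely as an illustration of the kind of callable a blocked permutation test expects, a generic "count of correct predictions" statistic might look like the following sketch (an assumption, not the project's implementation):

import numpy as np

def correctly_classified_example(y_pred, y_true):
    # Hypothetical stand-in: total number of matching labels across all blocks,
    # assuming y_pred and y_true are lists of per-block label arrays.
    return sum(int(np.sum(np.asarray(p) == np.asarray(t)))
               for p, t in zip(y_pred, y_true))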
Example n. 2
def fetch_penpai_news():
    news_list = []  # news list
    # Extract the news data from the front page
    index_resp = r.get(penpai_url).text
    index_html = etree.HTML(index_resp)
    news_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a')  # news links
    imgs_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a/img')  # news images
    overviews = index_html.xpath('//div[@class="news_li"]/p')  # news summaries
    times = index_html.xpath('//div[@class="pdtt_trbs"]/span[1]')  # publish times
    origins = index_html.xpath('//div[@class="pdtt_trbs"]/a')  # news sources
    for i in range(0, int(len(news_urls) / 2)):
        news_list.append(
            News(_id=news_urls[i].get('href').split('_')[-1],
                 title=imgs_urls[i].get('alt'),
                 overview=overviews[i].text.replace('\n', '').replace(' ', ''),
                 url=penpai_url + news_urls[i].get('href'),
                 image='http:' + imgs_urls[i].get('src'),
                 publish_time=times[i].text,
                 origin=origins[i].text).to_dict())
    # Extract topCids from the page with a regex
    topCids = ''
    ids = cids_pattern.search(index_resp)
    if ids is not None:
        topCids = ids.group(1)
    # Ajax request parameters
    ajax_params = {
        'nodeids': 25949,
        'topCids': topCids,
    }
    pageidx = 2
    while True:
        ajax_params['pageidx'] = pageidx
        ajax_params['lastTime'] = int(round(time.time() * 1000))
        resp = r.get(penpai_ajax_url,
                     params=ajax_params,
                     headers=penpai_headers)
        resp_content = resp.text
        print("爬取:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:  # publish time given as "N hours ago"
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    if int(hours_before.group(1)) > 12:
                        return news_list
                    else:
                        news_list.append(
                            News(_id=result[0].split('_')[-1],
                                 title=result[2],
                                 overview=result[3].replace('\n', '').replace(
                                     ' ', ''),
                                 url=penpai_url + result[0],
                                 image='http:' + result[1],
                                 publish_time=result[5],
                                 origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))
Example n. 3
    def get_domain_news(self, domain, now_time):
        '''
            domain: str, the domain of source (e.g. 'bbc.com')
            now_time: Datetime, the current datetime in iso form

            return:
                domain_news_result: list<News> The news from the last query until now
        '''
        overall_count, first_page_content = self.get_domain_news_count(
            domain, now_time)

        domain_result = []
        domain_result.extend(first_page_content)
        page_number = math.ceil(overall_count / 20)  # total number of pages, 20 results per page

        if page_number >= 2:
            for page in range(2, min(page_number + 1, 5)):
                page_result = self.get_domain_news_at_page_n(
                    domain, now_time, page)
                domain_result.extend(page_result)
        domain_news_result = []
        for r in domain_result:
            news_r = News(domain, r["title"], r["description"],
                          r["publishedAt"], r["urlToImage"], r["url"])
            domain_news_result.append(news_r)

        return domain_news_result
Example n. 4
def dump_clusters():

    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])

    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title']) for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', 'redis://localhost:6379/'))

    if args['-cluster'] == 'agg':
        prune = args['-prune'] in ('true', 'True')
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune, int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO: dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
Example n. 5
def storyteller():
    form = ReusableForm(request.form)

    if request.method == "POST":
        if form.validate():
            email = request.form["email"]
            password = str(request.form["password"])
            login = firebase.login(email, password)

            if login == 0:
                news = News()
                news.title = request.form["title"]
                news.message = message_with_signature(request.form["message"],
                                                      email)
                news.url = request.form["url"]
                news.date = time.strftime("%Y-%m-%d")
                news.is_private = False

                firebase.fcm(news, True)
                print(news.message)
                flash("Messaggio inviato con successo")
            elif login == 1:
                flash("Errore: nome utente o password errata")
            elif login == 2:
                flash("Errore: chiave API non definita")
            elif login == 3:
                flash("Errore: account non valido")
        else:
            flash("Compila tutti i campi")

    return render_template("storyteller.html", form=form)
Example n. 6
def news_test(companies):
    t = News(
        companies,
        output_root=r'C:\Users\zleirdahl\Desktop\PythonScripts\iex\Data\News\\',
        header_fields=['Date', 'Headline', 'Source', 'URL', 'Summary'],
        file_suffix='news')
    t.run()
Example n. 7
def crawler_PBOC():

    with open(news_list_indexes_file, "r") as fr:
        with open(out_file, "w") as fw:
            csvwriter = csv.writer(fw)
            csvwriter.writerow(["title", "href", "date", "content"])
            for index_url in fr.readlines():
                index_url = index_url.strip()
                # print(index_url)
                html = urlopen(index_url)
                # print(html)
                bsObj = BeautifulSoup(html, "lxml")
                # print(bsObj)

                news_objs = bsObj.find("div", {"class":"mainw950"})\
                    .find("div", {"opentype":"page"}).find("td", {"colspan":"2"})\
                    .find("div", {"id":"r_con"}).find("div", {"class":"portlet"})\
                    .find("div", {"style":"height:480px"}).find("table").find("td").findAll("table")
                # print(news_objs)
                # return
                for news_obj in news_objs:
                    try:
                        news = News()
                        news.date = news_obj.find("span", {"class": "hui12"}).text
                        news.href = url_domain_pboc + news_obj.find(
                            "a").attrs['href']
                        news.title = news_obj.find("a").text
                        news.content = get_content(news.href)
                        r = [news.title, news.href, news.date, news.content]
                        csvwriter.writerow(r)
                    except Exception:
                        print("except..")
Example n. 8
def crawler_FRB():
    html = urlopen(url_frb_2016)
    bsObj = BeautifulSoup(html, "html.parser")
    events_list_obj = bsObj.find("div", {
        "class": "row eventlist"
    }).find("div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    event_rows_obj = events_list_obj.findAll("div", {"class": "row"})

    # news_list = list()

    with open(base_dir + "csv_frb.csv", "a") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "type", "content"])
        for event_row_obj in event_rows_obj:
            try:
                news = News()
                date_obj = event_row_obj.find(
                    "div", {"class": "col-xs-3 col-md-2 eventlist__time"})
                news.date = date_obj.find("time").text
                event_obj = event_row_obj.find(
                    "div", {"class": "col-xs-9 col-md-10 eventlist__event"})
                news.href = url_domain_frb + event_obj.find("a").attrs['href']
                news.title = event_obj.find("p").find("a").find("em").text
                news.type = event_obj.find("p", {
                    "class": "eventlist__press"
                }).find("em").find("strong").text
                news.content = get_content(news.href)
                r = [news.title, news.href, news.date, news.type, news.content]
                csvwriter.writerow(r)
                # news_list.append(news)
            except Exception:
                print("except..")
Example n. 9
def fetch_news(category):
    news_list = []
    for i in range(0, 2):
        resp = r.get(data_base_url,
                     params={
                         "cre": "tianyi",
                         "mod": category,
                         "_": int(round(time.time() * 1000)),
                         "offset": 20 * i
                     },
                     headers=headers)
        print('爬取:', resp.url)
        if resp is not None:
            resp_json = resp.json()
            data = resp_json['data']
            for d in data:
                news_list.append(
                    News(_id=d['uuid'],
                         title=d['title'],
                         overview=d['intro'],
                         image=d['thumb'],
                         publish_time=d['ctime'],
                         origin=d['author'],
                         url=d['url_https']).to_dict())
        time.sleep(random.randint(0, 2))
    return news_list
Example n. 10
def fetch_gd_news():
    news_list = []
    xhs_headers['Host'] = xhs_gd_host
    resp = r.get(xhs_gd_url, headers=xhs_headers)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with the current time; only keep news from the last 12 hours
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list
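
A side note on the 12-hour cutoff above: the epoch-second arithmetic (time.time() minus time.mktime(time.strptime(...)) compared with 43200) can be written more readably with datetime. The helper below is only an illustrative equivalent, not part of the original project:

from datetime import datetime, timedelta

def within_last_12_hours(publish_time):
    # Parse the scraped "%Y-%m-%d %H:%M:%S" timestamp and compare it with the
    # current local time, mirroring the mktime-based check in fetch_gd_news.
    published = datetime.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
    return datetime.now() - published < timedelta(hours=12)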
Example n. 11
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url,
                 params={'page': 2},
                 headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            if text_result:  # findall returns a (possibly empty) list, never None
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list
Example n. 12
def scrap_news_company(comp):
    num = comp.stock
    global count_fail, count_suc
    url = news_url + str(num).zfill(5)
    html = scrap_html(url)
    response_soup = BeautifulSoup(html, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    stamp_now = datetime.now().timestamp()

    if list_node:
        # print("get stock:", num)

        h1 = response_soup.find("h1", class_="tf")
        if h1:
            comp.name = h1.get("title")

        up = response_soup.find("div", class_="div002 up")
        if not up:
            up = response_soup.find("div", class_="div002 down")
        if up:
            spans = up.find_all("span")
            if spans:
                comp.up = spans[-1].text

        list = list_node.find_all("li")

        count_suc += 1
        count_hot = 0
        hot_news = []
        comp.ishot = len(list) > 3

        for li in list:
            if not li.find("a"):
                continue
            if not li.find("div", class_="bar01"):
                continue

            txt = li.find("a").text;
            link = li.find("a").get("href")
            date = li.find("div", class_="bar01").text

            date = date.split(":").pop()

            cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")

            # print("== %s=== %s=====+++" % (txt, cdate))
            stamp_new = cdate.timestamp()

            if stamp_now - stamp_new < 24 * 60 * 60 * 2:
                n = News(txt, date, link)
                hot_news.append(n)

        # print("finished get stock: %s ;hot new:%d" % (num, len(hot_news)))
        return hot_news

    else:
        print("error happend", num)
        count_fail = count_fail + 1
Example n. 13
    def news(self):
        """Lazily create and cache the News helper for this object."""
        if self._news is None:
            from news import News
            self._news = News(self)
        return self._news
Example n. 14
def fetch_iheima_news():
    page = 1
    news_list = []
    while True:
        resp = r.get(iheima_url,
                     params={
                         'page': page,
                         'pagesize': 20
                     },
                     headers=iheima_headers)
        print("爬取:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Only keep news published within the last 24 hours (86400 s)
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
            page += 1
Example n. 15
def fetch_news(page):
    news_list = []
    resp = r.get(ajax_url,
                 params={
                     'm': 'lists',
                     'a': 'ajaxNews',
                     'cid': 4,
                     'page': page
                 },
                 headers=headers)
    print('爬取:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        news_item = pq('div.item-news')
        for item in news_item.items():
            a_url = item('div > p > a').attr('href')
            item_main = title_extract_pattern.search(
                item('div.item-main').text())
            if item_main is not None:
                news_list.append(
                    News(_id=a_url.split('/')[-1].replace('.html', ''),
                         url=a_url,
                         title=item_main.group(1),
                         overview=item_main.group(2),
                         publish_time=item('div.item-date').text()).to_dict())
    return news_list
Example n. 16
def fetch_web_news_more(start_id):
    global data_list
    headers['Referer'] = web_news_url
    resp = r.get(load_more_base_url,
                 params={
                     'type': 'web_latest_article',
                     'b_id': start_id,
                     'per_page': 30
                 },
                 headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        resp_json = resp.json()
        items = resp_json['data']['items']
        if not items:  # nothing more to page through
            return None
        for item in items:
            post = item['post']
            motifs = post['motifs']
            motifs_name = motifs[0]['name'] if motifs else ''
            data_list.append(
                News(_id=str(item['id']),
                     title=post['title'],
                     url=news_detail_base_url + str(post['id']),
                     image=post['cover'],
                     publish_time=post['published_at'],
                     overview=post['summary'],
                     origin=post['user']['name'] + '|' +
                     motifs_name).to_dict())
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(items[-1]['post']['published_at'],
                                  "%Y-%m-%d %H:%M:%S"))) > 86400:
            return None
        else:
            return fetch_web_news_more(items[-1]['id'])
Example n. 17
def fetch_more_news(min_id):
    news_list = []
    sort_field = ''
    resp = r.get(load_more_url,
                 params={
                     '_render': '',
                     'min_id': min_id,
                     '_': count_time
                 },
                 headers=headers)
    print("爬取:", resp.url)
    if resp is not None:
        data_result = more_data_extract_pattern.search(resp.text)
        if data_result is not None:
            data_json = data_result.group(1)
            data_dict = json.loads(data_json)
            for data in data_dict['data']['list']:
                news_list.append(
                    News(_id=data['id'],
                         title=data['title'],
                         overview=data['brief'],
                         image=data['thumb'],
                         publish_time=data['time'],
                         url=data['url'],
                         origin=data['columnName']).to_dict())
                sort_field = data['sort_field']
    return news_list, sort_field
Example n. 18
def spider2(startDate, endDate):
    startDateArray = startDate.split("-")
    endDateArray = endDate.split("-")
    # Years to query
    years = list(set((startDateArray[0], endDateArray[0])))
    # Start pages
    start_urls = [
        "https://www.ids.ac.uk/news-and-opinion/news/?select-year%5B0%5D={}&hidden-current-page=1&hidden-sort-by=ndate&current-page=1#listing"
        .format(years[0])
    ]
    if len(years) > 1:
        start_urls.append(
            "https://www.ids.ac.uk/news-and-opinion/news/?select-year%5B0%5D={}&hidden-current-page=1&hidden-sort-by=ndate&current-page=1#listing"
            .format(years[1]))
    # Results
    results = []
    for start_url in start_urls:
        driver.get(start_url)
        time.sleep(0.5)
        # Flag: the dates no longer satisfy the requested range
        dateFlag = False
        while True:
            html = etree.HTML(driver.page_source)
            dates = dateConver(
                html.xpath(
                    "//article[@class='c-content-item c-content-item--news c-listing__item']//p[@class='c-content-item__date ts-caption']/text()"
                ))
            links = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//a/@href"
            )
            titles = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//a/text()"
            )
            summarys = html.xpath(
                "//article[@class='c-content-item c-content-item--news c-listing__item']//p[@class='c-content-item__description ts-body ts-body--small']/text()"
            )
            for i in range(len(links)):
                # Date within the requested range
                if checkDateRange(startDate, endDate, dates[i]):
                    news = News(dates[i], links[i], titles[i].strip(),
                                summarys[i].strip(), "")
                    print(news.date)
                    results.append(news)
                elif dates[i] < startDate:
                    dateFlag = True
                    break
            # Stop once the dates fall outside the requested range
            if dateFlag:
                break
            # Next-page DOM element
            try:
                next_page_btn = driver.find_element_by_xpath(
                    "//a[@title='Next page']")
            except Exception:
                # Last page reached
                break
            driver.execute_script("arguments[0].click();", next_page_btn)
            time.sleep(5)
    return results
Example n. 19
def welcome():
    if request.wants_json():
        links = list(navs)  # copy so the shared nav list is not mutated on each request
        links.append(build_link('/', 'self', 'application/json'))
        root = {'version': '0.1', 'title': 'VT Bash', 'links': links}
        return jsonify(root)
    news = News()
    return render_template('index.html', nav=navs, news=news.news)
Example n. 20
def testNews():
    # Try to add test url
    fetchWeb.test_parse_url()

    news = News()
    testurl = 'http://www.appledaily.com.tw/realtimenews/article/new/20150822/675760/'
    result = news.loadfromdb(testurl)
    return "Result: " + str(news)
Example n. 21
    def news_func(self):
        self.speak('Opening News.')
        from news import News
        self.news_win = News()
        self.news_win.show()
        self.speak(
            'Welcome to News.\nThese are the latest international headlines according to BBC News Network.'
        )
Example n. 22
    def _test_get_from_source(self, source, count):
        articles = News().get_from_source(source, count)
        self.assertEqual(count, len(articles), "Result length is correct")

        for article in articles:
            self.assertIsNotNone(article.url, "Article url is not None")
            self.assertIsNotNone(article.title, "Article title is not None")
            self.assertIsNotNone(article.snippet,
                                 "Article snippet is not None")
Example n. 23
def send_news(message):
    news = News()
    news.find_supermain()
    msg = f"<b>Main News on <a href=\"https://www.zakon.kz/\">zakon.kz</a> for {today_modified}</b>\n\n"
    msg += f'<a href="{news.url}">{news.title}</a>\n\n'
    for i in range(4):
        news.find_main(i)
        msg += f"<a href=\"{news.url}\">{news.title}</a>\n\n"
    bot.send_message(message.chat.id, msg, parse_mode='HTML')
Example n. 24
    def _parse(self, file_path: str) -> News:
        with open(file_path) as f:
            lines = [line.replace(' ', '').strip() for line in f if line is not None]
        url = lines[0]
        date_time = lines[1]
        title = lines[2]
        content = ''.join(lines[3:])
        label = file_path.split('/')[1]
        return News(url=url, date_time=date_time, title=title, content=content, label=label)
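
For context, _parse above appears to expect a plain-text file with the URL on line 1, the date/time on line 2, the title on line 3 and the body on the remaining lines, with the label taken from the second path component. The fixture below is hypothetical; the paths and values are not from the original project:

import os

os.makedirs('data/fake', exist_ok=True)  # assumed <root>/<label>/<file> layout
with open('data/fake/0001.txt', 'w') as f:
    f.write('https://example.com/some-article\n')
    f.write('2020-01-01 08:00\n')
    f.write('Example headline\n')
    f.write('First paragraph of the body.\n')
    f.write('Second paragraph of the body.\n')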
Example n. 25
def test_to_dict():
    sample = News("title", "description", "published", "url", "full_text")
    assert sample.to_dict() == {
        "title": "title",
        "description": "description",
        "url": "url",
        "published": "published",
        "full_text": "full_text",
    }
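
None of the examples on this page show the News class itself. A minimal hypothetical sketch that would satisfy this particular test (field names and positional order taken from the test; everything else assumed) could be:

from dataclasses import dataclass, asdict

@dataclass
class News:
    # Five plain string fields, serialized verbatim by to_dict().
    title: str
    description: str
    published: str
    url: str
    full_text: str

    def to_dict(self):
        return asdict(self)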
Example n. 26
def news():

    response = request.json['details']
    news_objects = []
    for news_piece in response:
        news_objects.append(News(news_piece))

    audio_file = text_to_speech(request.json['message'])
    return render_template('news.html', title='News', news=news_objects, audio_file=audio_file)
Example n. 27
def anylisor(resp):
    timecounter.updateprogress()
    if not resp:
        return
    response_soup = BeautifulSoup(resp, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    if list_node:
        stamp_now = datetime.now().timestamp()
        comp = Company(stock="")
        h1 = response_soup.find("h1", class_="tf")
        if h1:
            comp.name = h1.get("title")
            components = h1.text.split(".")
            if len(components) > 0:
                stock_num = components[0]
                comp.stock = stock_num
        else:
            return None

        up = response_soup.find("div", class_="div002 up")
        if not up:
            up = response_soup.find("div", class_="div002 down")
        if up:
            spans = up.find_all("span")
            if spans:
                comp.up = spans[-1].text

        list = list_node.find_all("li")

        count_hot = 0
        hot_news = []
        comp.ishot = len(list) > 3

        for li in list:
            if not li.find("a"):
                continue
            if not li.find("div", class_="bar01"):
                continue

            txt = li.find("a").text
            link = li.find("a").get("href")
            date = li.find("div", class_="bar01").text

            date = date.split(":").pop()

            cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")

            stamp_new = cdate.timestamp()

            if stamp_now - stamp_new < 24 * 60 * 60 * 2:
                n = News(txt, date, link)
                hot_news.append(n)
        comp.news = hot_news
        return comp
Example n. 28
    def prep_news_data(self):
        if not self.news_market_data:
            print('Preparing news and stock data...\n')
            news = News('Resources/articles.db')
            raw = news.db_articles()
            train_raw, test_raw = divide_list_by_ratio(raw)
            # prep_news_articles returns a tuple of (vectors, labels)
            self.train_vecs, self.train_labs = self.prep_news_articles(train_raw, fit=True)
            self.test_vecs, self.test_labs = self.prep_news_articles(test_raw)
            self.news_market_data = True
            self.movie_review_data = False
Example n. 29
    def parse_user(self):
        with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
            for lin in lines:

                lin = lin.strip().split()
                userid, newsid, scan_time, title, create_time = (
                    int(lin[0]), int(lin[1]), lin[2], lin[3], lin[-1])
                news = News(userid, newsid, title, scan_time, [],
                            create_time)
                self.AllNews.append(news)
Example n. 30
def parse_single_url(url):
    content = urllib2.urlopen(url).read()
    if "該則即時新聞不存在" in content:
        return False
    else:
        soup = BeautifulSoup(
            content,
            from_encoding='utf-8',
        )
        title = str(soup.find("h1", {"id": "h1"}).string)
        contents = soup.find("p", {"id": "summary"})
        while "</iframe>" in contents.renderContents():
            if contents.iframe.decompose() == None:
                break
        desc_contents = contents.renderContents()
        popularity_data = soup.find("a",
                                    attrs={"class": "function_icon clicked"})
        if popularity_data is None:
            popularity = 0
        else:
            popularity = parse_string_to_popularity(popularity_data.string)
        news_datetime = parse_string_to_datetime(soup.find("time").string)
        news_url = soup.find("meta", {"property": "og:url"})['content']
        news_source = soup.find("meta",
                                {"property": "og:site_name"})['content']
        img_url1 = soup.find("a", attrs={"class": "t1"})
        img_url2 = soup.find("figure", attrs={"class": "lbimg sgimg sglft"})

        if img_url1 is not None:
            img_url = img_url1.img['src']
        elif img_url2 is not None:
            img_url = img_url2.a.img['src']
        else:
            img_url = ""

        logging.debug("news_url: " + str(news_url))
        logging.debug("title: " + str(title))
        logging.debug("content: " + str(desc_contents))
        logging.debug("popularity: " + str(popularity))
        logging.debug("news_datetime: " + str(news_datetime))
        logging.debug("news_first_image_url: " + str(img_url))
        logging.debug("news_source: " + str(news_source))

        news = News(news_url=news_url,
                    title=title,
                    content=desc_contents,
                    popularity=popularity,
                    news_datetime=news_datetime,
                    news_first_image_url=img_url,
                    news_source=news_source)

        logging.info("Add news: " + str(news))
        news.writetodb()
        return True