Example #1
def dump_clusters():

    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])

    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title']) for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))

    if args['-cluster'] == 'agg':
        prune = args['-prune'] in ('true', 'True')
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune, int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
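The averaging inside W2V.get_sentence_vector_avg is not shown above; a minimal sketch of sentence vectorization by averaging, assuming a gensim-style model that exposes word vectors through model.wv:

import numpy as np

def sentence_vector_avg(model, sentence, dim=100):
    # Average the vectors of all in-vocabulary tokens; dim is assumed to
    # match the model's vector size.
    vecs = [model.wv[w] for w in sentence.split() if w in model.wv]
    if not vecs:
        return np.zeros(dim)  # no known tokens: fall back to the zero vector
    return np.mean(vecs, axis=0)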
Example #2
class FullscreenWindow:
    def __init__(self):
        self.tk = Tk()
        self.tk.attributes("-fullscreen", True)
        self.tk.configure(background='black')
        self.topFrame = Frame(self.tk, background='black')
        self.bottomFrame = Frame(self.tk, background='black')
        self.topFrame.pack(side=TOP, fill=BOTH, expand=YES)
        self.bottomFrame.pack(side=BOTTOM, fill=BOTH, expand=YES)
        self.state = False
        self.tk.bind("<Return>", self.toggle_fullscreen)
        self.tk.bind("<Escape>", self.end_fullscreen)
        # clock
        self.clock = Clock(self.topFrame)
        self.clock.pack(side=RIGHT, anchor=N, padx=100, pady=60)
        # weather
        self.weather = Weather(self.topFrame)
        self.weather.pack(side=LEFT, anchor=N, pady=60, padx=100)
        # news
        self.news = News(self.bottomFrame)
        self.news.pack(side=LEFT, anchor=S, padx=100, pady=60)
        # goodmorning
        self.goodmorning = Goodmorning(self.topFrame, background='black')
        self.goodmorning.pack(side=BOTTOM, anchor=W)

    def toggle_fullscreen(self, event=None):
        self.state = not self.state  # Just toggling the boolean
        self.tk.attributes("-fullscreen", self.state)
        return "break"

    def end_fullscreen(self, event=None):
        self.state = False
        self.tk.attributes("-fullscreen", self.state)
        return "break"
Example #3
def test_news_perm():
    n1 = News("asdf", 0)
    n2 = News("asdf_23", 0)
    n3 = News("asdf_", 1)
    n4 = News("asdf_23", 1)

    nv1 = NewsVector()
    nv1.add(n1)
    nv1.add(n2)
    nv1.label = n1.label
    nv2 = NewsVector()
    nv2.add(n3)
    nv2.add(n4)
    nv2.label = n3.label

    news_vecs = [[nv1, nv1], [nv2, nv2]] * 10
    num_agents = 2

    X, y, union = helpers.get_feature_vectors(news_vecs, num_agents)
    classifier, y_pred, y_true = ml.train_and_test(X, y, verbose=True)

    test_accuracy = (y_pred == y_true).sum() / sum(map(len, y_pred))
    print(f"Test acc: {test_accuracy}")
    test_stat = correctly_classified
    p_value = permutation_test.blocked_sampled_test(y_pred, y_true, test_stat)
    return p_value
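blocked_sampled_test and correctly_classified are project-specific; as a rough illustration of the underlying idea, an unblocked permutation test of classifier accuracy over flat label arrays might look like this (a sketch, not the project's implementation):

import numpy as np

def permutation_test_accuracy(y_pred, y_true, n_permutations=10000, seed=0):
    # p-value for the observed accuracy under the label-permutation null.
    rng = np.random.default_rng(seed)
    y_pred, y_true = np.asarray(y_pred), np.asarray(y_true)
    observed = (y_pred == y_true).mean()
    count = 0
    for _ in range(n_permutations):
        if (y_pred == rng.permutation(y_true)).mean() >= observed:
            count += 1
    return (count + 1) / (n_permutations + 1)  # add-one smoothing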
Example #4
def index_page():
    # Query top headlines about 'trump' from each outlet via the News helper.
    sources = {
        'ABC': 'abc-news',
        'BBC': 'bbc-news,the-verge',
        'CBS': 'cbs-news',
        'CNN': 'CNN',
        'ESPN': 'espn',
        'FOX': 'fox-news',
        'NBC': 'nbc-news',
        'NYM': 'new-york-magazine',
        'WSJ': 'the-wall-street-journal',
        'TIME': 'Time',
    }
    headlines = {name: News.Hotlines(News, 'trump', source_id, 'en')
                 for name, source_id in sources.items()}
    return render_template('index.html', **headlines)
Example #5
def news_test(companies):
    t = News(
        companies,
        output_root=r'C:\Users\zleirdahl\Desktop\PythonScripts\iex\Data\News\\',
        header_fields=['Date', 'Headline', 'Source', 'URL', 'Summary'],
        file_suffix='news')
    t.run()
Example #6
def comment_page():
    coms = Comments(app.config['dsn'])
    nes = News(app.config['dsn'])
    if request.method == 'GET':
        now = datetime.datetime.now()
        comlist = coms.get_commentlist()
        nelist = nes.get_newlist()
        return render_template('comments.html', CommentList=comlist, NewList=nelist, current_time=now.ctime())
    elif 'comments_to_delete' in request.form:
        id_comments = request.form.getlist('comments_to_delete')
        for id_comment in id_comments:
            coms.delete_comment(id_comment)
        return redirect(url_for('comment_page'))
    elif 'comments_to_add' in request.form:
        id_comments = request.form.getlist('comments_to_add')
        for id_comment in id_comments:
            coms.add_comment(request.form['name'], request.form['article'], id_comment)
        return redirect(url_for('comment_page'))
    elif 'comments_to_update' in request.form:
        coms.update_comment(request.form['id_comment'], request.form['name'], request.form['article'])
        return redirect(url_for('comment_page'))
    elif 'comments_to_search' in request.form:
        searchList = coms.search_comment(request.form['name'])
        now = datetime.datetime.now()
        comlist = coms.get_commentlist()
        nelist = nes.get_newlist()
        return render_template('comments.html', CommentList=comlist, NewList=nelist, SearchList=searchList, current_time=now.ctime())
Example #7
def fetch_penpai_news():
    news_list = []  # list of collected news items
    # Extract the news items from the front page
    index_resp = r.get(penpai_url).text
    index_html = etree.HTML(index_resp)
    news_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a')  # article links
    imgs_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a/img')  # article images
    overviews = index_html.xpath('//div[@class="news_li"]/p')  # article summaries
    times = index_html.xpath('//div[@class="pdtt_trbs"]/span[1]')  # publish times
    origins = index_html.xpath('//div[@class="pdtt_trbs"]/a')  # article sources
    for i in range(0, int(len(news_urls) / 2)):
        news_list.append(
            News(_id=news_urls[i].get('href').split('_')[-1],
                 title=imgs_urls[i].get('alt'),
                 overview=overviews[i].text.replace('\n', '').replace(' ', ''),
                 url=penpai_url + news_urls[i].get('href'),
                 image='http:' + imgs_urls[i].get('src'),
                 publish_time=times[i].text,
                 origin=origins[i].text).to_dict())
    # Extract topCids from the page with a regex
    topCids = ''
    ids = cids_pattern.search(index_resp)
    if ids is not None:
        topCids = ids.group(1)
    # Ajax request parameters, seeded with the extracted topCids
    ajax_params = {
        'nodeids': 25949,
        'topCids': topCids,
    }
    pageidx = 2
    while True:
        ajax_params['pageidx'] = pageidx
        ajax_params['lastTime'] = int(round(time.time() * 1000))
        resp = r.get(penpai_ajax_url,
                     params=ajax_params,
                     headers=penpai_headers)
        resp_content = resp.text
        print("爬取:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    if int(hours_before.group(1)) > 12:
                        return news_list
                    else:
                        news_list.append(
                            News(_id=result[0].split('_')[-1],
                                 title=result[2],
                                 overview=result[3].replace('\n', '').replace(
                                     ' ', ''),
                                 url=penpai_url + result[0],
                                 image='http:' + result[1],
                                 publish_time=result[5],
                                 origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))
Example #8
def testNews():
    # Try to add test url
    fetchWeb.test_parse_url()

    news = News()
    testurl = 'http://www.appledaily.com.tw/realtimenews/article/new/20150822/675760/'
    result = news.loadfromdb(testurl)
    return "Result: " + str(news)
Example #10
    def news_func(self):
        self.speak('Opening News.')
        from news import News
        self.news_win = News()
        self.news_win.show()
        self.speak(
            'Welcome to News.\nThese are the latest international headlines according to BBC News Network.'
        )
Example #11
def test_to_dict():
    sample = News("title", "description", "published", "url", "full_text")
    assert sample.to_dict() == {
        "title": "title",
        "description": "description",
        "url": "url",
        "published": "published",
        "full_text": "full_text",
    }
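One News implementation that would satisfy this test, written as a dataclass (a sketch, not necessarily the project's actual class):

from dataclasses import dataclass, asdict

@dataclass
class News:
    title: str
    description: str
    published: str
    url: str
    full_text: str

    def to_dict(self):
        # asdict walks the dataclass fields into a plain dict
        return asdict(self)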
Example #12
    def prep_news_data(self):
        if not self.news_market_data:
            print('Preparing news and stock data...\n')
            news = News('Resources/articles.db')
            raw = news.db_articles()
            train_raw, test_raw = divide_list_by_ratio(raw)
            # prep_news_articles returns a tuple of (vectors, labels)
            self.train_vecs, self.train_labs = self.prep_news_articles(train_raw, fit=True)
            self.test_vecs, self.test_labs = self.prep_news_articles(test_raw)
            self.news_market_data = True
            self.movie_review_data = False
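divide_list_by_ratio is not shown; a plausible shape for such a helper, assuming an 80/20 train/test split:

def divide_list_by_ratio(items, ratio=0.8):
    # Split a list into two slices (e.g. train/test) at the given ratio.
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]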
Example #13
def parse_single_url(url):
    content = urllib2.urlopen(url).read()
    if "該則即時新聞不存在" in content:
        return False
    else:
        soup = BeautifulSoup(
            content,
            from_encoding='utf-8',
        )
        title = str(soup.find("h1", {"id": "h1"}).string)
        contents = soup.find("p", {"id": "summary"})
        while "</iframe>" in contents.renderContents():
            if contents.iframe.decompose() == None:
                break
        desc_contents = contents.renderContents()
        popularity_data = soup.find("a",
                                    attrs={"class": "function_icon clicked"})
        if popularity_data is None:
            popularity = 0
        else:
            popularity = parse_string_to_popularity(popularity_data.string)
        news_datetime = parse_string_to_datetime(soup.find("time").string)
        news_url = soup.find("meta", {"property": "og:url"})['content']
        news_source = soup.find("meta",
                                {"property": "og:site_name"})['content']
        img_url1 = soup.find("a", attrs={"class": "t1"})
        img_url2 = soup.find("figure", attrs={"class": "lbimg sgimg sglft"})

        if img_url1 is not None:
            img_url = img_url1.img['src']
        elif img_url2 is not None:
            img_url = img_url2.a.img['src']
        else:
            img_url = ""

        logging.debug("news_url: " + str(news_url))
        logging.debug("title: " + str(title))
        logging.debug("content: " + str(desc_contents))
        logging.debug("popularity: " + str(popularity))
        logging.debug("news_datetime: " + str(news_datetime))
        logging.debug("news_first_image_url: " + str(img_url))
        logging.debug("news_source: " + str(news_source))

        news = News(news_url=news_url,
                    title=title,
                    content=desc_contents,
                    popularity=popularity,
                    news_datetime=news_datetime,
                    news_first_image_url=img_url,
                    news_source=news_source)

        logging.info("Add news: " + str(news))
        news.writetodb()
        return True
Example #14
def index():
    news = News()
    page = request.args.get('page')
    if page is None:
        page = 1
    data = news.list(page=int(page), limit=20)
    total = news.count()
    if data is None:
        return jsonResponse(type='404')
    for i, item in enumerate(data):
        data[i]['date'] = time.mktime(item['date'].timetuple())
    return jsonResponse(data=data, extra_data=[{'total': total}])
Example #15
def send_rate(message):
    n = News()
    n.find_news(news_counter)
    inline_markup = telebot.types.InlineKeyboardMarkup()
    itembtnyes = telebot.types.InlineKeyboardButton('Yes',
                                                    callback_data='news_yes')
    itembtnno = telebot.types.InlineKeyboardButton('No',
                                                   callback_data='news_no')
    inline_markup.row(itembtnyes, itembtnno)
    bot.send_message(message.chat.id,
                     f"{n.title}\n{n.url}\n\nAre you interested in this news?",
                     disable_web_page_preview=True,
                     reply_markup=inline_markup)
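The snippet registers two callback buttons but not their handler; with pyTelegramBotAPI, a matching handler could be registered as follows (handler name and reply texts are illustrative):

@bot.callback_query_handler(func=lambda call: call.data in ('news_yes', 'news_no'))
def handle_news_rating(call):
    # Acknowledge the tap so the client stops showing a loading spinner.
    if call.data == 'news_yes':
        bot.answer_callback_query(call.id, 'Glad you liked it!')
    else:
        bot.answer_callback_query(call.id, 'Noted, not interested.')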
Example #16
async def on_message(message):
    # we do not want the bot to reply to itself
    if message.author == client.user:
        return

    if message.content.startswith('!hello'):
        msg = 'Hello {0.author.mention}'.format(message)
        await client.send_message(message.channel, msg)

    if message.content.startswith('!top-headlines'):
        x = News(TOKENS['news_api_token'])
        data = x.topNewsHeadlines()
        print(data)
Example #17
def call_fractory_news():
    lab = tk.Label(text="Fractory news", padx=5, pady=20)
    lab.grid(row=0, column=0, sticky="W")

    nws = News()  # Instantiating News class from news module
    lst = nws.fractory()

    lst_values = list(lst.values())
    lst_keys = list(lst.keys())
    lks = []

    # One clickable headline label per entry; the default argument pins each
    # callback to its own URL and avoids the lambda late-binding trap.
    for i in range(10):
        link = tk.Label(text=f"{i + 1}. {lst_keys[i]}", fg="blue", cursor="hand2")
        link.bind('<Button-1>', lambda j, url=lst_values[i]: open_link(url))
        link.grid(row=i + 1, column=0, sticky="W")
        lks.append(link)
Example #18
def news():
    posts = MyProfile(current_user.username)
    news = News()
    news.print_news(current_user.username)
    if request.method == 'POST':
        news_id = request.form['news_id_delete']
        try:
            mynew = New(news_id, None, None, None, None, None, None, None, True,
                        None)
            mynew.delete_new()
        except Exception:
            flash('Could not delete news item!')
        return redirect(url_for('news'))
    return render_template("news.html", posts=posts, news=news.news_arr)
Example #19
def create_google_news(keyword_input):
    limit = int(input("Enter the number of news articles to search for: "))
    # Dictionary of crawled news data
    news_dict = google_news_clipping(keyword_input, limit)
    # List to hold the News objects
    news_items = []
    # Create one News object per article
    for i in range(len(news_dict['title'])):
        news_item = News(news_dict['title'][i], news_dict['link'][i],
                         news_dict['contents'][i], news_dict['agency'][i],
                         news_dict['date'][i], news_dict['time'][i])
        news_items.append(news_item)
    # Print the crawled articles
    for news_item in news_items:
        news_item.print()
    return news_items
Example #20
def call_fabricator_news():
    lab = tk.Label(text="The Fabricator.com news", padx=5, pady=20)
    lab.grid(row=11, column=0, sticky="W")
    nwsf = News()
    lstf = nwsf.fabricator()  # Calling fabricator method

    lstf_values = list(lstf.values())  # value list from fabricator.com site
    lstf_keys = list(lstf.keys())  # key list from fabricator.com site
    lksf = []

    # Same looped pattern as call_fractory_news, so every entry binds the
    # key and URL at its own index.
    for i in range(10):
        link = tk.Label(text=f"{i + 1}. {lstf_keys[i]}", fg="blue", cursor="hand2")
        link.bind('<Button-1>', lambda j, url=lstf_values[i]: open_link(url))
        link.grid(row=i + 12, column=0, sticky="W")
        lksf.append(link)
Example #21
def main():
    """
    The main method of the nuncium application will initialize the execution of
    the program. Threads will be used to query for user input. Each window has
    its own thread to manage the update of its own interface.
    """

    # UI object: The user interface of the nuncium application.
    ui = UI()

    # Integer: The height will consist of the entire screen and the width will
    #          consist of approximately 1/5 of the screen's width.
    height = curses.LINES
    width = int(curses.COLS / 5)

    # String: The default news category that is displayed on startup.
    category = "Top Stories"

    # Window object: The window that will render the menu interface.
    menu_window = ui.window(height, width)
    ui.display_menu(menu_window, category, color=curses.COLOR_BLUE)

    # Integer: The starting position in the x-coordinate of the news window will
    #          be rendered where the last window left off. The width of the news
    #          window will consist of the remaining free space.
    x = width
    width = curses.COLS - width

    # Window object: The window that will render the news content.
    news_window = ui.window(height, width, x, y=0)

    # News object: The news aggregator of the nuncium application.
    news = News()
    news.fetch_news(ui, news_window)

    ui.cursor(menu_window, x=1, y=1, y2=1, current="Top Stories")

    # Thread object: A thread used for updating the menu and news content.
    menu_thread = Thread(target=update, args=(menu_window,), daemon=True)
    news_thread = Thread(target=update, args=(news_window,), daemon=True)

    menu_thread.start()
    news_thread.start()

    # Wait for the threads to finish working.
    while running:
        pass

    ui.cleanup()
Example #24
def fetch_news(page):
    news_list = []
    resp = r.get(ajax_url,
                 params={
                     'm': 'lists',
                     'a': 'ajaxNews',
                     'cid': 4,
                     'page': page
                 },
                 headers=headers)
    print('Fetching:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        news_item = pq('div.item-news')
        for item in news_item.items():
            a_url = item('div > p > a').attr('href')
            item_main = title_extract_pattern.search(
                item('div.item-main').text())
            if item_main is not None:
                news_list.append(
                    News(_id=a_url.split('/')[-1].replace('.html', ''),
                         url=a_url,
                         title=item_main.group(1),
                         overview=item_main.group(2),
                         publish_time=item('div.item-date').text()).to_dict())
    return news_list
Example #25
def fetch_iheima_news():
    page = 1
    news_list = []
    while True:
        resp = r.get(iheima_url,
                     params={
                         'page': page,
                         'pagesize': 20
                     },
                     headers=iheima_headers)
        print("爬取:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Keep only news published within the last 24 hours (86400 s)
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
            page += 1
Example #26
def run_bot():
    """run_bot is the main function of the program"""
    configpath = get_abs_path_of_filepath(__file__) + "/config/config.ini"
    userspath = get_abs_path_of_filepath(__file__) + "/config/users.ini"
    newspath = get_abs_path_of_filepath(__file__) + "/config/news.ini"
    bot = Bot(configpath, userspath)
    try:
        dispatcher = bot.init_updater()
        news = News(configpath, newspath, userspath, dispatcher)
        bot.init_handler()
        bot.start_bot()
        news.start_thread()
    except KeyboardInterrupt:
        news.stopflag = True
    except Exception as exc:
        Log.LOGGING.error("Unhandled Exception: {}".format(exc))
Example #27
def test_news_from_feed():
    test_feed = {
        "title":
        "American dies of coronavirus in China; five Britons infected in French Alps",
        "summary":
        'A 60-year-old American has died of the new coronavirus, the first confirmed non-Chinese death of the illness, U.S. officials said, as millions of Chinese began returning home after a Lunar New Year break that was extended to try to contain the outbreak.<div class="feedflare">\n<a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?i=OmyoX_6P_ok:xVts69RBOpQ:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?i=OmyoX_6P_ok:xVts69RBOpQ:-BTjWOF_DHI" border="0"></img></a>\n</div><img src="http://feeds.feedburner.com/~r/reuters/topNews/~4/OmyoX_6P_ok" height="1" width="1" alt=""/>',
        "link":
        "http://feeds.reuters.com/~r/reuters/topNews/~3/OmyoX_6P_ok/american-dies-of-coronavirus-in-china-five-britons-infected-in-french-alps-idUSKBN20003J",
        "published": "Sat, 08 Feb 2020 12:22:37 -0500",
    }

    item_from_feed = News.from_feed(test_feed)
    assert (
        item_from_feed.title ==
        "American dies of coronavirus in China; five Britons infected in French Alps"
    )
    assert (
        item_from_feed.description ==
        "A 60-year-old American has died of the new coronavirus, the first confirmed non-Chinese death of the illness, U.S. officials said, as millions of Chinese began returning home after a Lunar New Year break that was extended to try to contain the outbreak."
    )
    assert (
        item_from_feed.url ==
        "http://feeds.reuters.com/~r/reuters/topNews/~3/OmyoX_6P_ok/american-dies-of-coronavirus-in-china-five-britons-infected-in-french-alps-idUSKBN20003J"
    )
    assert item_from_feed.published == datetime.datetime(2020,
                                                         2,
                                                         8,
                                                         17,
                                                         22,
                                                         37,
                                                         tzinfo=pytz.utc)
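A News.from_feed that passes this test could parse the RFC 2822 date and strip the feedburner markup roughly like this (a sketch under assumed field names, not the project's implementation):

import re
from email.utils import parsedate_to_datetime

import pytz

class News:
    def __init__(self, title, description, published, url, full_text=None):
        self.title = title
        self.description = description
        self.published = published
        self.url = url
        self.full_text = full_text

    @classmethod
    def from_feed(cls, entry):
        # Drop the feedflare tracking markup appended to the summary.
        description = re.sub(r'<.*$', '', entry['summary'], flags=re.DOTALL).strip()
        # RFC 2822 date string -> timezone-aware datetime, normalized to UTC.
        published = parsedate_to_datetime(entry['published']).astimezone(pytz.utc)
        return cls(entry['title'], description, published, entry['link'])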
Example #28
    def news(self):
        """Lazily import and cache the News helper on first access."""
        if self._news is None:
            from news import News
            self._news = News(self)
        return self._news
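On Python 3.8+ the same lazy-cache pattern can be expressed with functools.cached_property (an equivalent sketch; the original may target older interpreters):

from functools import cached_property

class Client:
    @cached_property
    def news(self):
        from news import News  # deferred import, evaluated only once
        return News(self)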
Example #29
def fetch_news(category):
    news_list = []
    for i in range(0, 2):
        resp = r.get(data_base_url,
                     params={
                         "cre": "tianyi",
                         "mod": category,
                         "_": int(round(time.time() * 1000)),
                         "offset": 20 * i
                     },
                     headers=headers)
        print('Fetching:', resp.url)
        if resp is not None:
            resp_json = resp.json()
            data = resp_json['data']
            for d in data:
                news_list.append(
                    News(_id=d['uuid'],
                         title=d['title'],
                         overview=d['intro'],
                         image=d['thumb'],
                         publish_time=d['ctime'],
                         origin=d['author'],
                         url=d['url_https']).to_dict())
        time.sleep(random.randint(0, 2))
    return news_list
Example #30
class Newsletter(Frame):
    def __init__(self, parent, *args, **kwargs):
        Frame.__init__(self, parent, bg='black')
        self.title = 'News'
        self.newsHandler = News(parent.currentActiveUser.newsTopic)
        self.newsLabel = Label(self,
                               text=self.title,
                               font=("Helvetica", 28),
                               fg="white",
                               bg="black",
                               justify="left")
        self.newsElementContainer = Frame(self, bg="black")
        self.parent = parent

        self.newsLabel.pack(side=TOP, anchor=W)
        self.newsElementContainer.pack(side=TOP, anchor=E)

    def get_news(self):
        for widget in self.newsElementContainer.winfo_children():
            widget.destroy()

        for newsTitle in self.newsHandler.get_news():
            news_element = NewsElement(self.newsElementContainer,
                                       news_title=newsTitle)
            news_element.pack(side=TOP, anchor=W)
Example #31
def fetch_gd_news():
    news_list = []
    xhs_headers['Host'] = xhs_gd_host
    resp = r.get(xhs_gd_url, headers=xhs_headers)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with now; keep only news from the last 12 hours
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list
Example #32
    def get_domain_news(self, domain, now_time):
        '''
            domain: str, the domain of the source (e.g. 'bbc.com')
            now_time: datetime, the current datetime in ISO form

            return:
                domain_news_result: list<News>, the news from the last query until now
        '''
        overall_count, first_page_content = self.get_domain_news_count(
            domain, now_time)

        domain_result = []
        domain_result.extend(first_page_content)
        page_number = math.ceil(overall_count / 20)  # total number of pages

        if page_number >= 2:
            for page in range(2, min(page_number + 1, 5)):
                page_result = self.get_domain_news_at_page_n(
                    domain, now_time, page)
                domain_result.extend(page_result)
        domain_news_result = []
        for r in domain_result:
            news_r = News(domain, r["title"], r["description"],
                          r["publishedAt"], r["urlToImage"], r["url"])
            domain_news_result.append(news_r)

        return domain_news_result
Example #33
def fetch_web_news_more(start_id):
    global data_list
    headers['Referer'] = web_news_url
    resp = r.get(load_more_base_url,
                 params={
                     'type': 'web_latest_article',
                     'b_id': start_id,
                     'per_page': 30
                 },
                 headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        resp_json = resp.json()
        items = resp_json['data']['items']
        for item in items:
            post = item['post']
            motifs = post['motifs']
            motifs_name = motifs[0]['name'] if motifs is not None else ''
            data_list.append(
                News(_id=str(item['id']),
                     title=post['title'],
                     url=news_detail_base_url + str(post['id']),
                     image=post['cover'],
                     publish_time=post['published_at'],
                     overview=post['summary'],
                     origin=post['user']['name'] + '|' +
                     motifs_name).to_dict())
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(items[-1]['post']['published_at'],
                                  "%Y-%m-%d %H:%M:%S"))) > 86400:
            return None
        else:
            return fetch_web_news_more(items[-1]['id'])
Example #34
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url,
                 params={'page': 2},
                 headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            if text_result:  # findall returns a list, never None
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list
Example #35
def scrap_news_company(comp):
    num = comp.stock
    global count_fail, count_suc
    url = news_url + str(num).zfill(5)
    html = scrap_html(url)
    response_soup = BeautifulSoup(html, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    stamp_now = datetime.now().timestamp()

    if list_node:
        # print("get stock:", num)

        h1 = response_soup.find("h1", class_="tf")
        if h1:
            comp.name = h1.get("title")

        up = response_soup.find("div", class_="div002 up")
        if not up:
            up = response_soup.find("div", class_="div002 down")
        if up:
            spans = up.find_all("span")
            if spans:
                lens = len(spans)
                value =   spans[lens-1].text
                comp.up =value

        list = list_node.find_all("li")

        count_suc += 1
        count_hot = 0
        hot_news = []
        comp.ishot = len(list) > 3

        for li in list:
            if not li.find("a"):
                continue
            if not li.find("div", class_="bar01"):
                continue

            txt = li.find("a").text;
            link = li.find("a").get("href")
            date = li.find("div", class_="bar01").text

            date = date.split(":").pop()

            cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")

            # print("== %s=== %s=====+++" % (txt, cdate))
            stamp_new = cdate.timestamp()

            if stamp_now - stamp_new < 24 * 60 * 60 * 2:
                n = News(txt, date, link)
                hot_news.append(n)

        # print("finished get stock: %s ;hot new:%d" % (num, len(hot_news)))
        return hot_news

    else:
        print("error happened", num)
        count_fail += 1
Example #36
def main():

    level = 0

    # Read program arguments
    for arg in sys.argv[1:]:
        (param, value) = arg.split('=')
        if param == '--level':
            level = int(value)

    path = os.path.dirname(os.path.realpath(__file__))

    with open('{0}/configs/logging.yml'.format(path), 'r') as loggingConf:
        logging.config.dictConfig(yaml.safe_load(loggingConf))
    logger = logging.getLogger(LOGGER)

    logger.info('Program started')

    config = configparser.ConfigParser()
    config.read('{0}/configs/bot.ini'.format(path))

    username = config['Reddit']['username']
    password = config['Reddit']['password']
    user_agent = config['Reddit']['user-agent']
    dry_run = config['Bot'].getboolean('dry-run')

    if dry_run:
        logger.info('Running in dry run mode. Nothing will be committed')

    reddit = Reddit(username, password, user_agent, dry_run)
    history = History('{0}/{1}'.format(path, DATABASE))
    news = News()
    if level == 0:
        level = int(config['Bot']['level'])
    news_items = news.get_news_items(level)
    for item in news_items:
        url = item[0]
        title = item[1]
        degree = item[2]
        if not history.has_link_been_posted(url):
            history.add_link_as_posted(url, dry_run)
            if not reddit.post_link(get_redirect_url(url), title):
                continue
            break

    logger.info('Program done')
Example #37
    def __get_news_from_api(self):
        news = News()
        news_ids_from_database = [n['news_id'] for n in news.list()]
        news_from_api = fetch_data(sectionUrl)['news']
        news_ids_from_api = [n['news_id'] for n in news_from_api]
        fetch_log('fetching')
        if news_ids_from_api[0] != news_ids_from_database[0]:
            fetch_log("detected new data from api")
            matched = [idx for idx, val in enumerate(news_ids_from_api)
                       if val == news_ids_from_database[0]]
            if len(matched) == 0:
                end = len(news_ids_from_api)
            else:
                end = matched[0]
            fetch_log("end index is %s" % end)
            # Walk the new items oldest-first so inserts keep feed order.
            for item in news_from_api[:end][::-1]:
                self.__fetch_news(item)
        else:
            fetch_log('Nothing new')
        return None
Example #38
    def crawl_topnews(self, lang="en"):
        """    Returns a list of News objects representing the top news from GoogleNews
            @param lang='en' the language of the news (optional)
            @returns list of News objects with the top news
        """
        params = {"cf": "all", "ned": "us", "hl": lang}
        url = self.build_url(params)
        feed = feedparser.parse(url)

        for e in feed["entries"]:

            url = urlparse(e["link"])
            clean_url = dict([part.split("=") for part in url.query.split("&")])["url"]

            e["lang"] = lang
            e["link"] = clean_url

            news = News(e)
            news.save()
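The manual query-string split above breaks if a parameter value itself contains '='; the standard library's parse_qs handles that and also URL-decodes the value (the example link is made up):

from urllib.parse import urlparse, parse_qs

link = 'https://news.google.com/news/url?sa=t&url=https%3A%2F%2Fexample.com%2Fstory'
query = parse_qs(urlparse(link).query)
clean_url = query['url'][0]  # percent-decoded automatically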
Example #39
    def __fetch_news(self, news_data):
        fetch_log('fetch news id %d' % news_data['news_id'])
        data = fetch_data(newsUrl + str(news_data['news_id']))
        fetch_log('fetched news id %d' % news_data['news_id'])
        if data is None or news_data['news_id'] != data['id']:
            return None
        data['body'] = parse_news_body(data['body'])
        fetch_log('parsed news body')
        try:
            data['image'] = upload_to_qiniu(data['image'])
        except KeyError:
            data['image'] = 'default-lg.jpg'
        fetch_log('image uploaded')
        data['thumbnail'] = upload_to_qiniu(news_data['thumbnail'])
        fetch_log('thumbnail uploaded')
        data['date'] = datetime.strptime(news_data['date'], '%Y%m%d')
        news = News(news_id=int(data['id']))
        news.save(data)
        fetch_log('news %s saved' % int(data['id']))
        return data['id']
Example #40
def new_page():
    nes = News(app.config['dsn'])
    if request.method == 'GET':
        now = datetime.datetime.now()
        nelist = nes.get_newlist()
        return render_template('news.html', NewList=nelist, current_time=now.ctime())
    elif 'news_to_delete' in request.form:
        id_news = request.form.getlist('news_to_delete')
        for id_new in id_news:
            nes.delete_new(id_new)
        return redirect(url_for('new_page'))
    elif 'news_to_add' in request.form:
        nes.add_new(request.form['title'], request.form['content'], request.form['country'])
        return redirect(url_for('new_page'))
    elif 'news_to_update' in request.form:
        nes.update_new(request.form['id_new'], request.form['title'], request.form['content'])
        return redirect(url_for('new_page'))
    elif 'news_to_search' in request.form:
        searchList = nes.search_new(request.form['name'])
        now = datetime.datetime.now()
        nelist = nes.get_newlist()
        return render_template('news.html', NewList=nelist, SearchList=searchList, current_time=now.ctime())
Example #41
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        #title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)

        #postTime
        author_posttime = soup.find("p", "dateline").text.replace("\n","").lower().replace("\t","").split("/")
        post_time = author_posttime[1].replace("pm", "").replace("am", "").strip()
        
        t_format = "%d %b %Y, %I:%M"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        #author
        author = author_posttime[0]
        news.set_author(author)

        #url
        news.set_url(url)

        #date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        #source
        source = 'elfinancierocr'
        news.set_source(source)

        #content, encoding, id, country, labels
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        #encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
Example #42
def news(id):
    news = News(news_id=id)
    data = news.get()
    if data is None:
        return jsonResponse(type='404')
    return jsonResponse(data=data)
Example #43
def rss():
    news = News()
    data = news.sort()
    response = make_response(render_template('rss.xml', data=data, site_url=site_url))
    response.headers['Content-Type'] = 'application/atom+xml; charset=utf-8'
    return response
Example #44
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = soup.find_all("h1")[0].text
        news.set_title(title)

        # postTime
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)

        # url
        news.set_url(url)

        # date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = "lta_reuters"
        news.set_source(source)

        # content, encoding, id, country, labels
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
Example #45
# -*- coding: utf-8 -*-

import time
import threading
import feedparser
from subprocess import call
from image import Matrix
from news import News
from tetris import Tetris
import joystick

if __name__ == '__main__':
    matrix = Matrix(1, 1)
    (js, queue) = joystick.queue('/dev/input/js0')
    js.daemon = True
    js.start()
    app = News(matrix, queue)
    while True:
        if not app.run():
            if isinstance(app, News):
                app = Tetris(matrix, 0, queue)
            else:
                app = News(matrix, queue)
        time.sleep(1 / 20)