def dump_clusters():
    """Train word2vec on article titles, cluster the vectors, and dump the
    clusters to redis (k-means/agg) or print them (ANN).

    BUG FIX: the ANN branch referenced a misspelled module name `utilties`,
    which raised NameError whenever '-cluster' was not 'agg'.
    """
    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])
    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title'])
                    for article in articles]
    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''
    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL', "redis://localhost:6379/"))
    if args['-cluster'] == 'agg':
        # The prune flag arrives as a string; accept both capitalizations.
        prune = args['-prune'] in ('true', 'True')
        utilities.redis_kmeans_clusters(cluster_obj, articles, prune,
                                        int(args['-limit']), r_conn)
        print("redis dump complete")
    else:
        # TODO dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
class FullscreenWindow:
    """Fullscreen Tk root window hosting the clock, weather, news and
    greeting widgets of a smart-mirror style display."""

    def __init__(self):
        self.tk = Tk()
        self.tk.attributes("-fullscreen", True)
        self.tk.configure(background='black')
        # Two stacked frames: widgets anchor to either the top or bottom edge.
        self.topFrame = Frame(self.tk, background='black')
        self.bottomFrame = Frame(self.tk, background='black')
        self.topFrame.pack(side=TOP, fill=BOTH, expand=YES)
        self.bottomFrame.pack(side=BOTTOM, fill=BOTH, expand=YES)
        self.state = False  # current fullscreen flag, flipped by toggle_fullscreen
        self.tk.bind("<Return>", self.toggle_fullscreen)
        self.tk.bind("<Escape>", self.end_fullscreen)
        # clock
        self.clock = Clock(self.topFrame)
        self.clock.pack(side=RIGHT, anchor=N, padx=100, pady=60)
        # weather
        self.weather = Weather(self.topFrame)
        self.weather.pack(side=LEFT, anchor=N, pady=60, padx=100)
        # news
        self.news = News(self.bottomFrame)
        self.news.pack(side=LEFT, anchor=S, padx=100, pady=60)
        # goodmorning
        # NOTE(review): "Goodmoring" looks like a typo for "Goodmorning" —
        # confirm nothing external references the attribute before renaming.
        self.Goodmoring = Goodmorning(self.topFrame, background='black')
        self.Goodmoring.pack(side=BOTTOM, anchor=W)

    def toggle_fullscreen(self, event=None):
        """Flip fullscreen on/off (bound to <Return>)."""
        self.state = not self.state  # Just toggling the boolean
        self.tk.attributes("-fullscreen", self.state)
        return "break"

    def end_fullscreen(self, event=None):
        """Force-exit fullscreen (bound to <Escape>)."""
        self.state = False
        self.tk.attributes("-fullscreen", self.state)
        return "break"
def test_news_perm():
    """Blocked permutation test on agent news classification.

    Builds two tiny labelled NewsVector pairs, replicates them into 20
    blocks, trains/tests the classifier, and returns the p-value of a
    blocked permutation test over the predictions.
    """
    n1 = News("asdf", 0)
    n2 = News("asdf_23", 0)
    n3 = News("asdf_", 1)
    n4 = News("asdf_23", 1)
    nv1 = NewsVector()
    nv1.add(n1)
    nv1.add(n2)
    nv1.label = n1.label
    nv2 = NewsVector()
    nv2.add(n3)
    nv2.add(n4)
    nv2.label = n3.label
    # Each block holds the same vector twice; * 10 yields 20 blocks total.
    news_vecs = [[nv1, nv1], [nv2, nv2]] * 10
    num_agents = 2
    X, y, union = helpers.get_feature_vectors(news_vecs, num_agents)
    classifier, y_pred, y_true = ml.train_and_test(X, y, verbose=True)
    # NOTE(review): assumes y_pred/y_true support elementwise `==` (numpy-
    # like) and y_pred is a sequence of per-block predictions — confirm.
    test_accuracy = (y_pred == y_true).sum() / sum(map(len, y_pred))
    print(f"Test acc: {test_accuracy}")
    test_stat = correctly_classified
    p_value = permutation_test.blocked_sampled_test(y_pred, y_true, test_stat)
    return p_value
def index_page():
    """Render the index page with 'trump' headlines from ten outlets.

    The ten near-identical per-outlet calls are collapsed into one loop
    over a name -> source-id table; each result is passed to the template
    under the outlet's short name, exactly as before.
    """
    #form=search_item()
    #t=form.keyword.data
    #print(t,"hellp")
    # Template variable name -> NewsAPI source id(s).
    sources = {
        'ABC': 'abc-news',
        'BBC': 'bbc-news,the-verge',
        'CBS': 'cbs-news',
        'CNN': 'CNN',
        'ESPN': 'espn',
        'FOX': 'fox-news',
        'NBC': 'nbc-news',
        'NYM': 'new-york-magazine',
        'WSJ': 'the-wall-street-journal',
        'TIME': 'Time',
    }
    # NOTE(review): News.Hotlines is called unbound with the class itself
    # passed as `self`, mirroring the original code — confirm Hotlines is
    # not meant to be a @staticmethod/@classmethod.
    headlines = {name: News.Hotlines(News, 'trump', source_id, 'en')
                 for name, source_id in sources.items()}
    return render_template('index.html', **headlines)
def news_test(companies):
    """Run a News export job for *companies*, writing rows to the news dir."""
    output_dir = r'C:\Users\zleirdahl\Desktop\PythonScripts\iex\Data\News\\'
    columns = ['Date', 'Headline', 'Source', 'URL', 'Summary']
    job = News(
        companies,
        output_root=output_dir,
        header_fields=columns,
        file_suffix='news',
    )
    job.run()
def comment_page():
    """Flask view: list, add, delete, update and search comments.

    GET renders the full comment/news lists; POST branches on which form
    field is present. NOTE(review): a POST with none of the expected
    fields falls through and implicitly returns None (HTTP 500 in Flask)
    — confirm that path is unreachable from the templates.
    """
    coms = Comments(app.config['dsn'])
    nes = News(app.config['dsn'])
    if request.method == 'GET':
        now = datetime.datetime.now()
        comlist = coms.get_commentlist()
        nelist = nes.get_newlist()
        return render_template('comments.html', CommentList = comlist, NewList = nelist, current_time=now.ctime())
    elif 'comments_to_delete' in request.form:
        # Bulk delete: one id per checked checkbox.
        id_comments = request.form.getlist('comments_to_delete')
        for id_comment in id_comments:
            coms.delete_comment(id_comment)
        return redirect(url_for('comment_page'))
    elif 'comments_to_add' in request.form:
        id_comments = request.form.getlist('comments_to_add')
        for id_comment in id_comments:
            coms.add_comment(request.form['name'],request.form['article'],id_comment)
        return redirect(url_for('comment_page'))
    elif 'comments_to_update' in request.form:
        coms.update_comment(request.form['id_comment'], request.form['name'],request.form['article'])
        return redirect(url_for('comment_page'))
    elif 'comments_to_search' in request.form:
        searchList = coms.search_comment(request.form['name']);
        now = datetime.datetime.now()
        comlist = coms.get_commentlist()
        nelist = nes.get_newlist()
        return render_template('comments.html', CommentList = comlist, NewList = nelist, SearchList = searchList, current_time=now.ctime())
def fetch_penpai_news():
    """Crawl The Paper (澎湃新闻): index page first, then Ajax pages until an
    item older than 12 hours appears. Returns a list of news dicts.

    BUG FIX: the topCids extraction tested `topCids is not None`, which is
    always true (topCids starts as ''); the regex match object `ids` is
    what must be checked before calling .group().
    """
    news_list = []  # collected news dicts
    # Extract the news items present on the index page.
    index_resp = r.get(penpai_url).text
    index_html = etree.HTML(index_resp)
    news_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a')  # article links
    imgs_urls = index_html.xpath(
        '//div[@class="news_li"]/div[@class="news_tu"]/a/img')  # cover images
    overviews = index_html.xpath('//div[@class="news_li"]/p')  # summaries
    times = index_html.xpath('//div[@class="pdtt_trbs"]/span[1]')  # publish times
    origins = index_html.xpath('//div[@class="pdtt_trbs"]/a')  # sources
    for i in range(0, int(len(news_urls) / 2)):
        news_list.append(
            News(_id=news_urls[i].get('href').split('_')[-1],
                 title=imgs_urls[i].get('alt'),
                 overview=overviews[i].text.replace('\n', '').replace(' ', ''),
                 url=penpai_url + news_urls[i].get('href'),
                 image='http:' + imgs_urls[i].get('src'),
                 publish_time=times[i].text,
                 origin=origins[i].text).to_dict())
    # Extract topCids with a regex.
    topCids = ''
    ids = cids_pattern.search(index_resp)
    if ids is not None:  # BUG FIX: was `topCids is not None` (always true)
        topCids = ids.group(1)
    # Ajax request parameters.
    # NOTE(review): topCids is extracted above but the request still sends a
    # hard-coded id list — confirm whether topCids should be used instead.
    ajax_params = {
        'nodeids': 25949,
        'topCids': '2840959,2840504,2840804,2841177,',
    }
    pageidx = 2
    while True:
        ajax_params['pageidx'] = pageidx
        ajax_params['lastTime'] = int(round(time.time() * 1000))
        resp = r.get(penpai_ajax_url, params=ajax_params,
                     headers=penpai_headers)
        resp_content = resp.text
        print("爬取:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    # Stop once an item is more than 12 hours old.
                    if int(hours_before.group(1)) > 12:
                        return news_list
                    else:
                        news_list.append(
                            News(_id=result[0].split('_')[-1],
                                 title=result[2],
                                 overview=result[3].replace('\n', '').replace(' ', ''),
                                 url=penpai_url + result[0],
                                 image='http:' + result[1],
                                 publish_time=result[5],
                                 origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))
def testNews():
    """Smoke-test: parse a known article URL, then load it back from the DB.

    BUG FIX: the original assigned `result = news.loadfromdb(testurl)` but
    formatted the empty News object into the response instead of the loaded
    result, making the assignment dead code.
    """
    # Try to add test url
    fetchWeb.test_parse_url()
    news = News()
    testurl = 'http://www.appledaily.com.tw/realtimenews/article/new/20150822/675760/'
    result = news.loadfromdb(testurl)
    return "Result: " + str(result)
def news_func(self):
    """Announce and open the News window, then greet with the headlines."""
    self.speak('Opening News.')
    from news import News
    window = News()
    self.news_win = window
    window.show()
    greeting = ('Welcome to News.\nThese are the latest international '
                'headlines according to BBC News Network.')
    self.speak(greeting)
def test_to_dict():
    """News.to_dict must expose exactly the five constructor fields."""
    sample = News("title", "description", "published", "url", "full_text")
    expected = {
        "title": "title",
        "description": "description",
        "url": "url",
        "published": "published",
        "full_text": "full_text",
    }
    assert sample.to_dict() == expected
def prep_news_data(self): if not self.news_market_data: print 'Preparing news and stock data...\n' news = News('Resources/articles.db') raw = news.db_articles() train_raw, test_raw = divide_list_by_ratio(raw) # prep_news_data returns a tuple of vectors, labels self.train_vecs, self.train_labs = self.prep_news_articles(train_raw, fit=True) self.test_vecs, self.test_labs = self.prep_news_articles(test_raw) self.news_market_data = True self.movie_review_data = False
def parse_single_url(url):
    """Parse one Apple Daily realtime-news article page into a News record.

    Returns False when the article no longer exists, True after the parsed
    News has been written to the DB.
    """
    content = urllib2.urlopen(url).read()
    # The site returns this marker text for deleted/non-existent articles.
    if "該則即時新聞不存在" in content:
        return False
    else:
        soup = BeautifulSoup(
            content,
            from_encoding='utf-8',
        )
        title = str(soup.find("h1", {"id": "h1"}).string)
        contents = soup.find("p", {"id": "summary"})
        # Strip embedded iframes from the summary before rendering it.
        while "</iframe>" in contents.renderContents():
            if contents.iframe.decompose() == None:
                break
        desc_contents = contents.renderContents()
        popularity_data = soup.find("a", attrs={"class": "function_icon clicked"})
        if popularity_data == None:
            popularity = 0
        else:
            popularity = parse_string_to_popularity(popularity_data.string)
        news_datetime = parse_string_to_datetime(soup.find("time").string)
        news_url = soup.find("meta", {"property": "og:url"})['content']
        news_source = soup.find("meta", {"property": "og:site_name"})['content']
        # First image: prefer the "t1" anchor, fall back to the gallery figure.
        img_url1 = soup.find("a", attrs={"class": "t1"})
        img_url2 = soup.find("figure", attrs={"class": "lbimg sgimg sglft"})
        if img_url1 != None:
            img_url = img_url1.img['src']
        elif img_url2 != None:
            img_url = img_url2.a.img['src']
        else:
            img_url = ""
        logging.debug("news_url: " + str(news_url))
        logging.debug("title: " + str(title))
        logging.debug("content: " + str(desc_contents))
        logging.debug("popularity: " + str(popularity))
        logging.debug("news_datetime: " + str(news_datetime))
        logging.debug("news_first_image_url: " + str(img_url))
        logging.debug("news_source: " + str(news_source))
        news = News(news_url=news_url,
                    title=title,
                    content=desc_contents,
                    popularity=popularity,
                    news_datetime=news_datetime,
                    news_first_image_url=img_url,
                    news_source=news_source)
        logging.info("Add news: " + str(news))
        news.writetodb()
        return True
def index():
    """Return a paginated JSON listing of news items (20 per page)."""
    news = News()
    page = request.args.get('page')
    page = 1 if page is None else page
    data = news.list(page=int(page), limit=20)
    total = news.count()
    if data is None:
        return jsonResponse(type='404')
    # Serialize each item's date as a unix timestamp for the JSON payload.
    for idx, item in enumerate(data):
        data[idx]['date'] = time.mktime(item['date'].timetuple())
    return jsonResponse(data=data, extra_data=[{'total': total}])
def send_rate(message):
    """Show the current news item with Yes/No inline rating buttons."""
    n = News()
    n.find_news(news_counter)
    markup = telebot.types.InlineKeyboardMarkup()
    yes_btn = telebot.types.InlineKeyboardButton('Yes', callback_data='news_yes')
    no_btn = telebot.types.InlineKeyboardButton('No', callback_data='news_no')
    markup.row(yes_btn, no_btn)
    text = f"{n.title}\n{n.url}\n\nAre you interested in this news?"
    bot.send_message(message.chat.id,
                     text,
                     disable_web_page_preview=True,
                     reply_markup=markup)
async def on_message(message):
    """Discord handler: ignore self, answer !hello and !top-headlines."""
    # we do not want the bot to reply to itself
    if message.author == client.user:
        return
    content = message.content
    if content.startswith('!hello'):
        greeting = 'Hello {0.author.mention}'.format(message)
        await client.send_message(message.channel, greeting)
    if content.startswith('!top-headlines'):
        api = News(TOKENS['news_api_token'])
        headlines = api.topNewsHeadlines()
        print(headlines)
def call_fractory_news():
    """Render the ten latest Fractory headlines as clickable labels.

    The original repeated ten near-identical label blocks; they are
    collapsed into a single loop, eliminating the copy-paste surface
    where index bugs creep in.
    """
    lab = tk.Label(text="Fractory news", padx=5, pady=20)
    lab.grid(row=0, column=0, sticky="W")
    nws = News()  # Instantiating News class from news module
    lst = nws.fractory()
    lst_values = list(lst.values())
    lst_keys = list(lst.keys())
    lks = []
    for i in range(10):
        label = tk.Label(text=f"{i + 1}. {lst_keys[i]}", fg="blue", cursor="hand2")
        # Bind the URL via a default argument: late-binding closures would
        # otherwise make every label open the last link.
        label.bind('<Button-1>', lambda j, url=lst_values[i]: open_link(url))
        label.grid(row=i + 1, column=0, sticky="W")
        lks.append(label)
def news():
    """Flask view: show the user's news feed; POST deletes one item by id."""
    posts = MyProfile(current_user.username)
    news = News()
    news.print_news(current_user.username)
    if request.method == 'POST':
        id = request.form['news_id_delete']
        try:
            # New(...) positional signature: only the id and the delete
            # flag (9th positional arg = True) matter for deletion here.
            mynew = New(id, None, None, None, None, None, None, None, True, None)
            mynew.delete_new()
        except:
            # NOTE(review): bare except hides the real failure — consider
            # narrowing the exception type and logging it.
            flash('Could not delete new!')
        return redirect(url_for('news'))
    return render_template("news.html", posts=posts, news=news.news_arr)
def create_google_news(keyword_input):
    """Prompt for a result count, crawl Google News for *keyword_input*,
    print each article, and return the list of News objects."""
    limit = int(input("검색할 뉴스 개수를 입력하세요 : "))
    # Crawl the news data into a dict of parallel lists.
    news_dict = google_news_clipping(keyword_input, limit)
    # Build one News object per crawled article.
    news_items = []
    for idx in range(len(news_dict['title'])):
        news_items.append(News(news_dict['title'][idx],
                               news_dict['link'][idx],
                               news_dict['contents'][idx],
                               news_dict['agency'][idx],
                               news_dict['date'][idx],
                               news_dict['time'][idx]))
    # Print every crawled article.
    for item in news_items:
        item.print()
    return news_items
def call_fabricator_news():
    """Render the ten latest thefabricator.com headlines as clickable labels.

    BUG FIXES vs. the original copy-paste blocks:
      * item 5 bound its click to lstf_values[1] instead of lstf_values[4]
      * item 6 displayed lstf_keys[1] instead of lstf_keys[5]
    The ten blocks are collapsed into one loop so indices cannot drift.
    """
    lab = tk.Label(text="The Fabricator.com news", padx=5, pady=20)
    lab.grid(row=11, column=0, sticky="W")
    nwsf = News()
    lstf = nwsf.fabricator()  # Calling fabricator method
    lstf_values = list(lstf.values())  # value list from fabricator.com site
    lstf_keys = list(lstf.keys())  # key list from fabricator.com site
    lksf = []
    for i in range(10):
        label = tk.Label(text=f"{i + 1}. {lstf_keys[i]}", fg="blue", cursor="hand2")
        # Default-argument binding avoids the late-binding closure pitfall.
        label.bind('<Button-1>', lambda j, url=lstf_values[i]: open_link(url))
        label.grid(row=i + 12, column=0, sticky="W")
        lksf.append(label)
def main():
    """
    The main method of the nuncium application will initialize the execution
    of the program. Threads will be used to query for user input. Each window
    has its own thread to manage the update of its own interface.

    BUG FIX: the shutdown wait was `while running: pass`, a busy-wait that
    pins one CPU core; sleeping between checks keeps behavior identical
    while making the wait cheap.
    """
    import time

    # UI object: The user interface of the nuncium application.
    ui = UI()
    # Integer: The height will consist of the entire screen and the width will
    # consist of approximately 1/5 of the screen's width.
    height = curses.LINES
    width = int(curses.COLS / 5)
    # String: The default news category that is displayed on startup.
    category = "Top Stories"
    # Window object: The window that will render the menu interface.
    menu_window = ui.window(height, width)
    ui.display_menu(menu_window, category, color=curses.COLOR_BLUE)
    # Integer: The starting position in the x-coordinate of the news window
    # will be rendered where the last window left off. The width of the news
    # window will consist of the remaining free space.
    x = width
    width = curses.COLS - width
    # Window object: The window that will render the news content.
    news_window = ui.window(height, width, x, y=0)
    # News object: The news aggregator of the nunicum application.
    news = News()
    news.fetch_news(ui, news_window)
    ui.cursor(menu_window, x=1, y=1, y2=1, current="Top Stories")
    # Thread object: A thread used for updating the menu and news content.
    menu_thread = Thread(target=update, args=(menu_window,), daemon=True)
    news_thread = Thread(target=update, args=(news_window,), daemon=True)
    menu_thread.start()
    news_thread.start()
    # Wait for the threads to finish working.
    while running:
        time.sleep(0.1)
    ui.cleanup()
def parse_single_url(url):
    """Parse one Apple Daily realtime-news article page into a News record.

    Returns False when the article no longer exists, True after the parsed
    News has been written to the DB. (Duplicate of the earlier
    parse_single_url — consider deduplicating at module level.)
    """
    content = urllib2.urlopen(url).read()
    # The site returns this marker text for deleted/non-existent articles.
    if "該則即時新聞不存在" in content:
        return False
    else:
        soup = BeautifulSoup(content, from_encoding='utf-8',)
        title = str(soup.find("h1", {"id":"h1"}).string)
        contents = soup.find("p", {"id":"summary"})
        # Strip embedded iframes from the summary before rendering it.
        while "</iframe>" in contents.renderContents():
            if contents.iframe.decompose() == None :
                break
        desc_contents = contents.renderContents()
        popularity_data = soup.find("a", attrs={"class":"function_icon clicked"})
        if popularity_data == None:
            popularity = 0
        else:
            popularity = parse_string_to_popularity(popularity_data.string)
        news_datetime = parse_string_to_datetime(soup.find("time").string)
        news_url = soup.find("meta", {"property":"og:url"})['content']
        news_source = soup.find("meta", {"property":"og:site_name"})['content']
        # First image: prefer the "t1" anchor, fall back to the gallery figure.
        img_url1 = soup.find("a",attrs={"class":"t1"})
        img_url2 = soup.find("figure", attrs={"class":"lbimg sgimg sglft"})
        if img_url1 != None:
            img_url = img_url1.img['src']
        elif img_url2 != None:
            img_url = img_url2.a.img['src']
        else:
            img_url = ""
        logging.debug("news_url: " + str(news_url))
        logging.debug("title: " + str(title))
        logging.debug("content: " + str(desc_contents))
        logging.debug("popularity: " + str(popularity))
        logging.debug("news_datetime: " + str(news_datetime))
        logging.debug("news_first_image_url: " + str(img_url))
        logging.debug("news_source: " + str(news_source))
        news = News(news_url=news_url, title=title, content=desc_contents, popularity=popularity, news_datetime=news_datetime, news_first_image_url=img_url, news_source=news_source)
        logging.info("Add news: " + str(news))
        news.writetodb()
        return True
def __init__(self, parent, *args, **kwargs):
    """Build the News frame: a title label plus a container for headlines.

    parent: parent widget; must expose currentActiveUser.newsTopic.
    """
    Frame.__init__(self, parent, bg='black')
    self.title = 'News'
    # The handler fetches headlines for the active user's chosen topic.
    self.newsHandler = News(parent.currentActiveUser.newsTopic)
    self.newsLabel = Label(self, text=self.title, font=("Helvetica", 28), fg="white", bg="black", justify="left")
    self.newsElementContainer = Frame(self, bg="black")
    self.parent = parent
    self.newsLabel.pack(side=TOP, anchor=W)
    self.newsElementContainer.pack(side=TOP, anchor=E)
def fetch_news(page):
    """Fetch one Ajax page of 'item-news' entries and return them as dicts."""
    news_list = []
    params = {'m': 'lists', 'a': 'ajaxNews', 'cid': 4, 'page': page}
    resp = r.get(ajax_url, params=params, headers=headers)
    print('爬取:', resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        # The JSON payload is wrapped in one extra character on each side.
        rst = json.loads(resp.text[1:-1])['rst']
        pq = PyQuery(rst)
        for item in pq('div.item-news').items():
            a_url = item('div > p > a').attr('href')
            matched = title_extract_pattern.search(item('div.item-main').text())
            if matched is not None:
                entry = News(_id=a_url.split('/')[-1].replace('.html', ''),
                             url=a_url,
                             title=matched.group(1),
                             overview=matched.group(2),
                             publish_time=item('div.item-date').text())
                news_list.append(entry.to_dict())
    return news_list
def fetch_iheima_news():
    """Crawl the iheima paginated feed; stop at the first item older than
    86400 s (24 hours) and return the collected news dicts."""
    page = 1
    news_list = []
    while True:
        resp = r.get(iheima_url, params={
            'page': page,
            'pagesize': 20
        }, headers=iheima_headers)
        print("爬取:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Stop once an item is older than 86400 s (24 hours).
                # NOTE(review): the original comment said "12 hours", which
                # contradicts the 86400-second threshold — confirm intent.
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
        page += 1
def run_bot():
    """run_bot is the main function of the program"""
    base = get_abs_path_of_filepath(__file__)
    configpath = base + "/config/config.ini"
    userspath = base + "/config/users.ini"
    newspath = base + "/config/news.ini"
    bot = Bot(configpath, userspath)
    try:
        dispatcher = bot.init_updater()
        news = News(configpath, newspath, userspath, dispatcher)
        bot.init_handler()
        bot.start_bot()
        news.start_thread()
    except KeyboardInterrupt:
        # Ask the news thread to stop on Ctrl-C.
        news.stopflag = True
    except Exception as exc:
        Log.LOGGING.error("Unhandled Exception: {}".format(exc))
def test_news_from_feed():
    """News.from_feed must strip the feedburner HTML cruft from the summary,
    keep title/link verbatim, and parse the RFC-2822 date into an aware
    UTC datetime."""
    # Raw entry as feedparser would deliver it, including tracking markup.
    test_feed = {
        "title": "American dies of coronavirus in China; five Britons infected in French Alps",
        "summary": 'A 60-year-old American has died of the new coronavirus, the first confirmed non-Chinese death of the illness, U.S. officials said, as millions of Chinese began returning home after a Lunar New Year break that was extended to try to contain the outbreak.<div class="feedflare">\n<a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?i=OmyoX_6P_ok:xVts69RBOpQ:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/topNews?a=OmyoX_6P_ok:xVts69RBOpQ:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/reuters/topNews?i=OmyoX_6P_ok:xVts69RBOpQ:-BTjWOF_DHI" border="0"></img></a>\n</div><img src="http://feeds.feedburner.com/~r/reuters/topNews/~4/OmyoX_6P_ok" height="1" width="1" alt=""/>',
        "link": "http://feeds.reuters.com/~r/reuters/topNews/~3/OmyoX_6P_ok/american-dies-of-coronavirus-in-china-five-britons-infected-in-french-alps-idUSKBN20003J",
        "published": "Sat, 08 Feb 2020 12:22:37 -0500",
    }
    item_from_feed = News.from_feed(test_feed)
    assert (
        item_from_feed.title
        == "American dies of coronavirus in China; five Britons infected in French Alps"
    )
    # The HTML feedflare block must be stripped, leaving only the text.
    assert (
        item_from_feed.description
        == "A 60-year-old American has died of the new coronavirus, the first confirmed non-Chinese death of the illness, U.S. officials said, as millions of Chinese began returning home after a Lunar New Year break that was extended to try to contain the outbreak."
    )
    assert (
        item_from_feed.url
        == "http://feeds.reuters.com/~r/reuters/topNews/~3/OmyoX_6P_ok/american-dies-of-coronavirus-in-china-five-britons-infected-in-french-alps-idUSKBN20003J"
    )
    # -0500 offset must be normalized to an aware UTC datetime.
    assert item_from_feed.published == datetime.datetime(2020, 2, 8, 17, 22, 37, tzinfo=pytz.utc)
def news(self):
    """Lazily construct and cache the News helper bound to this object."""
    if self._news is None:
        # Import deferred to first use to avoid a module-level cycle.
        from news import News
        self._news = News(self)
    return self._news
def fetch_news(category):
    """Fetch two pages (20 items each) of the 'tianyi' feed for *category*."""
    news_list = []
    for page_index in range(2):
        resp = r.get(data_base_url, params={
            "cre": "tianyi",
            "mod": category,
            "_": int(round(time.time() * 1000)),  # cache-busting timestamp
            "offset": 20 * page_index
        }, headers=headers)
        print('爬取:', resp.url)
        if resp is not None:
            for d in resp.json()['data']:
                entry = News(_id=d['uuid'],
                             title=d['title'],
                             overview=d['intro'],
                             image=d['thumb'],
                             publish_time=d['ctime'],
                             origin=d['author'],
                             url=d['url_https'])
                news_list.append(entry.to_dict())
        time.sleep(random.randint(0, 2))
    return news_list
class Newsletter(Frame):
    """Tk frame showing a 'News' heading plus a refreshable headline list."""

    def __init__(self, parent, *args, **kwargs):
        Frame.__init__(self, parent, bg='black')
        self.title = 'News'
        # Headlines are fetched for the active user's configured topic.
        self.newsHandler = News(parent.currentActiveUser.newsTopic)
        self.newsLabel = Label(self, text=self.title, font=("Helvetica", 28), fg="white", bg="black", justify="left")
        self.newsElementContainer = Frame(self, bg="black")
        self.parent = parent
        self.newsLabel.pack(side=TOP, anchor=W)
        self.newsElementContainer.pack(side=TOP, anchor=E)

    def get_news(self):
        """Clear the container and repopulate it with fresh headlines."""
        for widget in self.newsElementContainer.winfo_children():
            widget.destroy()
        for newsTitle in self.newsHandler.get_news():
            news_element = NewsElement(self.newsElementContainer, news_title=newsTitle)
            news_element.pack(side=TOP, anchor=W)
def fetch_gd_news():
    """Crawl the Xinhua Guangdong channel; keep only items published within
    the last 43200 s (12 hours) and return them as news dicts."""
    news_list = []
    xhs_headers['Host'] = xhs_gd_host
    resp = r.get(xhs_gd_url, headers=xhs_headers)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        # spans[0] = source, spans[1] = publish timestamp.
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Keep only news published within the last 12 hours.
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list
def get_domain_news(self, domain, now_time):
    '''
    domain: str, the domain of source (e.g. 'bbc.com')
    now_time: Datetime, the current datetime in iso form

    return: domain_news_result: list<News>
        The news from the last query until now
    '''
    overall_count, first_page_content = self.get_domain_news_count(domain, now_time)
    raw_results = list(first_page_content)
    # 20 results per page; fetch at most pages 2..4 beyond the first.
    total_pages = math.ceil(overall_count / 20)
    for page in range(2, min(total_pages + 1, 5)):
        raw_results.extend(
            self.get_domain_news_at_page_n(domain, now_time, page))
    return [News(domain, r["title"], r["description"], r["publishedAt"],
                 r["urlToImage"], r["url"])
            for r in raw_results]
def fetch_web_news_more(start_id):
    """Recursively page through the 'load more' API starting after *start_id*.

    Appends parsed items to the module-level data_list and recurses until
    the oldest item on a page is more than 86400 s (24 h) old.
    """
    global data_list
    headers['Referer'] = web_news_url
    resp = r.get(load_more_base_url, params={
        'type': 'web_latest_article',
        'b_id': start_id,
        'per_page': 30
    }, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        resp_json = resp.json()
        items = resp_json['data']['items']
        for item in items:
            post = item['post']
            motifs = post['motifs']
            # NOTE(review): this guards against motifs being None but indexes
            # [0] unconditionally otherwise — an empty list would still raise.
            motifs_name = motifs[0]['name'] if motifs is not None else ''
            data_list.append(
                News(_id=str(item['id']),
                     title=post['title'],
                     url=news_detail_base_url + str(post['id']),
                     image=post['cover'],
                     publish_time=post['published_at'],
                     overview=post['summary'],
                     origin=post['user']['name'] + '|' + motifs_name).to_dict())
        # Stop when the last item on this page is older than 24 hours;
        # otherwise recurse with its id as the next cursor.
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(items[-1]['post']['published_at'],
                                  "%Y-%m-%d %H:%M:%S"))) > 86400:
            return None
        else:
            return fetch_web_news_more(items[-1]['id'])
def fetch_diyicaijing_news():
    """Crawl page 2 of the diyicaijing article listing into news dicts."""
    news_list = []
    resp = r.get(diyicaijing_url,
                 params={'page': 2},
                 headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        # Skip subscription links; they are not articles.
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            # NOTE(review): findall returns a list, never None — this check is
            # always true; an empty match list simply skips the inner loop.
            if text_result is not None:
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list
def scrap_news_company(comp):
    """Scrape the news page for *comp*'s stock number.

    Returns the list of News published within the last 48 hours on success,
    or None when the expected page layout is missing (the failure is counted
    in the module-level count_fail).

    Cleanups vs. original: the local name `list` no longer shadows the
    builtin, the unused `count_hot` counter and a stray semicolon are gone,
    and the error path is an early guard clause.
    """
    num = comp.stock
    global count_fail, count_suc
    url = news_url + str(num).zfill(5)
    html = scrap_html(url)
    response_soup = BeautifulSoup(html, 'html.parser')
    list_node = response_soup.find('div', class_='ulList02')
    stamp_now = datetime.now().timestamp()
    if not list_node:
        print("error happend", num)
        count_fail = count_fail + 1
        return None
    # print("get stock:", num)
    h1 = response_soup.find("h1", class_="tf")
    if h1:
        comp.name = h1.get("title")
    # Price movement badge: prefer "up", fall back to "down".
    up = response_soup.find("div", class_="div002 up")
    if not up:
        up = response_soup.find("div", class_="div002 down")
    if up:
        spans = up.find_all("span")
        if spans:
            comp.up = spans[-1].text
    items = list_node.find_all("li")
    count_suc += 1
    hot_news = []
    comp.ishot = len(items) > 3
    for li in items:
        if not li.find("a"):
            continue
        if not li.find("div", class_="bar01"):
            continue
        txt = li.find("a").text
        link = li.find("a").get("href")
        date = li.find("div", class_="bar01").text
        date = date.split(":").pop()
        cdate = datetime.strptime(date, "%Y-%m-%d %H:%M")
        # Keep only items newer than 2 days (48 hours).
        if stamp_now - cdate.timestamp() < 24 * 60 * 60 * 2:
            hot_news.append(News(txt, date, link))
    # print("finished get stock: %s ;hot new:%d" % (num, len(hot_news)))
    return hot_news
def main():
    """Entry point: configure logging, read the bot config, and post the
    first not-yet-posted news link to Reddit.

    FIX: yaml.load() without an explicit Loader is deprecated (and unsafe
    for untrusted input) — safe_load is the correct call for a plain config
    file. The config file handle is now also closed via a `with` block.
    """
    level = 0
    # Read program arguments (e.g. --level=2 overrides the configured level).
    for arg in sys.argv[1:]:
        (param, value) = arg.split('=')
        if param == '--level':
            level = int(value)
    path = os.path.dirname(os.path.realpath(__file__))
    with open('{0}/configs/logging.yml'.format(path), 'r') as loggingConf:
        logging.config.dictConfig(yaml.safe_load(loggingConf))
    logger = logging.getLogger(LOGGER)
    logger.info('Program started')
    config = configparser.ConfigParser()
    config.read('{0}/configs/bot.ini'.format(path))
    username = config['Reddit']['username']
    password = config['Reddit']['password']
    user_agent = config['Reddit']['user-agent']
    dry_run = config['Bot'].getboolean('dry-run')
    if dry_run:
        logger.info('Running in dry run mode. Nothing will be commited')
    reddit = Reddit(username, password, user_agent, dry_run)
    history = History('{0}/{1}'.format(path, DATABASE))
    news = News()
    if level == 0:
        level = int(config['Bot']['level'])
    news_items = news.get_news_items(level)
    for item in news_items:
        url = item[0]
        title = item[1]
        degree = item[2]
        # Post the first link we have not posted before, then stop.
        if not history.has_link_been_posted(url):
            history.add_link_as_posted(url, dry_run)
            if not reddit.post_link(get_redirect_url(url), title):
                continue
            break
    logger.info('Program done')
def __get_news_from_api(self):
    """Compare API news ids against the database and fetch anything new."""
    db_ids = [entry['news_id'] for entry in News().list()]
    api_news = fetch_data(sectionUrl)['news']
    api_ids = [entry['news_id'] for entry in api_news]
    fetch_log('fetching')
    if api_ids[0] == db_ids[0]:
        fetch_log('Nothing new')
        return None
    fetch_log("detected new data from api")
    # Locate the newest stored id inside the API listing; everything before
    # that index is new. If it never appears, the whole listing is new.
    matched = [idx for idx, val in enumerate(api_ids) if val == db_ids[0]]
    end = len(api_ids) if len(matched) == 0 else matched[0]
    fetch_log("end index is %s" %end)
    # Fetch oldest-first so DB insertion order matches publication order.
    for item in api_news[:end][::-1]:
        self.__fetch_news(item)
    return None
def crawl_topnews(self, lang="en"):
    """
    Crawl the GoogleNews top-stories RSS feed and save each entry as a
    News object.

    @param lang='en' the language of the news (optional)
    """
    feed_url = self.build_url({"cf": "all", "ned": "us", "hl": lang})
    parsed = feedparser.parse(feed_url)
    for entry in parsed["entries"]:
        # Google wraps the real article URL in a redirect; recover it from
        # the `url` query parameter of the redirect link.
        query_parts = urlparse(entry["link"]).query.split("&")
        clean_url = dict(part.split("=") for part in query_parts)["url"]
        entry["lang"] = lang
        entry["link"] = clean_url
        item = News(entry)
        item.save()
def __fetch_news(self, news_data):
    """Download a single news item, process its assets and persist it.

    Returns the saved item's id, or None when the API returned nothing or
    the returned id does not match the requested one.
    """
    nid = news_data['news_id']
    fetch_log('fetch news id %d' % (nid))
    payload = fetch_data(newsUrl + str(nid))
    fetch_log('fetched news id %d' % (nid))

    # Bail out on a missing response or an id mismatch.
    if payload is None or nid != payload['id']:
        return None

    payload['body'] = parse_news_body(payload['body'])
    fetch_log('parsed news body')

    try:
        payload['image'] = upload_to_qiniu(payload['image'])
    except KeyError:
        # No image in the payload — fall back to the default placeholder.
        payload['image'] = 'default-lg.jpg'
    fetch_log('image uploaded')

    payload['thumbnail'] = upload_to_qiniu(news_data['thumbnail'])
    fetch_log('thumbnail uploaded')

    payload['date'] = datetime.strptime(news_data['date'], '%Y%m%d')

    record = News(news_id=int(payload['id']))
    record.save(payload)
    fetch_log('news %s saved' % int(payload['id']))
    return payload['id']
def new_page():
    """Flask view that lists, adds, updates, deletes and searches news.

    GET renders the news list; a POST is dispatched on whichever form field
    is present (delete / add / update / search).
    """
    nes = News(app.config['dsn'])

    if request.method == 'GET':
        now = datetime.datetime.now()
        nelist = nes.get_newlist()
        return render_template('news.html', NewList=nelist,
                               current_time=now.ctime())
    elif 'news_to_delete' in request.form:
        # Delete every row the user checked.
        for id_new in request.form.getlist('news_to_delete'):
            nes.delete_new(id_new)
        return redirect(url_for('new_page'))
    elif 'news_to_add' in request.form:
        nes.add_new(request.form['title'], request.form['content'],
                    request.form['country'])
        return redirect(url_for('new_page'))
    elif 'news_to_update' in request.form:
        nes.update_new(request.form['id_new'], request.form['title'],
                       request.form['content'])
        return redirect(url_for('new_page'))
    elif 'news_to_search' in request.form:
        search_results = nes.search_new(request.form['name'])
        now = datetime.datetime.now()
        nelist = nes.get_newlist()
        return render_template('news.html', NewList=nelist,
                               SearchList=search_results,
                               current_time=now.ctime())

    # Fallback: an unrecognised POST previously fell off the end returning
    # None (an HTTP 500 in Flask); redirect back to the listing instead.
    return redirect(url_for('new_page'))
def get_news_by_url(url):
    """Scrape one elfinancierocr article and return the populated news dict.

    Returns news.news (the populated record) on success, or None when any
    part of the extraction fails (the failure is logged).
    """
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))
        # title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)
        # postTime: the dateline holds "author / time"; strip am/pm before
        # parsing since the format string has no marker for them.
        author_posttime = soup.find("p", "dateline").text.replace("\n", "").lower().replace("\t", "").split("/")
        post_time = author_posttime[1].replace("pm", "").replace("am", "").strip()
        t_format = "%d %b %Y, %I:%M"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)
        # author
        author = author_posttime[0]
        news.set_author(author)
        # url
        news.set_url(url)
        # date (crawl timestamp, UTC)
        date = datetime.utcnow().isoformat()
        news.set_date(date)
        # source
        source = 'elfinancierocr'
        news.set_source(source)
        # content, encoding, id, country, labels
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)
        # encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)
        news.news = message.add_embers_ids(news.news)
        return news.news
    # except Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # are no longer swallowed; any scraping error is still logged and
    # reported as None.
    except Exception:
        log.exception("Exceptopn when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
def news(id):
    """Return a single news item as JSON, or a 404 response when absent."""
    record = News(news_id=id).get()
    if record is None:
        return jsonResponse(type='404')
    return jsonResponse(data=record)
def rss():
    """Render the news feed as an Atom XML response."""
    feed_items = News().sort()
    rendered = render_template('rss.xml', data=feed_items, site_url=site_url)
    response = make_response(rendered)
    response.headers['Content-Type'] = 'application/atom+xml; charset=utf-8'
    return response
def get_news_by_url(url):
    """Scrape one lta_reuters article and return the populated news dict.

    Returns news.news (the populated record) on success, or None when any
    part of the extraction fails (the failure is logged).
    """
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))
        # title
        title = soup.find_all("h1")[0].text
        news.set_title(title)
        # postTime comes from the REVISION_DATE meta tag
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)
        # author
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)
        # url
        news.set_url(url)
        # date (crawl timestamp, UTC)
        date = datetime.utcnow().isoformat()
        news.set_date(date)
        # source
        source = "lta_reuters"
        news.set_source(source)
        # content, encoding, id, country, labels
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)
        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)
        news.news = message.add_embers_ids(news.news)
        return news.news
    # except Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # are no longer swallowed; any scraping error is still logged and
    # reported as None.
    except Exception:
        log.exception("Exceptopn when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
# -*- coding: utf-8 -*-
import time
import threading

import feedparser

from subprocess import call

from image import Matrix
from news import News
from tetris import Tetris
import joystick


if __name__ == '__main__':
    matrix = Matrix(1, 1)

    # Read the joystick in a background daemon thread so it never blocks
    # interpreter shutdown.
    (js, queue) = joystick.queue('/dev/input/js0')
    js.daemon = True
    js.start()

    # Alternate between the News ticker and Tetris: whenever the current
    # app's run() reports it is done, swap to the other one.
    app = News(matrix, queue)
    while True:
        if not app.run():
            if isinstance(app, News):
                app = Tetris(matrix, 0, queue)
            else:
                app = News(matrix, queue)
        # ~20 FPS. Use a float literal: under Python 2 (this file carries a
        # coding header), 1 / 20 is integer division and evaluates to 0,
        # turning this into a busy loop.
        time.sleep(1 / 20.0)