def parse_news(news_page):
    # try:
    news_raw = BeautifulSoup(news_page.content, features="lxml").find("div", "noticia")
    print(news_raw.find_all("em"))
    state = len(news_raw.find_all("em"))
    if state == 1:
        news = News(
            news_raw.find("h1", "noticia").text,
            news_raw.find("h2").text if news_raw.find("h2") else "",
            news_raw.find("em").text[4:],
            search_content(str(news_raw)),
            news_raw.find("strong").text,
        ).__dict__
    else:
        news = News(
            news_raw.find("h1", "noticia").text,
            news_raw.find("h2").text if news_raw.find("h2") else "",
            news_raw.find_all("em")[1].text[4:],
            search_content(str(news_raw)),
            news_raw.find("strong").text,
            news_raw.find("em").text[14:]
        ).__dict__
    # except:
    #     news = None
    return news

def get_news(id):
    if id.isdigit():
        news = News.get(id=id)
    else:
        news = News.get_by_alias(alias=id)
    if not news:
        return error(10003, 'news id not found')
    return news

def test_make_news(self):
    news = News()
    self.assertTrue(news is not None)
    date = '2018/03/11'
    day = news.get_news_day(date)
    html = day.to_html()
    orig = news.news[news.dates.index(date)]
    self.assertEqual(html, orig)

def get(self):
    cid = 1
    news_header = News.get_all(order='create_time desc', start=0, limit=7)
    news_latest = News.get_all(order='create_time desc', start=7)
    news_popular = News.get_all(order='read_count desc', start=0)
    loginform = LoginForm()
    regform = RegisterForm()
    return render_template('index.html', **locals())

def get(self, cid):
    if not cid or not Category.get(cid):
        abort(404)
    if cid == 1:
        return redirect(url_for('main.home'))
    news_header = News.get_by_category(cid, order='create_time desc', start=0, limit=7)
    news_latest = News.get_by_category(cid, order='create_time desc', start=7)
    news_popular = News.get_by_category(cid, order='read_count desc', start=0)
    loginform = LoginForm()
    regform = RegisterForm()
    return render_template('index.html', **locals())

def news_by_category_latest(cid):
    data = {}
    start = request.args.get('start', 0)
    limit = request.args.get('limit', PAGE_LIMIT)
    if cid == 1:
        # headline (front-page) content
        rs = News.get_all('create_time desc', int(start), int(limit))
    else:
        rs = News.get_by_category(cid, 'create_time desc', int(start), int(limit))
    data['count'] = len(rs)
    data['newslist'] = rs
    return data

def save(self):
    userInput = self.getInput()
    try:
        thumbnail = int(userInput['thumbnail'])
        News.create(
            name=userInput['name'],
            thumbnail=thumbnail,
            content=userInput['content']
        )
    except Exception as e:
        return self.error(msg='新增失败: %s' % e, url=self.makeUrl('/admin/news/list'))

def saveNewInDatabase():
    data = request.form
    split_category = data['category'].split("-")
    id_category = int(split_category[0])
    category_name = split_category[1].lower()
    new = News()
    if new.create(connection):
        return redirect('/admin/panel')
    else:
        flash("Error while executing command to database")
        return redirect(url_for('.addNew'))

def get(self, nid):
    news = None
    if not nid:
        abort(404)
    news = News.get(id=nid) or News.get_by_alias(alias=nid)
    if not news:
        abort(404)
    news.update(news.id, 'read_count', news.read_count + 1)
    loginform = LoginForm()
    regform = RegisterForm()
    news_popular = News.get_all(order='id', start=0)
    return render_template('news.html', **locals())

def save_news_day(self, orig_date):
    date = self.date.data
    orig_date = orig_date.replace('-', '/')
    message = self.message.data
    items = []
    for item in self.items:
        if len(item.text.data) > 0:
            items.append((item.text.data, item.link.data, item.title.data))
    news_day = NewsDay(date, message, items)
    if self.is_new.data == 'True':
        News().publish_news_day(news_day)
        return 'published'
    else:
        News().save_news_day(news_day, orig_date=orig_date)
        return 'saved'

def about_us(self):
    try:
        newsList = News.select().order_by(News.id.desc())
        self.privData['NEWS_LIST'] = newsList
        return self.display('about-us')
    except Exception as e:
        return self.error(msg='获取企业资质相关列表失败!')

def get_news():
    data = request.form
    skip_news = data['skip_news']
    step_news = data['step_news']
    news = News.select_news(connection, skip_news, step_news)
    output = {'news': [json.loads(new.toJSON()) for new in news]}
    return jsonify(output)

def news():
    u = current_user()
    if u is None:
        return render_template('index.html')
    else:
        all_news = News.all_news()
        return render_template('news.html', all_news=all_news)

def read_from_news_api(query, from_date, to_date, query_in_title):
    if query_in_title:
        response = newsapi.get_everything(qintitle=query_in_title,
                                          from_param=from_date,
                                          to=to_date,
                                          language='en',
                                          sort_by='relevancy')
    else:
        response = newsapi.get_everything(q=query,
                                          from_param=from_date,
                                          to=to_date,
                                          language='en',
                                          sort_by='relevancy')
    news_list = []
    for item in response['articles']:
        news = News(id=item['url'],
                    title=item['title'],
                    url=item['url'],
                    img_url=item['urlToImage'],
                    description=item['description'],
                    publishedAt=item['publishedAt'],
                    source=item['source']['name'])
        news_list.append(news)
    return news_list

async def scrape_fitba(self):
    GUILD_NAME = os.environ.get("GUILD_NAME")
    channel = discord.utils.get(self.bot.get_all_channels(),
                                guild__name=GUILD_NAME,
                                name=channels.OBAVIJESTI)
    news = [i async for i in self.fitba_scraper.parse_data()]
    for new in news:
        if channel is not None:
            try:
                notification = News(
                    hashedUrl=hashlib.md5(new["url"].encode('utf-8')).hexdigest(),
                    dateTime=datetime.strptime(new["date"], dtc.EU_SHORT_FORMAT),
                    source="fitba"
                )
                entity = self.session.query(News) \
                    .filter(News.HashedUrl == notification.HashedUrl) \
                    .one_or_none()
                if entity is None:
                    self.session.add(notification)
                    self.session.commit()
                elif entity.DateTime != notification.DateTime:
                    entity.DateTime = notification.DateTime
                    self.session.commit()
                else:
                    break
                await channel.send(new["url"])
            except SQLAlchemyError as err:
                print("Error: ", err)
                self.session.rollback()

def addNew():
    data = request.form
    print(data)
    return render_template("module_admin/add_new.html",
                           logged_in=True,
                           categories=News.select_news_categories(connection))

def news_create():
    if not auth.is_authorized():
        return redirect('/login')
    news = News(None, request.form['title'], request.form['content'], auth.get_user().id)
    news_repository.create(news)
    return redirect('/news')

def parse(self, response):
    try:
        news_name = response.xpath(
            "//div[@id='wrapper']/h1[1]/span[1]/text()").extract_first().strip()
        news_id = Helper.md5(news_name)
        self.__add_news(
            News(news_id=news_id, news_name=news_name, source=self.url))
        item = CommentsItem()
        item['news_id'] = news_id
        total_comments = int(
            re.findall(
                r'\d+',
                response.xpath(
                    "//div[@id='content']/div/div[@class='article']/div[@class='related_info']/div[@class='mod-hd']/h2[1]/span[@class='pl']/a/text()"
                ).extract_first().strip())[0])
        pages = int(total_comments / self.comments_per_page) \
            if total_comments % self.comments_per_page == 0 \
            else int(total_comments / self.comments_per_page) + 1
        # Get all comment pages, but crawl at most max_pages
        if pages > self.max_pages:
            pages = self.max_pages
        urls = [f'{self.comments_url}?p={p+1}' for p in range(pages)]
        for c_url in urls:
            yield scrapy.Request(c_url, meta={'item': item}, callback=self.__parse_comments)
    except Exception as ex:
        self.logger.error(
            f"Exception occurred when parsing page {self.url}", ex)

def _to_db_format(self, raw_news, category, subject):
    news_model_args = {
        'category': category,
        'subject': subject,
        'title': raw_news.get('title'),
        'content': raw_news.get('text'),
        'created_at': datetime.datetime.strptime(raw_news.get('discoverDate'),
                                                 '%Y-%m-%dT%H:%M:%S.%f+0000'),
        'internal_source': self.SOURCE,
        'internal_source_id': raw_news.get('id'),
        'language': 'en',
    }
    external_source = self._extract_external_source(raw_news)
    if external_source:
        news_model_args['external_source'] = external_source
    else:
        logger.error('News without external resource %s ' % raw_news)
    return News(**news_model_args)

def news_list(self):
    try:
        newsList = News.select().order_by(News.id.desc())
        self.privData['NEWS_LIST'] = newsList
        return self.display('news-list')
    except Exception as e:
        return self.error(msg='获取行业动态列表失败!')

def delete(self):
    inputParams = self.getInput()
    try:
        news = News.get(News.id == int(inputParams['id']))
        news.delete_instance()
    except Exception as e:
        return self.error(msg='删除失败: %s' % e, url=self.makeUrl('/admin/news/list'))

def news_popular():
    data = {}
    start = request.args.get('start', 0)
    limit = request.args.get('limit', PAGE_LIMIT)
    rs = News.get_all('read_count desc', int(start), int(limit))
    data['count'] = len(rs)
    data['newslist'] = rs
    return data

def post(self):
    if not self._auth.is_authorized():
        abort(401)
    args = news_parser.parse_args()
    news = News(None, args['title'], args['content'], self._auth.get_user().id)
    self._repository.create(news)
    return jsonify(news)

def news_by_category_popular(cid):
    data = {}
    start = request.args.get('start', 0)
    limit = request.args.get('limit', PAGE_LIMIT)
    rs = News.get_by_category(cid, 'read_count desc', int(start), int(limit))
    data['count'] = len(rs)
    data['newslist'] = rs
    return data

def user_click_interact():
    id_news = request.args.get('id_news', 'default_if_none')
    username_user = request.args.get('username_user', 'default_if_none')
    return jsonify({
        'OK': News.add_interact_with_user(connection, id_news, username_user)
    })

def news_retrieve():
    id = request.args.get('id', 0)
    if id:
        news = News.get(id=id)
        if not news:
            return error(404, 'news not exist')
        return news
    start = request.args.get('start', 0)
    limit = int(request.args.get('limit', PAGE_LIMIT))
    if limit > PAGE_MAX:
        limit = PAGE_MAX
    data = {}
    data['start'] = start
    data['data'] = News.get_all('create_time desc', int(start), int(limit))
    data['count'] = len(data['data'])
    data['total'] = News.get_total()
    return data

def news_details(self):
    inputParams = self.getInput()
    try:
        newsDetails = News.get(News.id == int(inputParams['id']))
        newsDetails.content = self.htmlunescape(newsDetails.content)
        newsDetails.createTime = newsDetails.createTime.strftime('%Y-%m-%d')
        self.privData['NEWS_DETAILS'] = newsDetails
        return self.display('news-details')
    except Exception as e:
        return self.error(msg='获取行业动态详情失败!')

def update(req):
    back = {'status': 'ok'}
    q = req.GET or req.POST
    id = q.get('id')
    name = q.get('name')
    pdate = q.get('pdate')
    abs = q.get('abs')
    rank = q.get('rank')
    text = q.get('text')
    pic = q.get('pic')
    kwargs = {'name': name, 'pdate': pdate, 'abs': abs, 'rank': rank, 'text': text, 'pic': pic}
    if id:
        News.objects.filter(pk=id).update(**kwargs)
        back['id'] = id
    else:
        h = News(**kwargs)
        h.save()
        back['id'] = h.id
    return to_json(back)

def news_delete():
    print(request.form.getlist('id'))
    try:
        id = request.form['id'].split(',')
    except KeyError:
        return error(400, u'参数错误')
    if not News.delete(id):
        return error(10020, 'delete failed')
    return 'delete ok'

def publish_minutes(self, member_id):
    minutes = Minutes(self.meeting_type.data, self.meeting_date.data)
    link = self.save_file(self, minutes)
    text = 'Latest {} from {}'.format(
        minutes.full_type(), get_member(member_id).player.full_name())
    message = self.message.data
    item = (text, link, 'Show minutes')
    news_day = NewsDay(message=message, items=[item])
    News().publish_news_day(news_day)
    return True

def index():
    # Slice results for pagination.
    # Filters come from GET args, e.g. /admin/news/?page=1&name_icontains=apple
    data = request.args.to_dict()
    # Supported filter types
    engine_filter = {'title__icontains': str}
    # Prepare filter criteria
    criteria = {}
    for k in data:
        if k in engine_filter:
            criteria[k] = engine_filter[k](data[k])
    pagination = Paginate('admin.news.index',
                          count=len(News.objects(**criteria)),
                          per_page=10)
    page = pagination.get_page()
    newss = News.objects(**criteria)[(page - 1) * 10:page * 10]
    return render.template('admin/news/index.html',
                           newss=newss,
                           pagination=pagination)

def list(self):
    inputParams = self.getInput()
    page = int(inputParams['page']) if 'page' in inputParams else 1
    count = config.COUNT_PER_PAGE
    offset = (page - 1) * count if page > 0 else 0
    newsList = News.select().order_by(News.id.desc())
    pageString = self.getPageStr(self.makeUrl('/admin/news/list'),
                                 page, count, newsList.count())
    self.privData['NEWS_LIST'] = newsList.paginate(offset, offset + count)
    self.privData['PAGE_STRING'] = pageString
    return self.display('newsList')

def get_last_news(self):
    """Get the newsfeed entry with the maximum id, i.e. the most recently added one."""
    with self.connection.cursor() as cursor:
        sql = """SELECT * FROM `ow_newsfeed_action`
                 WHERE `id` = (SELECT MAX(`id`) FROM `ow_newsfeed_action`)
                 AND `entityType` = "user-status" """
        cursor.execute(sql)
        line = cursor.fetchone()
        data = json.loads(line["data"])
        self.connection.commit()
        return News(text=data["status"])

def spider_news(type_name: str):
    html_content = session.get(
        types[type_name]['url']['index'].format(type_name)).content.decode()
    cnt = 0
    while True:
        soup = BeautifulSoup(html_content, 'html.parser')
        a_tags = soup.select(types[type_name]['selector']['a'])
        for a in a_tags:
            re_res = re.search(r'(\d*).htm', a.attrs['href'])
            news_id = int(re_res.group(1))
            # Skip duplicates; the same item may also repeat across pages.
            if News.query.filter(News.type_ == types[type_name]['id'],
                                 News.id_ == news_id).all():
                cnt += 1
                # Handles both the initial crawl and later incremental updates.
                if cnt >= 20:
                    return
            else:
                cnt = 0
                detail_content = session.get(
                    types[type_name]['url']['detail'].format(
                        news_id)).content.decode()
                detail_soup = BeautifulSoup(detail_content, 'html.parser')
                title = a.attrs['title'].strip()
                # Publish time
                time_text = detail_soup.find(
                    style="line-height:400%;color:#444444;font-size:14px")
                if not time_text:
                    continue
                re_res = re.search('时间:(.*)作者', time_text.text)
                publish_time = datetime.strptime(
                    re_res.group(1).strip(), '%Y年%m月%d日 %H:%M')
                # Convert the HTML body to Markdown
                news_html = detail_soup.find(
                    "div", {"id": re.compile(r"vsb_content")})
                news_html = str(news_html).replace(
                    "/__local", "http://www.nuc.edu.cn/__local")
                content = text_maker.handle(news_html)
                news = News(type_=types[type_name]['id'],
                            id_=news_id,
                            title=title,
                            publish_time=publish_time,
                            content=content)
                db.session.add(news)
                db.session.commit()
                logging.info('添加:{}'.format(title))
        next_page_a = soup.select_one(types[type_name]['selector']['next'])
        if not next_page_a:
            break
        re_res = re.search(r'(\d*).htm', next_page_a.attrs["href"])
        html_content = session.get(types[type_name]['url']['next'].format(
            re_res.group(1))).content.decode()

def get_news_list_from_db(token):
    curs = Cursor(urlsafe=token)
    newss, curs, _ = News.query().order(-News.date).fetch_page(10, start_cursor=curs)
    if newss:
        newss_list = {}
        if curs:
            newss_list['pagetoken'] = curs.urlsafe()
        newss_list['news'] = []
        for news in newss:
            newss_list['news'].append(news.maximize())
        return newss_list
    return None

def get_all_news(self) -> List[News]:
    list_news = []
    with MysqlDBCursor(self._config) as connection:
        cursor = connection.cursor()
        _sql = "SELECT * FROM news_table"
        cursor.execute(_sql)
        for news_tuple in cursor.fetchall():
            list_news.append(
                News(news_id=int(news_tuple[0]),
                     news_date=news_tuple[1],
                     news_text=news_tuple[2]))
    return list_news

def addNews():
    title = request.form["newsTitle"]
    text = request.form["newsText"]
    news = News(user_id=session["user_id"], title=title, text=text)
    try:
        db.session.add(news)
        db.session.commit()
        flash("News successfully added")
        return redirect("/admin")
    except IntegrityError as e:
        # A news item with this title already exists in the DB.
        # Alternative: return errorhandler(e) to surface the server error instead.
        return apology("News already exists", 400)

def news_create():
    try:
        title = request.form['title']
        alias_title = request.form['alias_title']
        content = request.form['content']
        cover_url = request.form['cover_url']
        category_id = request.form['category_id']
        auther_id = request.form['auther_id']
    except KeyError:
        return error(400, u'参数错误')
    news = News.create(**locals())
    if not news:
        return error(100021, 'create news failed')
    return news

def news_latest():
    data = {}
    start = request.args.get('start', 0)
    limit = request.args.get('limit', PAGE_LIMIT)
    template = request.args.get('template', False)
    rs = News.get_all('create_time desc', int(start), int(limit))
    data['count'] = len(rs)
    if template:
        data['template'] = render_template('component/news_loop.html', data=rs)
    else:
        data['newslist'] = rs
    return data

def edit(self):
    inputParams = self.getInput()
    newsID = int(inputParams['id'])
    news = News.get(News.id == newsID)
    self.privData['NEWS'] = news
    imagesList = Images().select()
    if not imagesList.count():
        return self.error(msg='请创建至少一个图片!', url=self.makeUrl('/admin/images/list'))
    self.privData['IMAGES_LIST'] = imagesList
    self.privData['CURRENT_IMG'] = news.thumbnail
    self.privData['SUBMIT_NAME'] = "thumbnail"
    return self.display('newsEdit')

def sync_news():
    feed = feedparser.parse("http://feeds.feedburner.com/TheMagPiNews")
    old_newss = feed['items']
    if not old_newss:
        return flask.jsonify({'error': 'empty news data from MagPi'}), 500
    for old_news in old_newss:
        news = News(key=News.generate_key(old_news['title']))
        news.fill_from_old(old_news)
        news.put()
    return flask.jsonify({'status': 'news sync done'}), 200

def news_create():
    if request.method == 'POST':
        data = dict(
            (key, request.form.getlist(key)[0]) for key in request.form.keys())
        data['timestamp_publish'] = datetime.now()
        data['lang'] = 'en'
        news = news_provider.create(data)
        if hasattr(news, 'uuid'):
            return redirect('/news/' + str(news.uuid))
        else:
            return render_template('news.html',
                                   news=News({
                                       'news_type': 'PressRelease',
                                       'news_category': 'Press Release'
                                   }),
                                   updated=False)

def modify(self):
    inputParams = self.getInput()
    try:
        news_id = int(inputParams['id'])
        news = News().get(News.id == news_id)
        news.name = inputParams['name']
        news.content = inputParams['content']
        news.thumbnail = inputParams['thumbnail']
        news.save()
    except Exception as e:
        return self.error(msg='修改失败: %s' % e, url=self.makeUrl('/admin/news/list'))

def news_update():
    keys = []
    values = []
    for k, v in request.form.items():
        if k != 'id':
            keys.append(k)
            values.append(v)
    print(keys, values)
    try:
        id = request.form['id']
        if len(keys) == 0:
            raise KeyError
    except KeyError:
        return error(400, u'参数错误')
    # clause.decode('gb2312').encode('utf-8')
    news = News.update(id, keys, values)
    return news if news else error(10022, 'update news failed')

async def scrape_dlwms(self):
    GUILD_NAME = os.environ.get("GUILD_NAME")
    news = [i async for i in self.dlwms_scraper.parse_data()]
    for new in news:
        try:
            notification = News(
                hashedUrl=hashlib.md5(new["url"].encode('utf-8')).hexdigest(),
                dateTime=datetime.strptime(new["date"], dtc.EU_LONG_FORMAT),
                source="dlwms"
            )
            entity = self.session.query(News) \
                .filter(News.HashedUrl == notification.HashedUrl) \
                .one_or_none()
            if entity is None:
                self.session.add(notification)
                self.session.commit()
            elif entity.DateTime != notification.DateTime:
                entity.DateTime = notification.DateTime
                self.session.commit()
            else:
                break
            embed = discord.Embed(title=new['title'],
                                  url=new['url'],
                                  colour=discord.Colour.blue().value)
            embed.set_author(name=GUILD_NAME,
                             url=self.bot.user.avatar_url,
                             icon_url=self.bot.user.avatar_url)
            embed.add_field(name="Obavijest", value=new['content'], inline=False)
            embed.set_footer(text=f"Datum i vrijeme: {new['date']} • Autor: {new['author']}")
            try:
                channelName = self.subjects[new["subject"]]
            except Exception:
                channelName = "obavijesti"
            channel = discord.utils.get(self.bot.get_all_channels(),
                                        guild__name=GUILD_NAME,
                                        name=channelName)
            if channel is not None:
                await channel.send(embed=embed)
        except SQLAlchemyError as err:
            print("Error: ", err)
            self.session.rollback()
        finally:
            continue

def news_from_url(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    str_date = str(article.publish_date)
    date = str_date.split(' ')
    if len(date) < 1:
        date.append('')
    test = {
        'source_url': article.source_url,
        'date': date[0],
        'title': article.title,
        'authors': article.authors,
        'images': article.images,
        'text': article.text,
        'fake or real': 'N'
    }
    data = Data().get_data()
    return News(test, data).get_features_list()

def populate_news_day(self, news_date):
    is_new = news_date == 'new'
    self.is_new.data = is_new
    if is_new:
        news_day = NewsDay(date=datetime.date.today())
        self.save.label.text = 'Publish'
    else:
        news_day = News().get_news_day(news_date)
        self.save.label.text = 'Save'
    self.date.data = parse_date(news_day.date)
    self.message.data = news_day.message
    for i in range(5):
        if i < len(news_day.items):
            item = news_day.items[i]
        else:
            item = NewsItem()
        item_form = NewsItemForm()
        item_form.text = item.text
        item_form.link = item.link
        item_form.title = item.title
        self.items.append_entry(item_form)

def _to_db_format(self, raw_news, category, subject):
    external_source = self._extract_external_source(raw_news)
    if not external_source:
        logger.info('News without external source %s' % raw_news)
        return None
    try:
        news_from_external_resource = self.fetch_news_from_external_source(raw_news)
    except Exception:
        logger.error('News with wrong format on external source %s' % raw_news)
        news_from_external_resource = {}
    if 'content' not in news_from_external_resource:
        logger.info('News without content: %s' % raw_news)
        return
    news_model_args = {
        'category': category,
        'subject': subject,
        'title': raw_news.get('title'),
        'content': news_from_external_resource['content'],
        'created_at': datetime.datetime.strptime(raw_news.get('publishedAt'),
                                                 '%Y-%m-%dT%H:%M:%SZ'),
        'internal_source': self.SOURCE,
        'internal_source_id': self._get_news_id(raw_news),
        'language': 'en',
        'external_source': external_source
    }
    return News(**news_model_args)

def _news_exists(self, raw_news):
    if News.objects(internal_source=self.SOURCE,
                    internal_source_id=self._get_news_id(raw_news)):
        return True
    return False

def news(text):
    return News(text=text)

from models import db_session
from models.news import News
from models.news import News2
from models.news import News3
from datetime import date

db_session.global_init('sqlite.db')

post = News()
post.fam = 'Фамилия'
post.name = 'Имя'
post.date = date.fromisoformat('2020-01-01')

session = db_session.create_session()
session.add(post)
session.commit()

def news(request):
    return News(**request.param)

"urlToImage": "https://cdn.mos.cms.futurecdn.net/8kmncYUPhagswJQGRcZdPJ-1200-80.jpg", "publishedAt": "2020-11-13T15:10:48Z", "content": "Eight games of the Premier League season have passed and teams have started showing their form but which sides have had a truly easy start, and who can expect the next eight games to be much trickier… [+2807 chars]" }] } news_articles_array = news_articles['articles'] news_object_list = [] for article in tqdm(news_articles_array): article_object = News(title=article['title'], image=article['urlToImage'], url=article['url']) news_object_list.append(article_object) league_list = requests.get( 'https://www.thesportsdb.com/api/v1/json/1/all_leagues.php').json() league_object_list = [] countries = {} existing_team_ids = set() print("starting ...") for league in tqdm(league_list['leagues']): if league['strSport'] == 'Soccer': league_details = requests.get( f'https://www.thesportsdb.com/api/v1/json/1/lookupleague.php?id={league["idLeague"]}' ).json()
def __crawl(self, current_url):
    url = urlparse(current_url)
    logging.info("Crawling #{}: {}".format(self.__num_crawled, url.geturl()))
    self.__num_crawled += 1

    # Ignore resources listed in not_parseable_resources;
    # this avoids downloading files such as PDFs, etc.
    content = Crawler.__get_response(current_url)
    if content is None:
        return None

    News(domain=self.__domain,
         url=current_url,
         content=Crawler.__convert_html_to_text(content)).save()

    # Found links
    links = self.__LINK_REGEX.findall(content)
    for link in links:
        link = link.decode("utf-8", errors="ignore")
        link = self.__clean_link(link)
        logging.debug("Found : {0}".format(link))

        if link.startswith('/'):
            link = url.scheme + '://' + url[1] + link
        elif link.startswith('#'):
            link = url.scheme + '://' + url[1] + url[2] + link
        elif link.startswith(("mailto", "tel")):
            continue
        elif not link.startswith(('http', "https")):
            link = url.scheme + '://' + url[1] + '/' + link

        if "#" in link:
            link = link[:link.index('#')]

        # Parse the url to get the domain and file extension
        parsed_link = urlparse(link)
        domain_link = parsed_link.netloc

        if link in self.__crawled_or_crawling:
            continue
        elif link in self.__urls_to_crawl:
            continue
        elif domain_link != self.__target_domain:
            continue
        elif parsed_link.path in ["", "/"]:
            continue
        elif "javascript" in link:
            continue
        elif self.__is_image(parsed_link.path):
            continue
        elif parsed_link.path.startswith("data:"):
            continue

        self.__urls_to_crawl.add(link)