Esempio n. 1
0
def parse_news(news_page):
    """Build a ``News`` record (as a plain dict) from a fetched article page.

    Args:
        news_page: Response-like object whose ``content`` holds the page HTML.

    Returns:
        The ``__dict__`` of the ``News`` object built from the
        ``<div class="noticia">`` container of the page.
    """
    news_raw = BeautifulSoup(news_page.content, features="lxml").find("div", "noticia")

    # Search the document once per tag instead of once per field.
    em_tags = news_raw.find_all("em")
    h2_tag = news_raw.find("h2")

    title = news_raw.find("h1", "noticia").text
    subtitle = h2_tag.text if h2_tag else ""

    if len(em_tags) == 1:
        # Single <em>: its text minus the first 4 characters is the third field.
        em_text = em_tags[0].text[4:]
        extra_args = ()
    else:
        # Several <em> tags: the second one feeds the third field and the first
        # one (minus its first 14 characters) becomes an additional sixth field.
        # NOTE(review): a page with no <em> at all raises IndexError here, as
        # it did before -- confirm whether that case can occur.
        em_text = em_tags[1].text[4:]
        extra_args = (em_tags[0].text[14:],)

    news = News(
        title,
        subtitle,
        em_text,
        search_content(str(news_raw)),
        news_raw.find("strong").text,
        *extra_args
    ).__dict__

    return news
Esempio n. 2
0
def get_news(id):
    """Look up a news entry by numeric id or by alias.

    Args:
        id: Identifier string; all-digit values are treated as a primary key,
            anything else as an alias.

    Returns:
        The matching news object, or the result of ``error(10003, ...)`` when
        the lookup yields nothing.
    """
    found = News.get(id=id) if id.isdigit() else News.get_by_alias(alias=id)
    if found:
        return found
    return error(10003, 'news id not found')
Esempio n. 3
0
 def test_make_news(self):
     """The HTML rendered for a news day equals the stored entry for that date."""
     news = News()
     self.assertIsNotNone(news)
     date = '2018/03/11'
     rendered = news.get_news_day(date).to_html()
     # The stored entries in ``news.news`` are looked up by the date's
     # position in ``news.dates``.
     expected = news.news[news.dates.index(date)]
     self.assertEqual(rendered, expected)
Esempio n. 4
0
 def get(self):
     """Render ``index.html`` with header, latest and popular news lists.

     Returns:
         The result of ``render_template('index.html', ...)``.
     """
     # NOTE: every local defined here (``self`` and ``cid`` included) reaches
     # the template through ``**locals()``, so the local names are part of the
     # template's contract -- renaming or removing one changes the context.
     cid = 1
     # Newest seven entries, then everything after them in the same ordering.
     news_header = News.get_all(order='create_time desc', start=0, limit=7)
     news_latest = News.get_all(order='create_time desc', start=7)
     # Entries ordered by read count, highest first.
     news_popular = News.get_all(order='read_count desc', start=0)
     loginform = LoginForm()
     regform = RegisterForm()
     return render_template('index.html', **locals())
Esempio n. 5
0
 def get(self, cid):
     """Render ``index.html`` with the news lists of one category.

     Args:
         cid: Category id; a falsy or unknown id aborts with 404.

     Returns:
         A redirect to ``main.home`` when ``cid == 1``, otherwise the rendered
         ``index.html`` template.
     """
     if not cid or not Category.get(cid):
         abort(404)
     # NOTE(review): this comparison only matches an integer ``cid`` -- confirm
     # the route converts the URL segment to int.
     if cid == 1:
         return redirect(url_for('main.home'))
     # NOTE: all locals (``self`` and ``cid`` included) are passed to the
     # template through ``**locals()``; their names must stay as they are.
     news_header = News.get_by_category(cid, order='create_time desc', start=0, limit=7)
     news_latest = News.get_by_category(cid, order='create_time desc', start=7)
     news_popular = News.get_by_category(cid, order='read_count desc', start=0)
     loginform = LoginForm()
     regform = RegisterForm()
     return render_template('index.html', **locals())
Esempio n. 6
0
def news_by_category_latest(cid):
    """Return the newest news of a category as ``{'count', 'newslist'}``.

    ``start`` and ``limit`` are read from the query string (defaults ``0`` and
    ``PAGE_LIMIT``) and converted with ``int``.

    Args:
        cid: Category id; ``1`` selects the headline feed, which is served
            from all news instead of a single category.
    """
    offset = int(request.args.get('start', 0))
    page_size = int(request.args.get('limit', PAGE_LIMIT))
    if cid == 1:
        # Headline content: not restricted to one category.
        rs = News.get_all('create_time desc', offset, page_size)
    else:
        rs = News.get_by_category(cid, 'create_time desc', offset, page_size)
    return {'count': len(rs), 'newslist': rs}
Esempio n. 7
0
 def save(self):
     """Create a news record from the submitted input.

     Returns:
         Nothing explicit on success; on any failure, the result of
         ``self.error(...)`` pointing back to the admin news list.
     """
     userInput = self.getInput()
     try:
         # Convert first so a malformed thumbnail id is reported before any
         # other field is read.
         thumbnail = int(userInput['thumbnail'])
         News.create(
             name=userInput['name'],
             thumbnail=thumbnail,
             content=userInput['content']
         )
     # ``except ... as`` is valid on Python 2.6+ and Python 3, unlike the
     # comma form.
     except Exception as e:
         return self.error(msg='新增失败: %s' % e, url=self.makeUrl('/admin/news/list'))
def saveNewInDatabase():
    """Create a news row and redirect according to the outcome.

    Returns:
        A redirect to ``/admin/panel`` when ``News().create(connection)`` is
        truthy; otherwise flashes an error and redirects to ``.addNew``.
    """
    form = request.form
    # The category field is split on "-" into a numeric id and a name. Neither
    # value is used below; the parsing is kept because it still rejects
    # malformed input by raising.
    category_parts = form['category'].split("-")
    id_category = int(category_parts[0])
    category_name = category_parts[1].lower()

    if not News().create(connection):
        flash("Error while executing command to database")
        return redirect(url_for('.addNew'))
    return redirect('/admin/panel')
Esempio n. 9
0
    def get(self, nid):
        """Render a single news page and bump its read counter.

        Args:
            nid: News id or alias; a falsy value or an unknown entry aborts
                with 404.

        Returns:
            The rendered ``news.html`` template.
        """
        news = None
        if not nid:
            abort(404)

        # Try the primary-key lookup first and fall back to the alias lookup.
        news = News.get(id=nid) or News.get_by_alias(alias=nid)
        if not news:
            abort(404)

        # NOTE(review): the new count is computed from the value read above, so
        # concurrent requests may lose increments -- confirm whether that matters.
        news.update(news.id, 'read_count', news.read_count+1)
        loginform = LoginForm()
        regform = RegisterForm()
        news_popular = News.get_all(order='id', start=0)

        # NOTE: all locals (``self``, ``nid``, ``news``, ...) are handed to the
        # template via ``**locals()``; their names must stay as they are.
        return render_template('news.html', **locals())
Esempio n. 10
0
 def save_news_day(self, orig_date):
     """Publish or save the news day described by this form.

     Args:
         orig_date: Date string of the entry being edited; ``-`` separators
             are replaced by ``/`` before it is passed on.

     Returns:
         ``'published'`` when ``self.is_new.data == 'True'``, else ``'saved'``.
     """
     date = self.date.data
     normalized_orig = orig_date.replace('-', '/')
     message = self.message.data
     # Keep only the item sub-forms that carry some text.
     entries = [
         (entry.text.data, entry.link.data, entry.title.data)
         for entry in self.items
         if len(entry.text.data) > 0
     ]
     news_day = NewsDay(date, message, entries)
     if self.is_new.data == 'True':
         News().publish_news_day(news_day)
         return 'published'
     News().save_news_day(news_day, orig_date=normalized_orig)
     return 'saved'
Esempio n. 11
0
 def about_us(self):
     """Render the ``about-us`` page with all news, newest id first.

     Returns:
         The result of ``self.display('about-us')``, or of ``self.error(...)``
         when anything in the lookup or rendering raises.
     """
     try:
         newsList = News.select().order_by(News.id.desc())
         self.privData['NEWS_LIST'] = newsList
         return self.display('about-us')
     # The exception object was never used; this form is valid on both
     # Python 2 and Python 3.
     except Exception:
         return self.error(msg='获取企业资质相关列表失败!')
def get_news():
    """Return a JSON list of news selected by the posted skip/step values.

    Reads ``skip_news`` and ``step_news`` from the submitted form and passes
    them to ``News.select_news``; each result is serialised through its own
    ``toJSON`` method.
    """
    form = request.form
    skip, step = form['skip_news'], form['step_news']
    selected = News.select_news(connection, skip, step)
    return jsonify({'news': [json.loads(entry.toJSON()) for entry in selected]})
Esempio n. 13
0
def news():
    """Render the news page for a known user, the index page otherwise."""
    if current_user() is None:
        return render_template('index.html')
    return render_template('news.html', all_news=News.all_news())
Esempio n. 14
0
def read_from_news_api(query, from_date, to_date, query_in_title):
    """Fetch English articles from the news API and wrap them in ``News``.

    Args:
        query: Search expression, used only when ``query_in_title`` is falsy.
        from_date: Lower date bound, passed as ``from_param``.
        to_date: Upper date bound, passed as ``to``.
        query_in_title: When truthy, search by ``qintitle`` instead of ``q``.

    Returns:
        A list with one ``News`` object per entry of ``response['articles']``.
    """
    # Only the search keyword differs between the two request variants.
    search = {'qintitle': query_in_title} if query_in_title else {'q': query}
    response = newsapi.get_everything(from_param=from_date,
                                      to=to_date,
                                      language='en',
                                      sort_by='relevancy',
                                      **search)

    return [
        News(id=article['url'],
             title=article['title'],
             url=article['url'],
             img_url=article['urlToImage'],
             description=article['description'],
             publishedAt=article['publishedAt'],
             source=article['source']['name'])
        for article in response['articles']
    ]
Esempio n. 15
0
    async def scrape_fitba(self):
        """Store scraped "fitba" news and post their URLs to a Discord channel.

        Each scraped entry is matched against the ``News`` table by the MD5
        hex digest of its URL. Missing entries are inserted; entries whose
        stored ``DateTime`` differs from ``notification.DateTime[0]`` are
        updated. In both cases the URL is sent to the channel. The loop stops
        at the first entry that takes neither branch.
        """
        GUILD_NAME = os.environ.get("GUILD_NAME")
        channel = discord.utils.get(self.bot.get_all_channels(), guild__name=GUILD_NAME, name=channels.OBAVIJESTI)

        # Drain the async generator before processing.
        news = [i async for i in self.fitba_scraper.parse_data()]
        for new in news:
            # Without a target channel nothing is stored or sent for any entry.
            if channel is not None:
                try:
                    notification = News(
                        hashedUrl = hashlib.md5(new["url"].encode('utf-8')).hexdigest(),
                        dateTime = datetime.strptime(new["date"], dtc.EU_SHORT_FORMAT),
                        source = "fitba"
                    )

                    entity = self.session.query(News) \
                        .filter(News.HashedUrl == notification.HashedUrl) \
                        .one_or_none()

                    if entity is None:
                        self.session.add(notification)
                        self.session.commit()
                    # NOTE(review): ``DateTime`` is indexed with [0] in the
                    # comparison but assigned whole below -- confirm what type
                    # the model attribute holds.
                    elif entity.DateTime != notification.DateTime[0]:
                        entity.DateTime = notification.DateTime
                        self.session.commit()
                    else:
                        # Already stored with a matching date: stop the loop.
                        break

                    await channel.send(new["url"])

                except SQLAlchemyError as err:
                    # Report the database error, undo the pending transaction
                    # and move on to the next entry.
                    print("Error: ", err)
                    self.session.rollback()
def addNew():
    """Render the admin "add news" form with the selectable categories.

    The earlier debug ``print`` of the submitted form data is gone: it wrote
    user input to stdout and its value was never used.
    """
    return render_template("module_admin/add_new.html",
                           logged_in=True,
                           categories=News.select_news_categories(connection))
Esempio n. 17
0
 def news_create():
     """Create a news entry from the posted title and content.

     Returns:
         A redirect to ``/login`` for unauthorized callers, otherwise a
         redirect to ``/news`` after the entry was handed to the repository.
     """
     if auth.is_authorized():
         form = request.form
         entry = News(None, form['title'], form['content'],
                      auth.get_user().id)
         news_repository.create(entry)
         return redirect('/news')
     return redirect('/login')
 def parse(self, response):
     """Register the news item of this page and schedule its comment pages.

     Args:
         response: Scrapy response of the news page.

     Yields:
         One ``scrapy.Request`` per comment page (at most ``self.max_pages``),
         each carrying the shared ``CommentsItem`` in ``meta['item']``.
     """
     try:
         news_name = response.xpath(
             "//div[@id='wrapper']/h1[1]/span[1]/text()").extract_first(
             ).strip()
         news_id = Helper.md5(news_name)
         self.__add_news(
             News(news_id=news_id, news_name=news_name, source=self.url))
         item = CommentsItem()
         item['news_id'] = news_id
         comments_label = response.xpath(
             "//div[@id='content']/div/div[@class='article']/div[@class='related_info']/div[@class='mod-hd']/h2[1]/span[@class='pl']/a/text()"
         ).extract_first().strip()
         # The first run of digits in the label is the comment total.
         total_comments = int(re.findall(r'\d+', comments_label)[0])
         # Ceiling division: a partially filled last page still counts.
         pages, remainder = divmod(total_comments, self.comments_per_page)
         if remainder:
             pages += 1
         # Crawl all comment pages, but never more than max_pages.
         pages = min(int(pages), self.max_pages)
         for page_number in range(1, pages + 1):
             yield scrapy.Request(f'{self.comments_url}?p={page_number}',
                                  meta={'item': item},
                                  callback=self.__parse_comments)
     except Exception as ex:
         # Use %s placeholders: an extra positional argument without one makes
         # the logging module fail while formatting the record.
         self.logger.error(
             "Exception occurred when parsing page %s: %s", self.url, ex)
Esempio n. 19
0
    def _to_db_format(self, raw_news, category, subject):
        """Convert a raw news mapping from this source into a ``News`` model.

        Args:
            raw_news: Mapping with ``title``, ``text``, ``discoverDate`` and
                ``id`` entries.
            category: Stored as the model's ``category``.
            subject: Stored as the model's ``subject``.

        Returns:
            A ``News`` instance; ``external_source`` is set only when one could
            be extracted, otherwise the omission is logged as an error.
        """
        created_at = datetime.datetime.strptime(raw_news.get('discoverDate'),
                                                '%Y-%m-%dT%H:%M:%S.%f+0000')
        news_model_args = dict(
            category=category,
            subject=subject,
            title=raw_news.get('title'),
            content=raw_news.get('text'),
            created_at=created_at,
            internal_source=self.SOURCE,
            internal_source_id=raw_news.get('id'),
            language='en',
        )

        external_source = self._extract_external_source(raw_news)
        if not external_source:
            logger.error('News without external resource %s ' % raw_news)
        else:
            news_model_args['external_source'] = external_source

        return News(**news_model_args)
Esempio n. 20
0
 def news_list(self):
     """Render the ``news-list`` page with all news, newest id first.

     Returns:
         The result of ``self.display('news-list')``, or of ``self.error(...)``
         when anything in the lookup or rendering raises.
     """
     try:
         newsList = News.select().order_by(News.id.desc())
         self.privData['NEWS_LIST'] = newsList
         return self.display('news-list')
     # The exception object was never used; this form is valid on both
     # Python 2 and Python 3.
     except Exception:
         return self.error(msg='获取行业动态列表失败!')
Esempio n. 21
0
    def delete(self):
        """Delete the news row whose id is given in the request input.

        Returns:
            Nothing explicit on success; on any failure, the result of
            ``self.error(...)`` pointing back to the admin news list.
        """
        inputParams = self.getInput()

        try:
            news = News.get(News.id == int(inputParams['id']))
            news.delete_instance()
        # ``except ... as`` is valid on Python 2.6+ and Python 3, unlike the
        # comma form.
        except Exception as e:
            return self.error(msg='删除失败: %s' % e, url=self.makeUrl('/admin/news/list'))
Esempio n. 22
0
def news_popular():
    """Return the most-read news as ``{'count', 'newslist'}``.

    ``start`` and ``limit`` come from the query string (defaults ``0`` and
    ``PAGE_LIMIT``) and are converted with ``int``.
    """
    offset = int(request.args.get('start', 0))
    page_size = int(request.args.get('limit', PAGE_LIMIT))
    rs = News.get_all('read_count desc', offset, page_size)
    return {'count': len(rs), 'newslist': rs}
Esempio n. 23
0
 def post(self):
     """Create a news entry from the parsed request arguments.

     Aborts with 401 for unauthorized callers.

     Returns:
         ``jsonify`` applied to the newly created ``News`` object.
     """
     if not self._auth.is_authorized():
         abort(401)
     payload = news_parser.parse_args()
     title, content = payload['title'], payload['content']
     created = News(None, title, content, self._auth.get_user().id)
     self._repository.create(created)
     return jsonify(created)
Esempio n. 24
0
def news_by_category_popular(cid):
    """Return the most-read news of a category as ``{'count', 'newslist'}``.

    Args:
        cid: Category id passed to ``News.get_by_category``.

    ``start`` and ``limit`` come from the query string (defaults ``0`` and
    ``PAGE_LIMIT``) and are converted with ``int``.
    """
    offset = int(request.args.get('start', 0))
    page_size = int(request.args.get('limit', PAGE_LIMIT))
    rs = News.get_by_category(cid, 'read_count desc', offset, page_size)
    return {'count': len(rs), 'newslist': rs}
Esempio n. 25
0
def user_click_interact():
    """Record a user/news interaction and report the outcome as JSON.

    ``id_news`` and ``username_user`` are taken from the query string; each
    falls back to the literal ``'default_if_none'`` when absent.

    Returns:
        ``{"OK": <result of News.add_interact_with_user>}`` as a JSON response.
    """
    args = request.args
    id_news = args.get('id_news', 'default_if_none')
    username_user = args.get('username_user', 'default_if_none')
    outcome = News.add_interact_with_user(connection, id_news, username_user)
    return jsonify({'OK': outcome})
Esempio n. 26
0
def news_retrieve():
    """Return one news entry by id, or a page of the newest entries.

    With a truthy ``id`` query argument the matching entry (or a 404 error
    result) is returned. Otherwise ``start``/``limit`` select a page, with
    ``limit`` capped at ``PAGE_MAX``.

    Returns:
        A news object, an ``error(...)`` result, or a dict with the keys
        ``start``, ``data``, ``count`` and ``total``.
    """
    news_id = request.args.get('id', 0)
    if news_id:
        found = News.get(id=news_id)
        return found if found else error(404, 'news not exist')

    start = request.args.get('start', 0)
    limit = min(int(request.args.get('limit', PAGE_LIMIT)), PAGE_MAX)
    rows = News.get_all('create_time desc', int(start), int(limit))
    # ``start`` is echoed back exactly as received (not converted to int).
    return {
        'start': start,
        'data': rows,
        'count': len(rows),
        'total': News.get_total(),
    }
Esempio n. 27
0
 def news_details(self):
     """Render the ``news-details`` page for the requested news id.

     The content is HTML-unescaped and ``createTime`` is replaced by its
     ``YYYY-MM-DD`` string before the object is handed to the template data.

     Returns:
         The result of ``self.display('news-details')``, or of
         ``self.error(...)`` when anything raises.
     """
     inputParams = self.getInput()
     try:
         newsDetails = News.get(News.id == int(inputParams['id']))
         newsDetails.content = self.htmlunescape(newsDetails.content)
         newsDetails.createTime = newsDetails.createTime.strftime('%Y-%m-%d')
         self.privData['NEWS_DETAILS'] = newsDetails
         return self.display('news-details')
     # The exception object was never used; this form is valid on both
     # Python 2 and Python 3.
     except Exception:
         return self.error(msg='获取行业动态详情失败!')
Esempio n. 28
0
File: news.py Progetto: jiaolj/yunht
def update(req):
    """Update an existing ``News`` row or create a new one from request data.

    Parameters are read from ``req.GET`` or, when that is empty, from
    ``req.POST``. A truthy ``id`` updates the row with that primary key;
    otherwise a new row is saved.

    Returns:
        ``to_json`` applied to ``{'status': 'ok', 'id': <row id>}``.
    """
    q = req.GET or req.POST
    pk = q.get('id')
    # Missing parameters become None, exactly as ``q.get`` returns them.
    fields = {
        key: q.get(key)
        for key in ('name', 'pdate', 'abs', 'rank', 'text', 'pic')
    }
    back = {'status': 'ok'}
    if pk:
        News.objects.filter(pk=pk).update(**fields)
        back['id'] = pk
    else:
        record = News(**fields)
        record.save()
        back['id'] = record.id
    return to_json(back)
Esempio n. 29
0
def news_delete():
    """Delete the news entries whose comma-separated ids were posted as ``id``.

    The earlier Python-2 ``print`` statement was debug output only and has
    been removed.

    Returns:
        ``'delete ok'`` on success, or an ``error(...)`` result when the ``id``
        field is missing (400) or ``News.delete`` reports failure (10020).
    """
    try:
        ids = request.form['id'].split(',')
    except KeyError:
        return error(400, u'参数错误')
    if not News.delete(ids):
        return error(10020, 'delete failed')
    return 'delete ok'
Esempio n. 30
0
 def publish_minutes(self, member_id):
     """Save the uploaded minutes and publish a news item linking to them.

     Args:
         member_id: Id of the member whose player name appears in the item.

     Returns:
         Always ``True``.
     """
     minutes = Minutes(self.meeting_type.data, self.meeting_date.data)
     # NOTE(review): ``self`` is passed explicitly on top of the attribute
     # access -- confirm ``save_file``'s signature expects that.
     link = self.save_file(self, minutes)
     minutes_type = minutes.full_type()
     author_name = get_member(member_id).player.full_name()
     text = 'Latest {} from {}'.format(minutes_type, author_name)
     news_day = NewsDay(message=self.message.data,
                        items=[(text, link, 'Show minutes')])
     News().publish_news_day(news_day)
     return True
Esempio n. 31
0
def index():
    """Render the paginated admin news list, optionally filtered.

    Query-string arguments whose name appears in the filter table are
    converted with the associated type and used as query criteria, e.g.
    ``/admin/news/?page=1&title__icontains=apple``.
    """
    data = request.args.to_dict()

    # Supported filters and the converter applied to each raw value.
    engine_filter = {'title__icontains': str}
    criteria = {
        key: engine_filter[key](value)
        for key, value in data.items()
        if key in engine_filter
    }

    per_page = 10
    pagination = Paginate('admin.news.index',
                          count=len(News.objects(**criteria)),
                          per_page=per_page)
    page = pagination.get_page()
    first = (page - 1) * per_page
    newss = News.objects(**criteria)[first:first + per_page]
    return render.template('admin/news/index.html', newss=newss, pagination=pagination)
Esempio n. 32
0
    def list(self):
        """Render one page of the admin news list, newest id first.

        The page number is read from the ``page`` input (default 1) and the
        page size from ``config.COUNT_PER_PAGE``.

        Returns:
            The result of ``self.display('newsList')``.
        """
        inputParams = self.getInput()
        # ``in`` replaces the Python-2-only ``has_key``.
        page = int(inputParams['page']) if 'page' in inputParams else 1
        count = config.COUNT_PER_PAGE

        newsList = News.select().order_by(News.id.desc())
        pageString = self.getPageStr(self.makeUrl('/admin/news/list'), page, count, newsList.count())
        # peewee's ``paginate`` takes (page_number, items_per_page). Passing a
        # row offset and an end index returned the wrong rows from page 2 on.
        # Non-positive pages fall back to the first page, as the old offset
        # computation did.
        self.privData['NEWS_LIST'] = newsList.paginate(max(page, 1), count)
        self.privData['PAGE_STRING'] = pageString
        return self.display('newsList')
 def get_last_news(self):
     """Return the newest newsfeed action as a ``News`` object.

     The query selects the row with the highest ``id`` and additionally
     requires its ``entityType`` to be ``user-status``; the ``status`` field
     of the row's JSON ``data`` column becomes the news text.
     """
     query = """SELECT * FROM `ow_newsfeed_action` 
                  WHERE `id`= (SELECT MAX(`id`) FROM `ow_newsfeed_action`)
                  AND `entityType`="user-status"
                  """
     with self.connection.cursor() as cursor:
         cursor.execute(query)
         row = cursor.fetchone()
         # NOTE(review): ``row`` is used without a None check -- confirm the
         # newest row can never have a different entityType.
         payload = json.loads(row["data"])
     self.connection.commit()
     return News(text=payload["status"])
Esempio n. 34
0
def spider_news(type_name: str):
    """Crawl the news list of one type and store entries not yet in the DB.

    Starting at the index page, every listed article that is missing from the
    database is fetched, converted to Markdown and committed. The crawl ends
    when there is no next page or after 20 already-stored articles in a row.

    Args:
        type_name: Key into the module-level ``types`` configuration.
    """
    # Look the per-type configuration up once instead of on every use.
    type_conf = types[type_name]
    html_content = session.get(
        type_conf['url']['index'].format(type_name)).content.decode()
    cnt = 0
    while True:
        soup = BeautifulSoup(html_content, 'html.parser')
        a_tags = soup.select(type_conf['selector']['a'])
        for a in a_tags:
            # Raw string: "\d" in a normal literal is an invalid escape
            # sequence and triggers a warning on current Python versions.
            re_res = re.search(r'(\d*).htm', a.attrs['href'])
            news_id = int(re_res.group(1))
            # Duplicate check; the same entry may also appear on several pages.
            if News.query.filter(News.type_ == type_conf['id'],
                                 News.id_ == news_id).all():
                cnt += 1
                # Works for both the first full crawl and later incremental
                # updates: stop after 20 known entries in a row.
                if cnt >= 20:
                    return
            else:
                cnt = 0
                detail_content = session.get(
                    type_conf['url']['detail'].format(
                        news_id)).content.decode()
                detail_soup = BeautifulSoup(detail_content, 'html.parser')

                title = a.attrs['title'].strip()
                # Publication time, taken from the styled header element.
                time_text = detail_soup.find(
                    style="line-height:400%;color:#444444;font-size:14px")
                if not time_text:
                    continue
                re_res = re.search('时间:(.*)作者', time_text.text)
                publish_time = datetime.strptime(
                    re_res.group(1).strip(), '%Y年%m月%d日 %H:%M')
                # Convert the article HTML to Markdown, making local resource
                # links absolute first.
                news_html = detail_soup.find(
                    "div", {"id": re.compile(r"vsb_content")})
                news_html = str(news_html).replace(
                    "/__local", "http://www.nuc.edu.cn/__local")
                content = text_maker.handle(news_html)
                news = News(type_=type_conf['id'],
                            id_=news_id,
                            title=title,
                            publish_time=publish_time,
                            content=content)
                db.session.add(news)
                db.session.commit()
                logging.info('添加:{}'.format(title))
        next_page_a = soup.select_one(type_conf['selector']['next'])
        if not next_page_a:
            break
        re_res = re.search(r'(\d*).htm', next_page_a.attrs["href"])
        html_content = session.get(type_conf['url']['next'].format(
            re_res.group(1))).content.decode()
Esempio n. 35
0
def get_news_list_from_db(token):
    """Fetch one page (10 entries) of news, newest first, from the datastore.

    Args:
        token: URL-safe cursor string marking where the page starts.

    Returns:
        ``None`` when the page is empty; otherwise a dict with a ``news`` list
        (each entry's ``maximize()`` result) and, if a follow-up cursor
        exists, its URL-safe form under ``pagetoken``.
    """
    start_cursor = Cursor(urlsafe=token)
    page, next_cursor, _ = News.query().order(-News.date).fetch_page(
        10, start_cursor=start_cursor)
    if not page:
        return None

    result = {}
    if next_cursor:
        result['pagetoken'] = next_cursor.urlsafe()
    result['news'] = [entry.maximize() for entry in page]
    return result
Esempio n. 36
0
 def get_all_news(self) -> List[News]:
     """Load every row of ``news_table`` as a ``News`` object.

     Returns:
         A list built from the first three columns of each row: id (converted
         to ``int``), date and text.
     """
     list_news = []
     with MysqlDBCursor(self._config) as connection:
         cursor = connection.cursor()
         cursor.execute("SELECT * FROM news_table")
         list_news.extend(
             News(news_id=int(row[0]), news_date=row[1], news_text=row[2])
             for row in cursor.fetchall())
     return list_news
Esempio n. 37
0
def addNews():
    """Store a news entry submitted through the admin form.

    Returns:
        A redirect to ``/admin`` after a successful commit, or a 400 apology
        when the database rejects the row with an ``IntegrityError``.
    """
    title = request.form["newsTitle"]
    text = request.form["newsText"]
    news = News(user_id=session["user_id"], title=title, text=text)
    try:
        db.session.add(news)
        db.session.commit()
    except IntegrityError:
        # A failed commit leaves the session unusable until it is rolled back.
        db.session.rollback()
        # NOTE(review): the apology assumes the violated constraint is the
        # title's uniqueness -- confirm against the model.
        return apology("News already exist", 400)
    flash("News succsessfuly added")
    return redirect("/admin")
Esempio n. 38
0
def news_create():
    """Create a news entry from the posted form fields.

    Returns:
        The created news object, or an ``error(...)`` result when a required
        field is missing (400) or ``News.create`` returns a falsy value
        (100021).
    """
    required = ('title', 'alias_title', 'content', 'cover_url',
                'category_id', 'auther_id')
    try:
        # Pass exactly these fields on. ``**locals()`` forwarded the same six
        # names, but would also forward any local added later.
        fields = {name: request.form[name] for name in required}
    except KeyError:
        return error(400, u'参数错误')
    news = News.create(**fields)
    if not news:
        return error(100021, 'create news failed')
    return news
Esempio n. 39
0
def news_latest():
    """Return the newest news, either as raw data or as rendered HTML.

    Query arguments: ``start`` (default 0), ``limit`` (default ``PAGE_LIMIT``)
    and ``template``. With a truthy ``template`` the result carries the
    rendered ``component/news_loop.html`` under ``template``; otherwise the
    list itself under ``newslist``. ``count`` is always present.
    """
    args = request.args
    start = args.get('start', 0)
    limit = args.get('limit', PAGE_LIMIT)
    template = args.get('template', False)
    rs = News.get_all('create_time desc', int(start), int(limit))

    data = {'count': len(rs)}
    if template:
        data['template'] = render_template('component/news_loop.html', data=rs)
    else:
        data['newslist'] = rs
    return data
Esempio n. 40
0
    def edit(self):
        """Render the news edit form for the requested news id.

        Returns:
            The result of ``self.display('newsEdit')``, or of
            ``self.error(...)`` redirecting to the image list when no image
            exists yet.
        """
        params = self.getInput()
        news = News.get(News.id == int(params['id']))
        self.privData['NEWS'] = news

        images = Images().select()
        if not images.count():
            return self.error(msg = '请创建至少一个图片!', url=self.makeUrl('/admin/images/list'))

        # Template data for the thumbnail picker.
        picker_data = (
            ('IMAGES_LIST', images),
            ('CURRENT_IMG', news.thumbnail),
            ('SUBMIT_NAME', "thumbnail"),
        )
        for key, value in picker_data:
            self.privData[key] = value

        return self.display('newsEdit')
Esempio n. 41
0
def sync_news():
    """Import every item of the MagPi news feed into the datastore.

    Returns:
        A ``(json_response, status)`` pair: 500 with an error message when the
        feed has no items, 200 with a status message otherwise.
    """
    entries = feedparser.parse("http://feeds.feedburner.com/TheMagPiNews")['items']
    if not entries:
        return flask.jsonify( { 'error' : 'empty news data from MagPi' } ) , 500
    for entry in entries:
        # The storage key is derived from the item's title.
        imported = News(key=News.generate_key(entry['title']))
        imported.fill_from_old(entry)
        imported.put()
    return flask.jsonify( { 'status' : 'news sync done' } ), 200
    
Esempio n. 42
0
def news_create():
    """Show the press-release form, or create a news item on POST.

    Returns:
        For non-POST requests the rendered ``news.html`` with a blank press
        release. For POST a redirect to the created item's page when the
        provider's result has a ``uuid``, otherwise ``None``.
    """
    if request.method != 'POST':
        blank = News({
            'news_type': 'PressRelease',
            'news_category': 'Press Release'
        })
        return render_template('news.html', news=blank, updated=False)

    # Keep only the first submitted value of every form field.
    data = {key: request.form.getlist(key)[0] for key in request.form.keys()}
    data['timestamp_publish'] = datetime.now()
    data['lang'] = 'en'

    news = news_provider.create(data)
    if hasattr(news, 'uuid'):
        return redirect('/news/' + str(news.uuid))
    # NOTE(review): no response is produced when creation yields no uuid --
    # confirm how the caller is meant to handle that.
    return None
Esempio n. 43
0
 def modify(self):
     """Update name, content and thumbnail of the news row given by ``id``.

     Returns:
         Nothing explicit on success; on any failure, the result of
         ``self.error(...)`` pointing back to the admin news list.
     """
     inputParams = self.getInput()

     try:
         news_id = int(inputParams['id'])
         news = News().get(News.id == news_id)
         news.name = inputParams['name']
         news.content = inputParams['content']
         news.thumbnail = inputParams['thumbnail']
         news.save()
     # ``except ... as`` is valid on Python 2.6+ and Python 3, unlike the
     # comma form.
     except Exception as e:
         return self.error(msg='修改失败: %s' % e, url=self.makeUrl('/admin/news/list'))
Esempio n. 44
0
def news_update():
    """Update the posted fields of the news entry identified by ``id``.

    Every form field except ``id`` is forwarded to ``News.update`` as parallel
    key and value lists. The earlier Python-2 ``print`` statement was debug
    output only and has been removed.

    Returns:
        The updated news object, or an ``error(...)`` result when ``id`` or
        all other fields are missing (400) or the update fails (10022).
    """
    keys = []
    values = []
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
    for k, v in request.form.items():
        if k != 'id':
            keys.append(k)
            values.append(v)

    if 'id' not in request.form or not keys:
        return error(400, u'参数错误')
    news = News.update(request.form['id'], keys, values)
    return news if news else error(10022, 'update news failed')
Esempio n. 45
0
    async def scrape_dlwms(self):
        """Store scraped "dlwms" news and post them as embeds to Discord.

        Each scraped entry is matched against the ``News`` table by the MD5
        hex digest of its URL. Missing entries are inserted, and entries whose
        stored ``DateTime`` differs from ``notification.DateTime[0]`` are
        updated. An embed is then sent to the channel mapped to the entry's
        subject, falling back to ``"obavijesti"``.
        """
        GUILD_NAME = os.environ.get("GUILD_NAME")

        # Drain the async generator before processing.
        news = [i async for i in self.dlwms_scraper.parse_data()]
        for new in news:
            try:
                notification = News(
                    hashedUrl = hashlib.md5(new["url"].encode('utf-8')).hexdigest(),
                    dateTime = datetime.strptime(new["date"], dtc.EU_LONG_FORMAT),
                    source = "dlwms"
                )

                entity = self.session.query(News) \
                    .filter(News.HashedUrl == notification.HashedUrl) \
                    .one_or_none()

                if entity is None:
                    self.session.add(notification)
                    self.session.commit()
                # NOTE(review): ``DateTime`` is indexed with [0] in the
                # comparison but assigned whole below -- confirm what type the
                # model attribute holds.
                elif entity.DateTime != notification.DateTime[0]:
                    entity.DateTime = notification.DateTime
                    self.session.commit()
                else:
                    # NOTE(review): the ``continue`` in the ``finally`` clause
                    # below overrides this ``break``, so the loop does not
                    # stop here -- confirm which behaviour is intended.
                    break

                embed = discord.Embed(title=new['title'], url = new['url'], colour = discord.Colour.blue().value)
                embed.set_author(name = GUILD_NAME, url = self.bot.user.avatar_url, icon_url = self.bot.user.avatar_url)
                embed.add_field(name = "Obavijest", value = new['content'], inline = False)
                embed.set_footer(text = f"Datum i vrijeme: {new['date']} • Autor: {new['author']}")

                # Pick the channel mapped to the subject; any lookup failure
                # falls back to the general announcements channel name.
                try:
                    channelName = self.subjects[new["subject"]]
                except Exception:
                    channelName = "obavijesti"

                channel = discord.utils.get(self.bot.get_all_channels(), guild__name=GUILD_NAME, name=channelName)
                if channel is not None:
                    await channel.send(embed = embed)
            except SQLAlchemyError as err:
                # Report the database error and undo the pending transaction.
                print("Error: ", err)
                self.session.rollback()
            finally:
                # ``continue`` inside ``finally`` discards any exception still
                # propagating (not only SQLAlchemyError) and moves on to the
                # next entry.
                continue
Esempio n. 46
0
def news_from_url(url):
    """Download an article and return the feature list computed for it.

    Args:
        url: Address of the article to download and parse.

    Returns:
        The result of ``News(record, data).get_features_list()``, where
        ``record`` holds the article's metadata and text.
    """
    article = Article(url)
    article.download()
    article.parse()
    # Keep only the part before the first space of the stringified publish
    # date. ``str.split`` always returns at least one element, so the old
    # "empty list" fallback could never run and has been dropped.
    # NOTE(review): a missing publish date (None) yields the literal string
    # 'None' here, as before -- confirm whether '' was intended.
    date = str(article.publish_date).split(' ')[0]
    test = {
        'source_url': article.source_url,
        'date': date,
        'title': article.title,
        'authors': article.authors,
        'images': article.images,
        'text': article.text,
        'fake or real': 'N'
    }
    data = Data().get_data()
    return News(test, data).get_features_list()
Esempio n. 47
0
 def populate_news_day(self, news_date):
     """Fill this form from a new or an existing news day.

     Args:
         news_date: The literal ``'new'`` for a fresh entry dated today, or
             the date key of a stored news day.

     Exactly five item sub-forms are appended; slots beyond the stored items
     are filled from an empty ``NewsItem``.
     """
     is_new = news_date == 'new'
     self.is_new.data = is_new
     if is_new:
         news_day = NewsDay(date=datetime.date.today())
         button_text = 'Publish'
     else:
         news_day = News().get_news_day(news_date)
         button_text = 'Save'
     self.save.label.text = button_text

     self.date.data = parse_date(news_day.date)
     self.message.data = news_day.message
     for slot in range(5):
         source = news_day.items[slot] if slot < len(news_day.items) else NewsItem()
         item_form = NewsItemForm()
         item_form.text = source.text
         item_form.link = source.link
         item_form.title = source.title
         self.items.append_entry(item_form)
Esempio n. 48
0
    def _to_db_format(self, raw_news, category, subject):
        """Convert a raw news mapping into a ``News`` model, or skip it.

        Args:
            raw_news: Mapping with at least ``title`` and ``publishedAt``.
            category: Stored as the model's ``category``.
            subject: Stored as the model's ``subject``.

        Returns:
            A ``News`` instance, or ``None`` when no external source can be
            extracted or the externally fetched article has no ``content``.
        """
        external_source = self._extract_external_source(raw_news)
        if not external_source:
            logger.info('News without external source %s' % raw_news)
            return None

        # A failed fetch is logged and then handled like missing content.
        try:
            fetched = self.fetch_news_from_external_source(raw_news)
        except Exception:
            logger.error('News with wrong format on external source %s' %
                         raw_news)
            fetched = {}

        if 'content' not in fetched:
            logger.info('News without content: %s' % raw_news)
            return None

        return News(
            category=category,
            subject=subject,
            title=raw_news.get('title'),
            content=fetched['content'],
            created_at=datetime.datetime.strptime(raw_news.get('publishedAt'),
                                                  '%Y-%m-%dT%H:%M:%SZ'),
            internal_source=self.SOURCE,
            internal_source_id=self._get_news_id(raw_news),
            language='en',
            external_source=external_source,
        )
Esempio n. 49
0
 def _news_exists(self, raw_news):
     """Return ``True`` when a news entry for ``raw_news`` is already stored.

     Entries are matched on this source's name and the id derived from
     ``raw_news``.
     """
     matches = News.objects(internal_source=self.SOURCE,
                            internal_source_id=self._get_news_id(raw_news))
     return bool(matches)
Esempio n. 50
0
def news(text):
    """Return a ``News`` object constructed with the given ``text``."""
    instance = News(text=text)
    return instance
Esempio n. 51
0
from models import db_session
from models.news import News
from models.news import News2
from models.news import News3
from datetime import date

# Initialise the database layer with the SQLite file before opening a session.
db_session.global_init('sqlite.db')
session = db_session.create_session()

# Build one sample record and persist it.
post = News()
post.fam = 'Фамилия'
post.name = 'Имя'
post.date = date.fromisoformat('2020-01-01')

session.add(post)
session.commit()
Esempio n. 52
0
def news(request):
    """Return a ``News`` object built from the keyword mapping ``request.param``."""
    params = request.param
    return News(**params)
Esempio n. 53
0
            "urlToImage":
            "https://cdn.mos.cms.futurecdn.net/8kmncYUPhagswJQGRcZdPJ-1200-80.jpg",
            "publishedAt":
            "2020-11-13T15:10:48Z",
            "content":
            "Eight games of the Premier League season have passed and teams have started showing their form but which sides have had a truly easy start, and who can expect the next eight games to be much trickier… [+2807 chars]"
        }]
    }

    news_articles_array = news_articles['articles']
    news_object_list = []

    for article in tqdm(news_articles_array):

        article_object = News(title=article['title'],
                              image=article['urlToImage'],
                              url=article['url'])

        news_object_list.append(article_object)

    league_list = requests.get(
        'https://www.thesportsdb.com/api/v1/json/1/all_leagues.php').json()
    league_object_list = []
    countries = {}
    existing_team_ids = set()
    print("starting ...")
    for league in tqdm(league_list['leagues']):
        if league['strSport'] == 'Soccer':
            league_details = requests.get(
                f'https://www.thesportsdb.com/api/v1/json/1/lookupleague.php?id={league["idLeague"]}'
            ).json()
Esempio n. 54
0
    def __crawl(self, current_url):
        """Fetch one URL, store its text as ``News`` and queue its new links.

        Args:
            current_url: Absolute URL of the page to crawl.

        Returns:
            ``None`` in every case. The method returns early when no content
            could be fetched.
        """
        url = urlparse(current_url)
        logging.info("Crawling #{}: {}".format(self.__num_crawled,
                                               url.geturl()))
        self.__num_crawled += 1

        # NOTE(review): the original comment here mentioned skipping resources
        # listed in ``not_parseable_resources`` (e.g. PDF files), but no such
        # check is visible in this method -- presumably it lives in
        # ``__get_response``; confirm.
        content = Crawler.__get_response(current_url)

        if content is None:
            return None

        # Persist the page text before following its links.
        News(domain=self.__domain,
             url=current_url,
             content=Crawler.__convert_html_to_text(content)).save()

        # Extract candidate links from the raw content; each match is decoded
        # from bytes before further processing.
        links = self.__LINK_REGEX.findall(content)
        for link in links:
            link = link.decode("utf-8", errors="ignore")
            link = self.__clean_link(link)
            logging.debug("Found : {0}".format(link))

            # Turn relative links into absolute ones; ``url[1]`` is the
            # network location and ``url[2]`` the path of the current page.
            if link.startswith('/'):
                link = url.scheme + '://' + url[1] + link

            elif link.startswith('#'):
                link = url.scheme + '://' + url[1] + url[2] + link

            elif link.startswith(("mailto", "tel")):
                continue

            elif not link.startswith(('http', "https")):
                link = url.scheme + '://' + url[1] + '/' + link

            # Drop the fragment so the same page is not queued twice.
            if "#" in link:
                link = link[:link.index('#')]

            # Split the link to get its domain and path for the checks below.
            parsed_link = urlparse(link)
            domain_link = parsed_link.netloc

            # Skip links that are already known, point outside the target
            # domain, lead to the site root, or are not crawlable pages.
            if link in self.__crawled_or_crawling:
                continue

            elif link in self.__urls_to_crawl:
                continue

            elif domain_link != self.__target_domain:
                continue

            elif parsed_link.path in ["", "/"]:
                continue

            elif "javascript" in link:
                continue

            elif self.__is_image(parsed_link.path):
                continue

            elif parsed_link.path.startswith("data:"):
                continue

            self.__urls_to_crawl.add(link)