def get(self):
    """Scrape the CECyT 9 homepage, sync each news item into the
    datastore, and respond with the scraped items as JSON.

    Side effects: one HTTP fetch, one datastore put per news item,
    and a JSON body written to ``self.response``.
    """
    url = 'http://www.cecyt9.ipn.mx/Paginas/inicio.aspx'
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    newsBox = soup.findAll('div', {'class': 'ca-item-main'})
    news = {'news': []}
    for notice in newsBox:
        text_div = notice.find('div', {'class': 'ca-text'})
        if not text_div:
            # Carousel item without a text block — nothing to scrape.
            continue
        noticeTitle = text_div.find('span').text.replace('\t', '').replace('\r', '').replace('\n', '')
        noticeImage = notice.find('div', {'class': 'ca-icon'}).find('img')['src']
        # Drop a single leading space (original behavior), but guard
        # against an empty title, which would IndexError on [0].
        if noticeTitle.startswith(' '):
            noticeTitle = noticeTitle[1:]
        noticeTitle = noticeTitle.capitalize()
        noticeLink = notice.find('a', {'class': 'ca-more'})['href']
        noticeObject = {
            'title': noticeTitle.encode('UTF-8'),
            'link': noticeLink,
            'image': noticeImage,
        }
        # BUG FIX: the original combined the filters with Python `and`,
        # which evaluates to only the LAST expression, so the query
        # matched on image alone. ndb ANDs conditions passed as
        # separate positional arguments.
        query = New.query(New.title == noticeTitle,
                          New.url == noticeLink,
                          New.image == noticeImage).fetch()
        if query:
            # Already stored — just bump its freshness timestamp.
            query[0].updated = datetime.now()
            query[0].put()
        else:
            news_object = New(title=noticeTitle, url=noticeLink, image=noticeImage)
            news_object.put()
        news['news'].append(noticeObject)
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(json.dumps(news))
def find_news(self, method='xquery', search_text='', not_in=False):
    """Search the configured RSS feeds for items whose title contains
    (or, with ``not_in=True``, does not contain) ``search_text``.

    :param method: 'xquery' (default) or 'regexp' matching strategy.
    :param search_text: substring to look for in item titles.
    :param not_in: invert the match — keep items NOT containing the text.
    :return: list of ``New`` objects (regexp mode fills only ``title``).
    """
    import sys
    # HACK: Python-2-only global encoding override; reload() re-exposes
    # setdefaultencoding, which site.py normally deletes. Callers may
    # depend on this side effect — left untouched.
    reload(sys)
    sys.setdefaultencoding('utf8')
    news = []
    if method == '':
        method = 'xquery'
    for source in self.rss_urls.keys():
        r = requests.get(self.rss_urls[source])
        rss = r.content
        if method == 'xquery':
            # NOTE(review): search_text is spliced directly into the
            # XQuery string — a quote in it breaks/injects the query.
            query_s = """for $i in //item where contains(lower-case($i/title), lower-case('""" + search_text + """')) return <new>{$i/title, $i/pubDate, $i/link }</new> """
            if not_in:
                query_s = """for $i in //item where not(contains(lower-case($i/title), lower-case('""" + search_text + """'))) return <new>{$i/title, $i/pubDate, $i/link }</new> """
            news_list = sxq.execute_all(query_s, rss)
            for n_i in news_list:
                # Each result is a serialized <new> fragment; re-parse it
                # to pull out the child elements.
                i = etree.fromstring(n_i)
                title = i.xpath("./title")[0]
                pubDate = i.xpath("./pubDate")[0]
                link = i.xpath("./link")[0]
                n = New(title=title.text, link=link.text, pubdate=pubDate.text)
                news.append(n)
        elif method == 'regexp':
            # Undo HTML entity escaping so the regexes see plain markup.
            h = HTMLParser.HTMLParser()
            rss = h.unescape(rss)
            pattern = ur'<item>(.*?)</item>'
            regex = re.compile(pattern, re.DOTALL + re.UNICODE + re.IGNORECASE)
            for match in regex.finditer(rss):
                item = match.group(1)
                # search_text is interpolated raw; regex metacharacters
                # in it will change the pattern's meaning.
                pattern = ur'<title>((.*?)' + search_text + '(.*?))</title>'
                if not_in:
                    # NOTE(review): a lookbehind must be fixed-width, so
                    # this only compiles/behaves for fixed-length
                    # search_text values — confirm intended inputs.
                    pattern = ur'<title>((.(?<!' + search_text + '))*?)</title>'
                regex = re.compile(pattern, re.DOTALL + re.UNICODE + re.IGNORECASE)
                matching = regex.search(item)
                if matching:
                    title = matching.group(1)
                    n = New(title=title)
                    news.append(n)
    return news
def addnew(request):
    """Django view: on POST, store the submitted news text as a ``New``
    record attributed to the last ``Usert`` in the table, then render
    the news template.

    :param request: Django HttpRequest.
    :return: rendered "news-t.html" response (for GET and POST alike).
    """
    if request.POST:
        post = request.POST
        # BUG FIX: the original iterated over EVERY user only to keep
        # the final one, and raised NameError when the table was empty.
        # Keep the same "last user wins" semantics, but guard the
        # empty-table case instead of crashing.
        last_user = None
        for user in Usert.objects.all():
            last_user = user
        if last_user is not None:
            new_entry = New(er=last_user.Username, Information=post["news"])
            new_entry.save()
    return render_to_response("news-t.html")
def get(self):
    """Response to GET request.

    Serializes the latest stored news items to JSON, with a permissive
    CORS header so browser clients on other origins can read it.
    """
    payload = {'news': [item.to_dict() for item in New.get_latests()]}
    headers = self.response.headers
    headers.add_header("Access-Control-Allow-Origin", "*")
    headers['Content-Type'] = 'application/json'
    return self.response.write(json.dumps(payload))
def get_all_news(self):
    """Download every configured RSS feed and return all of its items
    as ``New`` objects (title, link, pubdate taken from each <item>).
    """
    collected = []
    for feed_url in self.rss_urls.values():
        response = requests.get(feed_url)
        root = etree.fromstring(response.content)
        for item in root.xpath("//item"):
            title_el = item.xpath("./title")[0]
            date_el = item.xpath("./pubDate")[0]
            link_el = item.xpath("./link")[0]
            collected.append(
                New(title=title_el.text, link=link_el.text, pubdate=date_el.text)
            )
    return collected
def insertuser(url, headline, summary, datetime, section):
    """Insert an article row into the database and return the stored row.

    (Original comment, translated: "insert the data into the database".)

    :param url: article URL; also stored in the module-global ``articleurl``.
    :param headline: article headline.
    :param summary: article summary.
    :param datetime: publication timestamp. NOTE: the name shadows the
        stdlib ``datetime`` module inside this function; kept as-is for
        interface compatibility.
    :param section: article section/category.
    :return: the persisted ``New`` row, or None if the insert failed
        (failures are logged, matching the original best-effort contract).
    """
    global articleurl
    articleurl = url
    with app.app_context():
        try:
            record = New(url=url, datetime=datetime, headline=headline,
                         summary=summary, section=section)
            db.session.add(record)
            db.session.commit()
            return New.query.filter(New.url == url).first()
        except Exception as ce:
            # BUG FIX: a failed commit leaves the SQLAlchemy session in an
            # invalid state; roll back so later operations on this session
            # don't also fail. Still logs and returns None as before.
            db.session.rollback()
            logger.error(ce)