def get_top_news(self):
    """Crawl the configured news feed and hand the results to the saver.

    For every (edition, topic) pair taken from ``settings.GN_EDITIONS`` and
    ``settings.GN_TOPICS``, the endpoint at ``settings.GN_BASE_URL`` is
    queried for ``settings.GN_NUM_NEWS`` results.  Each result produces one
    ``Event`` and one ``Page``; every entry under the result's
    ``relatedStories`` key produces an additional ``Page`` whose
    ``parent_id`` is the same event's id.  Everything collected is passed to
    ``self.__save_events`` in a single call once all requests succeeded.

    The crawl is all-or-nothing: an empty response body, or a
    ``responseStatus`` other than 200, makes the method return immediately
    and nothing gathered so far is saved.

    Returns:
        None in every case.
    """
    from contextlib import closing

    tag = "[crawler/get_top_news]"
    base_url = settings.GN_BASE_URL
    editions = settings.GN_EDITIONS
    topics = settings.GN_TOPICS
    num_news = settings.GN_NUM_NEWS

    def page_fields(story):
        """Build the Page field dict shared by main and related stories."""
        fields = {
            'title': story['titleNoFormatting'],
            'date': story['publishedDate'],
            'url': story['url'],
            'type': 'news',
        }
        # Drop the query string so that each article maps to a single URL.
        # NOTE(review): if `quote` is urllib's quote with its default safe
        # characters, the ':' after the scheme is percent-escaped as well --
        # confirm that consumers of this URL expect that form.
        parts = urlparse.urlparse(unquote(fields['url']))
        fields['url'] = quote(parts.scheme + '://' + parts.netloc + parts.path)
        return fields

    params = {
        'v': '1.0',
        'ned': '',
        'topic': '',
        'rsz': num_news,
    }
    events = []
    pages = []

    for edition in editions:
        for topic in topics:
            params['ned'] = edition
            params['topic'] = topic
            url = base_url + '?' + urlencode(params)
            # Single-argument print(...) emits the same text whether or not
            # print_function is in effect.
            print("%s getting url %s" % (tag, url))

            # closing() releases the HTTP response even if read() raises.
            with closing(urlopen(url)) as response:
                body = response.read()

            if not body:
                return None
            news = json.loads(body)
            if news['responseStatus'] != 200:
                return None

            for result in news['responseData']['results']:
                fields = page_fields(result)
                event = Event({
                    'title': fields['title'],
                    'locale': edition,
                    'description': result['content'],
                    'date': fields['date'],
                    'url': fields['url'],
                })
                page = Page(fields)
                page.parent_id = event.id
                events.append(event)
                pages.append(page)
                print("%s event: %s" % (tag, fields['title']))

                # Related stories become extra pages under the same event.
                for related in result.get('relatedStories', ()):
                    related_fields = page_fields(related)
                    related_page = Page(related_fields)
                    related_page.parent_id = event.id
                    pages.append(related_page)
                    print("%s page: %s" % (tag, related_fields['title']))

    self.__save_events(events, pages)
def get_topnews(results=8):
    """Crawl the Google AJAX news-search endpoint and save events and pages.

    One request is issued per (edition, topic) pair for the editions
    ``es_cl``/``en_us`` and the topic codes ``w``/``h``.  For every result of
    a successful response (``responseStatus`` 200) an ``Event`` and a
    ``Page`` are created and saved immediately; every entry under the
    result's ``relatedStories`` key is saved as a further ``Page`` with the
    same ``parent_id``.  A non-200 response prints its ``responseDetails``
    and the crawl moves on to the next pair.

    Log lines are prefixed with ``F``, a name this function reads from the
    enclosing module scope.

    Args:
        results: Value substituted into the ``rsz`` query parameter of each
            request (formatted with ``%d``).

    Returns:
        None.  The number of top-level results processed is printed at the
        end.
    """
    from contextlib import closing

    URL = "https://ajax.googleapis.com/ajax/services/search/news?v=1.0&ned=%s&topic=%s&rsz=%d"
    editions = ("es_cl", "en_us")
    # Only the keys (topic codes) are sent to the API; the values are labels.
    topics = {"w": "Internacional", "h": "Titulares"}

    def page_fields(story, edition):
        """Build the Page field dict shared by main and related stories."""
        story_url = story["url"]
        return {
            "title": story["titleNoFormatting"],
            "locale": edition,
            "date": story["publishedDate"],
            "url": story_url,
            "type": "news",
            # The page id is the MD5 hex digest of its URL.
            "id": md5(story_url).hexdigest(),
            "content": "",
        }

    total = 0
    for edition in editions:
        for topic in topics:
            url = URL % (edition, topic, results)
            # Single-argument print(...) emits the same text whether or not
            # print_function is in effect.
            print("%s %s" % (F, url))

            # closing() releases the HTTP response even if read() raises.
            with closing(urllib2.urlopen(url)) as response:
                body = response.read()
            news = json.loads(body)

            if news["responseStatus"] != 200:
                print("%s %s" % (F, news["responseDetails"]))
                continue

            for result in news["responseData"]["results"]:
                fields = page_fields(result, edition)
                # The event id hashes repr(title) together with the URL.
                event_id = md5(
                    "%s %s" % (repr(fields["title"]), fields["url"])
                ).hexdigest()
                event_fields = {
                    "title": fields["title"],
                    "locale": fields["locale"],
                    "description": result["content"],
                    "date": fields["date"],
                    "id": event_id,
                }
                print("%s %s" % (F, repr("Crawled news: %s" % fields["title"])))

                event = Event(event_fields)
                event.save()
                page = Page(fields)
                page.parent_id = event_id
                page.save()

                # `in`/.get replaces the deprecated dict.has_key().
                for related in result.get("relatedStories", ()):
                    related_fields = page_fields(related, edition)
                    print("%s %s" % (F, repr("Related news: %s" % related_fields["title"])))
                    related_page = Page(related_fields)
                    related_page.parent_id = event_id
                    related_page.save()

                # NOTE(review): counts top-level results only; related
                # stories are not included in the total -- confirm intent.
                total += 1

    print("%s total news collected: %d" % (F, total))