Example #1
0
    def get_top_news(self):
        """Fetch top news for every configured edition/topic and save it.

        For each (edition, topic) pair taken from ``settings.GN_EDITIONS``
        and ``settings.GN_TOPICS`` the news endpoint at
        ``settings.GN_BASE_URL`` is queried.  Every result becomes one
        ``Event`` plus one ``Page``; each entry of the result's optional
        ``relatedStories`` list becomes an additional ``Page`` attached to
        the same event through ``parent_id``.

        The operation is all-or-nothing: if any request returns an empty
        body or a ``responseStatus`` other than 200, the method returns
        immediately and nothing collected so far is saved.

        Returns:
            None in every case.
        """
        tag = "[crawler/get_top_news]"

        base_url = settings.GN_BASE_URL
        editions = settings.GN_EDITIONS
        topics = settings.GN_TOPICS
        num_news = settings.GN_NUM_NEWS

        # 'ned' and 'topic' are overwritten on every iteration below.
        params = {
            'v': '1.0',
            'ned': '',
            'topic': '',
            'rsz': num_news
        }

        events = []
        pages = []

        for edition in editions:
            for topic in topics:
                params['ned'] = edition
                params['topic'] = topic

                url = base_url + '?' + urlencode(params)
                # Single-argument print(...) emits the same text as the
                # print statement while also parsing on newer interpreters.
                print("%s getting url %s" % (tag, url))

                # Always release the connection, even if read() fails.
                response = urlopen(url)
                try:
                    raw = response.read()
                finally:
                    response.close()

                if not raw:
                    print("%s empty response, aborting: %s" % (tag, url))
                    return None

                news = json.loads(raw)
                if news['responseStatus'] != 200:
                    print("%s bad response status %s, aborting: %s"
                          % (tag, news['responseStatus'],
                             news.get('responseDetails')))
                    return None

                for result in news['responseData']['results']:
                    data = self._news_page_data(result)

                    event = Event({
                        'title': data['title'],
                        'locale': edition,
                        'description': result['content'],
                        'date': data['date'],
                        'url': data['url'],
                    })
                    page = Page(data)
                    page.parent_id = event.id

                    events.append(event)
                    pages.append(page)

                    print("%s event: %s" % (tag, data['title']))

                    # 'relatedStories' is optional; a missing or null value
                    # simply means there is nothing more to attach.
                    for related in result.get('relatedStories') or ():
                        related_data = self._news_page_data(related)

                        related_page = Page(related_data)
                        related_page.parent_id = event.id
                        pages.append(related_page)

                        print("%s page: %s" % (tag, related_data['title']))

        self.__save_events(events, pages)

    def _news_page_data(self, story):
        """Build the ``Page`` payload dict for one primary or related story."""
        data = {
            'title': story['titleNoFormatting'],
            'date': story['publishedDate'],
            'url': story['url'],
            'type': 'news',
        }

        # Strip the query string so each story maps to one unique URL.
        # NOTE(review): quote() with its default safe characters also
        # escapes the ':' after the scheme ("http%3A//...") -- confirm that
        # stored URLs are expected in that form before changing it.
        par = urlparse.urlparse(unquote(data['url']))
        data['url'] = quote(par.scheme + '://' + par.netloc + par.path)
        return data
Example #2
0
    def get_topnews(results=8):
        """Crawl Google News top stories and save them as Event/Page records.

        Queries the news search endpoint once per (edition, topic) pair.
        For each result an ``Event`` and a ``Page`` are saved; every entry
        in the result's optional ``relatedStories`` list is saved as an
        extra ``Page`` whose ``parent_id`` is the event's id.  Requests that
        come back empty or with a ``responseStatus`` other than 200 are
        logged and skipped; the remaining pairs are still processed.

        Args:
            results: Number of results requested per query (``rsz``).

        Returns:
            None.  The number of saved pages is printed at the end.
        """
        URL = "https://ajax.googleapis.com/ajax/services/search/news?v=1.0&ned=%s&topic=%s&rsz=%d"
        editions = ("es_cl", "en_us")
        # Only the keys (topic codes) are sent in the query; the values are
        # not read anywhere in this function.
        topics = {"w": "Internacional", "h": "Titulares"}
        saved = 0

        def page_payload(story, locale):
            # Shared by primary and related stories: both are stored as
            # "news" pages identified by the md5 of their URL.
            payload = {
                "title": story["titleNoFormatting"],
                "locale": locale,
                "date": story["publishedDate"],
                "url": story["url"],
                "type": "news",
                "content": "",
            }
            payload["id"] = md5(payload["url"]).hexdigest()
            return payload

        for edition in editions:
            for topic in topics:
                url = URL % (edition, topic, results)
                # Single-argument print(...) emits the same text as the
                # print statement while also parsing on newer interpreters.
                print("%s %s" % (F, url))

                # Always release the connection, even if read() fails.
                response = urllib2.urlopen(url)
                try:
                    raw = response.read()
                finally:
                    response.close()

                if not raw:
                    # json.loads would raise on an empty body; treat it like
                    # any other failed request and move on.
                    print("%s empty response from %s" % (F, url))
                    continue

                news = json.loads(raw)
                if news["responseStatus"] != 200:
                    print("%s %s" % (F, news["responseDetails"]))
                    continue

                for result in news["responseData"]["results"]:
                    data = page_payload(result, edition)

                    event = {
                        "title": data["title"],
                        "locale": data["locale"],
                        "description": result["content"],
                        "date": data["date"],
                    }
                    # Event id combines title and URL; repr() keeps the
                    # title ASCII-safe for hashing.
                    e_id = event["id"] = md5("%s %s" % (repr(data["title"]), data["url"])).hexdigest()

                    print("%s %s" % (F, repr("Crawled news: %s" % data["title"])))
                    e = Event(event)
                    e.save()

                    n = Page(data)
                    n.parent_id = e_id
                    n.save()
                    # Count the primary story too, so the final total covers
                    # every page saved and not just related ones.
                    saved += 1

                    for related in result.get("relatedStories") or ():
                        related_data = page_payload(related, edition)

                        print("%s %s" % (F, repr("Related news: %s" % related_data["title"])))
                        n = Page(related_data)
                        n.parent_id = e_id
                        n.save()
                        saved += 1

        print("%s total news collected: %d" % (F, saved))