Example #1
0
    def get_topnews(results=8):
        URL = "https://ajax.googleapis.com/ajax/services/search/news?v=1.0&ned=%s&topic=%s&rsz=%d"
        editions = ("es_cl", "en_us")
        topics = {"w": "Internacional", "h": "Titulares"}
        i = 0

        for edition in editions:
            for topic in topics:
                url = URL % (edition, topic, results)
                print F, url
                response = urllib2.urlopen(url)
                data = response.read()

                news = json.loads(data)
                if news["responseStatus"] == 200:
                    for result in news["responseData"]["results"]:
                        data = {}

                        data["title"] = result["titleNoFormatting"]
                        data["locale"] = edition
                        data["date"] = result["publishedDate"]
                        data["url"] = result["url"]
                        data["type"] = "news"
                        data["id"] = md5(data["url"]).hexdigest()
                        data["content"] = ""

                        event = {}
                        event["title"] = data["title"]
                        event["locale"] = data["locale"]
                        event["description"] = result["content"]
                        event["date"] = data["date"]
                        e_id = event["id"] = md5("%s %s" % (repr(data["title"]), data["url"])).hexdigest()

                        print F, repr("Crawled news: %s" % data["title"])
                        e = Event(event)
                        e.save()

                        n = Page(data)
                        n.parent_id = e_id
                        n.save()

                        if result.has_key("relatedStories"):
                            for related in result["relatedStories"]:
                                data = {}
                                data["title"] = related["titleNoFormatting"]
                                data["locale"] = edition
                                data["date"] = related["publishedDate"]
                                data["url"] = related["url"]
                                data["id"] = md5(data["url"]).hexdigest()
                                data["type"] = "news"
                                data["content"] = ""

                                print F, repr("Related news: %s" % data["title"])
                                n = Page(data)
                                n.parent_id = e_id
                                n.save()
                                i += 1
                else:
                    print F, news["responseDetails"]

        print F, "total news collected: %d" % i