# Content types we accept as a feed document.
_FEED_CTYPES = (
    "application/atom+xml",
    "application/rss+xml",
    "text/xml",
    "application/xml",
)

def fetch(site, bodyvec, encodings):
    """ Fetch the RSS feeds of a website.

        Tries each candidate path in PATHS until one responds with
        status 200 and a feed-like content type.  On success the raw
        body chunks are left in `bodyvec`, the declared charset is
        appended to `encodings`, and 0 is returned; on failure -1
        is returned (both lists are cleared on every attempt). """
    for path in PATHS:
        logging.info("subr_rss: try with %s for %s", path, site)
        # Reset the output parameters so a failed earlier attempt
        # cannot leak stale data into a later one.
        del bodyvec[:]
        del encodings[:]
        headers = {}
        result = subr_http.retrieve("GET", "http", site, path,
                                    bodyvec, [], headers)
        if result != 200:
            continue
        ctype = headers.get("content-type")
        if not ctype:
            logging.warning("subr_rss: no content-type")
            continue
        ctype, encoding = subr_http.parse_content_type(ctype)
        # Membership test replaces the original chain of four
        # `!=` comparisons; same accepted set.
        if ctype not in _FEED_CTYPES:
            logging.warning("subr_rss: bad content type: %s", ctype)
            continue
        encodings.append(encoding)
        return 0
    logging.error("subr_rss: can't fetch RSS for %s", site)
    return -1
def _get_final_url(link):
    """ Returns the final URL that contains the content """
    pieces = urlparse.urlsplit(link)
    resolved = []
    code = subr_http.retrieve("HEAD", "http", pieces[1], pieces[2],
                              [], resolved, {})
    if code != 200:
        logging.warning("main_rss: invalid link: %s", link)
        return
    # Exactly one resolved URL is expected from the helper.
    if len(resolved) == 1:
        return resolved[0]
    logging.warning("main_rss: internal error")
def _savepost(link, pathname): """ Save post content into pathname """ parsed = urlparse.urlsplit(link) bodyvec = [] status = subr_http.retrieve("GET", "http", parsed[1], parsed[2], bodyvec, [], {}) if status != 200: logging.warning("main_rss: cannot retrieve page: %s", link) return filep = open(pathname, "w") for chunk in bodyvec: filep.write(chunk) filep.close()
def process_student_tweet(blogs, tweet, links, handle, student):
    """ Process a tweet from the point of view of one student.

        Expands each t.co link in `links`, keeps only links that point
        inside the student's blog (not the homepage), normalizes them
        (strip query string, force http scheme) and saves them via
        save_tweet() unless running in dry mode. """
    # .get() also covers a handle that is missing from `blogs`
    # altogether, which previously raised KeyError instead of
    # reaching the warning below.
    base_url = blogs.get(handle)
    if not base_url:
        logging.warning("grok_tweets: cannot find url from %s", handle)
        return

    # Pause a bit before the download so we sleep in any case
    time.sleep(random.random() + 0.5)

    # Expand links before possibly prompting the operator
    for link in links:
        expanded_link = []
        result = subr_http.retrieve("HEAD", "http", "t.co", link,
                                    [], expanded_link, {})
        if result != 200:
            logging.warning("grok_tweets: broken link")
            continue
        if base_url not in expanded_link[0]:
            logging.info("grok_tweets: foreign link <%s>; skip",
                         expanded_link[0])
            continue
        parsed = urlparse.urlsplit(expanded_link[0])
        if not parsed[2] or parsed[2] == "/":
            logging.info("grok_tweets: homepage link <%s>; skip",
                         expanded_link[0])
            continue

        # Otherwise there are cases of duplicate posts
        index = expanded_link[0].rfind("?")
        if index >= 0:
            expanded_link[0] = expanded_link[0][:index]

        # Rewrite only the scheme prefix: str.replace() would also
        # rewrite any later "https://" occurrence inside the path.
        if expanded_link[0].startswith("https://"):
            expanded_link[0] = "http://" + expanded_link[0][len("https://"):]

        logging.info("grok_tweets: process link %s", expanded_link[0])
        if SETTINGS["dry"]:
            logging.warning("grok_tweets: would have saved: \"%s\"", tweet)
            continue
        save_tweet(student, expanded_link[0])
def shorten(url):
    """ Shorten URLs using bit.ly.

        Returns the shortened URL on success, or None when the
        credentials are missing, the HTTP request fails, or the
        response is not the expected JSON document. """
    authdata = readconf()
    if not authdata:
        return
    orig_url = url
    bodyvec = []
    headers = {}
    # The long URL travels inside the query string, so it must be
    # fully percent-encoded (safe="" escapes "/" as well).
    url = urllib.quote(url, safe="")
    path = "/v3/shorten?login=%s&apiKey=%s&longUrl=%s" % (
        authdata["login"], authdata["api_key"], url)
    result = subr_http.retrieve("GET", "https", "api-ssl.bitly.com",
                                path, bodyvec, [], headers)
    if result != 200:
        logging.warning("subr_bitly.py: can't shorten %s", orig_url)
        return
    body = "".join(bodyvec)
    ctype = headers.get("content-type")
    if not ctype:
        logging.warning("subr_bitly.py: no content type")
        return
    ctype, encoding = subr_http.parse_content_type(ctype)
    if ctype != "application/json":
        logging.warning("subr_bitly.py: bad content type")
        return
    if encoding:
        body = body.decode(encoding)
    dictionary = json.loads(body)
    # Idiomatic `not in` (was `not "data" in ...`).
    if "data" not in dictionary or "url" not in dictionary["data"]:
        logging.warning("subr_bitly.py: invalid dictionary")
        return
    return dictionary["data"]["url"]