def _to_bitpath(link):
    """Return the alphanumeric bit.ly path for *link*, or None on failure.

    Shortens the URL through the bit.ly API, strips the scheme/host
    prefix, and validates that what remains is purely alphanumeric.
    """
    shortened = subr_bitly.shorten(link)
    if not shortened:
        logging.warning("main_rss: bitly API failed")
        return None

    # Drop either scheme variant of the bit.ly host, keeping the path only.
    path = shortened
    for prefix in ("http://bit.ly/", "https://bit.ly/"):
        path = path.replace(prefix, "")

    if re.search("^[A-Za-z0-9]+$", path):
        return path
    logging.warning("main_rss: invalid bitlink <%s>; skip", path)
    return None
def process_site(site, noisy):
    """Fetch a site's feed and download each matching post to disk.

    Parameters:
        site: feed URL to fetch.
        noisy: verbosity flag forwarded to the fetch/shorten helpers.

    Each post whose publication date passes the hard-coded window is
    saved under a per-date folder, one file per shortened link; files
    that already exist are skipped.
    """
    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        return
    body = result[0]

    # Choose the parser from the payload itself: Atom feeds lack "<rss".
    if "<rss" not in body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(body, handler)

    for link, date in zip(handler.links, handler.pub_dates):
        # Hard-coded date window: year >= 2013, month exactly May,
        # day >= 15 (date is a (year, month, day, ...) tuple).
        if date[0] < 2013 or date[1] != 5 or date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Randomized pause to avoid hammering the shortener service.
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)
        if not link:
            # Bugfix: shorten() can fail (see the guards in the sibling
            # functions); don't pass a falsy link to bitlink_to_filename().
            logging.warning("main: bitly API failed; skip")
            continue

        filename = subr_misc.bitlink_to_filename(link)
        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        time.sleep(random.randrange(5, 8))
        _, body = subr_http.fetch_url(link, noisy=noisy)

        # Bugfix: the context manager guarantees the descriptor is closed
        # even if write() raises (the original leaked it on error).
        with open(pname, "w") as filep:
            filep.write(body)
def save_tweet(student, link):
    """Copy the RSS-cached page for *link* into the student's folder."""
    # Pause a bit before the download so we sleep in any case
    time.sleep(random.random() + 0.5)

    shortened = subr_bitly.shorten(link)
    if not shortened:
        logging.warning("grok_tweets: bitlink API failed")
        return

    # Reduce the short URL to its bare path and sanity-check it.
    for prefix in ("http://bit.ly/", "https://bit.ly/"):
        shortened = shortened.replace(prefix, "")
    if not re.search("^[A-Za-z0-9]+$", shortened):
        logging.warning("grok_tweets: invalid bitlink <%s>; skip", shortened)
        return

    dest_dir = os.sep.join([SETTINGS["prefix"], student, shortened])
    if os.path.isdir(dest_dir):
        logging.info("grok_tweets: dup <%s>; skip", dest_dir)
        return

    src_dir = rss_cache_find(shortened)
    if not src_dir:
        logging.warning("grok_tweets: can't find %s in RSS cache", shortened)
        return
    src_name = rss_cache_filename(src_dir)
    if not src_name:
        logging.warning("grok_tweets: empty %s", src_dir)
        return
    src_path = os.sep.join([src_dir, src_name])

    subr_misc.mkdir_recursive_idempotent(dest_dir)

    # Note: we use the time from RSS, which is more accurate
    dest_path = os.sep.join([dest_dir, src_name])
    logging.info("grok_tweets: cp '%s' '%s'", src_path, dest_path)
    shutil.copy(src_path, dest_path)