def _getfeed(site):
    """ Get the RSS feed of site """
    #
    # Note: here we must not change the encoding of body, because the
    # SAX library already performs this operation.
    #
    chunks = []
    if subr_rss.fetch(site, chunks, []) != 0:
        # Fetch failed: signal the caller with an implicit None.
        return None
    return "".join(chunks)
def process_site(site, noisy):
    """ Process the feeds of a site.

        Fetches the site's feed, parses it as RSS or Atom, and for each
        entry inside the hard-coded date window downloads the linked page
        (via a shortened bit.ly URL) into a per-post folder, skipping
        pages that were already saved.

        :param site: feed URL (or site identifier) passed to subr_rss.fetch
        :param noisy: verbosity flag forwarded to the fetch helpers
        :returns: None
    """
    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        # No body retrieved: nothing to process for this site.
        return
    feed_body = result[0]

    # Choose the parser from the payload itself: a document without an
    # "<rss" marker is assumed to be an Atom feed.
    if "<rss" not in feed_body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(feed_body, handler)

    for link, date in zip(handler.links, handler.pub_dates):
        # Hard-coded window: only entries dated 15 May 2013 or later
        # (and only within May) are processed.  Kept as-is to preserve
        # the original crawl behavior; dates are (year, month, day, ...)
        # tuples — presumably from the SAX handlers, verify there.
        if date[0] < 2013:
            continue
        if date[1] != 5:
            continue
        if date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Random pause to avoid hammering the shortener service.
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)

        filename = subr_misc.bitlink_to_filename(link)
        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        # Random pause to avoid hammering the target site.
        time.sleep(random.randrange(5, 8))
        # Use a distinct name for the downloaded page so it does not
        # shadow the feed body parsed above.
        _, page_body = subr_http.fetch_url(link, noisy=noisy)

        # Context manager guarantees the file is closed even if the
        # write raises (the original open/close pair could leak).
        with open(pname, "w") as filep:
            filep.write(page_body)