コード例 #1
0
def _to_bitpath(link):
    """Shorten *link* via bit.ly and return the path component of the result.

    Returns the alphanumeric bitlink path (e.g. "1a2B3c"), or None when the
    shortener fails or yields an unexpected value.
    """
    bitlink = subr_bitly.shorten(link)
    if not bitlink:
        logging.warning("main_rss: bitly API failed")
        return None
    # Strip the scheme+host prefix; only the short path is of interest.
    bitlink = bitlink.replace("http://bit.ly/", "")
    bitlink = bitlink.replace("https://bit.ly/", "")
    # fullmatch (unlike search with "$") also rejects a trailing newline.
    if not re.fullmatch(r"[A-Za-z0-9]+", bitlink):
        logging.warning("main_rss: invalid bitlink <%s>; skip", bitlink)
        return None
    return bitlink
コード例 #2
0
def _to_bitpath(link):
    """Convert a URL into its bit.ly path component.

    Returns the alphanumeric short path, or None if the bit.ly API call
    fails or the returned value does not look like a valid bitlink.
    """
    bitlink = subr_bitly.shorten(link)
    if not bitlink:
        logging.warning("main_rss: bitly API failed")
        return None
    # Drop either scheme's host prefix, keeping just the short path.
    bitlink = bitlink.replace("http://bit.ly/", "")
    bitlink = bitlink.replace("https://bit.ly/", "")
    # re.fullmatch is stricter than search("^...$"): it also rejects a
    # value that merely ends with a newline.
    if not re.fullmatch(r"[A-Za-z0-9]+", bitlink):
        logging.warning("main_rss: invalid bitlink <%s>; skip", bitlink)
        return None
    return bitlink
コード例 #3
0
def process_site(site, noisy):
    """Fetch a site's feed and download each recent post into a dated folder.

    Only entries dated 2013-05-15 or later are processed (hard-coded
    window).  Posts whose target file already exists are skipped, and a
    randomized pause precedes each network request.
    """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        return
    body = result[0]

    # Pick the SAX handler by sniffing the payload: no "<rss" tag means
    # we treat the feed as Atom.
    if "<rss" not in body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(body, handler)

    content = zip(handler.links, handler.pub_dates)
    for link, date in content:

        # Hard-coded cutoff: keep only posts from 2013-05-15 onward.
        if date[0] < 2013:
            continue
        if date[1] != 5:
            continue
        if date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Randomized pause so we don't hammer the shortener service.
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)
        if not link:
            # Guard a failed shorten instead of passing None downstream
            # (same handling as save_tweet()).
            logging.warning("main: bitly API failed; skip")
            continue

        filename = subr_misc.bitlink_to_filename(link)
        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        time.sleep(random.randrange(5, 8))
        _, body = subr_http.fetch_url(link, noisy=noisy)

        # "with" guarantees the file is closed even if write() raises.
        with open(pname, "w") as filep:
            filep.write(body)
コード例 #4
0
def process_site(site, noisy):
    """Process the feeds of a site: fetch, parse, and archive recent posts.

    Entries outside the hard-coded 2013-05-15+ window are ignored.
    Already-downloaded posts are detected by file name and skipped.
    """

    logging.info("")
    logging.info("* site: %s", site)
    logging.info("")

    result = subr_rss.fetch(site, noisy=noisy)
    if not result or not result[0]:
        return
    body = result[0]

    # Choose the parser by content sniffing: "<rss" selects the RSS
    # handler, anything else is assumed to be Atom.
    if "<rss" not in body:
        handler = sax_atom.AtomHandler()
    else:
        handler = sax_rss.RssHandler()
    sax.parseString(body, handler)

    content = zip(handler.links, handler.pub_dates)
    for link, date in content:

        # Date filter: only 2013-05-15 and later.
        if date[0] < 2013:
            continue
        if date[1] != 5:
            continue
        if date[2] < 15:
            continue

        logging.info("")
        logging.info("- <%s>", link)
        logging.info("")

        folder = subr_misc.make_post_folder(date, site)
        subr_misc.mkdir_recursive_idempotent(folder)

        # Be polite to the remote service: sleep 5-7 seconds.
        time.sleep(random.randrange(5, 8))
        link = subr_bitly.shorten(link, noisy=noisy)
        if not link:
            # A failed shorten must not reach bitlink_to_filename() as
            # None; skip the entry like save_tweet() does.
            logging.warning("main: bitly API failed; skip")
            continue

        filename = subr_misc.bitlink_to_filename(link)
        pname = os.sep.join([folder, filename])
        if os.path.isfile(pname):
            logging.info("main: file already exists: %s", pname)
            continue

        time.sleep(random.randrange(5, 8))
        _, body = subr_http.fetch_url(link, noisy=noisy)

        # Context manager closes the file even when write() raises.
        with open(pname, "w") as filep:
            filep.write(body)
コード例 #5
0
def save_tweet(student, link):
    """Archive the post behind a tweeted *link* under the student's folder.

    The link is shortened via bit.ly, reduced to its path, validated, and
    resolved against the local RSS cache; the cached file is then copied
    into SETTINGS["prefix"]/<student>/<bitlink>/.  Duplicates and cache
    misses are logged and skipped.
    """

    # Pause a bit before the download so we sleep in any case
    time.sleep(random.random() + 0.5)

    bitlink = subr_bitly.shorten(link)
    if not bitlink:
        logging.warning("grok_tweets: bitlink API failed")
        return

    # Reduce the short URL to its path component only.
    bitlink = bitlink.replace("http://bit.ly/", "")
    bitlink = bitlink.replace("https://bit.ly/", "")

    # fullmatch (unlike search with "$") also rejects a trailing newline,
    # so a malformed value can never become part of a directory name below.
    if not re.fullmatch(r"[A-Za-z0-9]+", bitlink):
        logging.warning("grok_tweets: invalid bitlink <%s>; skip", bitlink)
        return

    dirpath = os.sep.join([SETTINGS["prefix"], student, bitlink])
    if os.path.isdir(dirpath):
        logging.info("grok_tweets: dup <%s>; skip", dirpath)
        return

    cached_dirpath = rss_cache_find(bitlink)
    if not cached_dirpath:
        logging.warning("grok_tweets: can't find %s in RSS cache", bitlink)
        return
    cached_filename = rss_cache_filename(cached_dirpath)
    if not cached_filename:
        logging.warning("grok_tweets: empty %s", cached_dirpath)
        return
    cached_filepath = os.sep.join([cached_dirpath, cached_filename])

    subr_misc.mkdir_recursive_idempotent(dirpath)

    # Note: we use the time from RSS, which is more accurate
    filepath = os.sep.join([dirpath, cached_filename])

    logging.info("grok_tweets: cp '%s' '%s'", cached_filepath, filepath)
    shutil.copy(cached_filepath, filepath)