# Content types we accept as a feed document.
_FEED_CTYPES = (
    "application/atom+xml",
    "application/rss+xml",
    "text/xml",
    "application/xml",
)

def fetch(site, bodyvec, encodings):
    """ Fetch the RSS feeds of a website.

        Tries each candidate path in PATHS until one responds with
        status 200 and a feed-like content type.  On success the raw
        body chunks are left in `bodyvec`, the declared charset is
        appended to `encodings`, and 0 is returned; on failure -1
        is returned (both lists are cleared on every attempt). """
    for path in PATHS:
        logging.info("subr_rss: try with %s for %s", path, site)
        # Reset the output parameters so a failed earlier attempt
        # cannot leak stale data into a later one.
        del bodyvec[:]
        del encodings[:]
        headers = {}
        result = subr_http.retrieve("GET", "http", site, path,
                                    bodyvec, [], headers)
        if result != 200:
            continue
        ctype = headers.get("content-type")
        if not ctype:
            logging.warning("subr_rss: no content-type")
            continue
        ctype, encoding = subr_http.parse_content_type(ctype)
        # Membership test replaces the original chain of four
        # `!=` comparisons; same accepted set.
        if ctype not in _FEED_CTYPES:
            logging.warning("subr_rss: bad content type: %s", ctype)
            continue
        encodings.append(encoding)
        return 0
    logging.error("subr_rss: can't fetch RSS for %s", site)
    return -1
def _get_final_url(link):
    """ Returns the final URL that contains the content """
    pieces = urlparse.urlsplit(link)
    resolved = []
    code = subr_http.retrieve("HEAD", "http", pieces[1], pieces[2],
                              [], resolved, {})
    if code != 200:
        logging.warning("main_rss: invalid link: %s", link)
        return
    # Exactly one resolved URL is expected from the helper.
    if len(resolved) == 1:
        return resolved[0]
    logging.warning("main_rss: internal error")
def _savepost(link, pathname): """ Save post content into pathname """ parsed = urlparse.urlsplit(link) bodyvec = [] status = subr_http.retrieve("GET", "http", parsed[1], parsed[2], bodyvec, [], {}) if status != 200: logging.warning("main_rss: cannot retrieve page: %s", link) return filep = open(pathname, "w") for chunk in bodyvec: filep.write(chunk) filep.close()
def process_student_tweet(blogs, tweet, links, handle, student):
    """ Process a tweet from the point of view of one student.

        Expands each t.co link in `links`, keeps only links that point
        inside the student's blog (not the homepage), normalizes them
        (strip query string, force http scheme) and saves them via
        save_tweet() unless running in dry mode. """
    # .get() also covers a handle that is missing from `blogs`
    # altogether, which previously raised KeyError instead of
    # reaching the warning below.
    base_url = blogs.get(handle)
    if not base_url:
        logging.warning("grok_tweets: cannot find url from %s", handle)
        return

    # Pause a bit before the download so we sleep in any case
    time.sleep(random.random() + 0.5)

    # Expand links before possibly prompting the operator
    for link in links:
        expanded_link = []
        result = subr_http.retrieve("HEAD", "http", "t.co", link,
                                    [], expanded_link, {})
        if result != 200:
            logging.warning("grok_tweets: broken link")
            continue
        if base_url not in expanded_link[0]:
            logging.info("grok_tweets: foreign link <%s>; skip",
                         expanded_link[0])
            continue
        parsed = urlparse.urlsplit(expanded_link[0])
        if not parsed[2] or parsed[2] == "/":
            logging.info("grok_tweets: homepage link <%s>; skip",
                         expanded_link[0])
            continue

        # Otherwise there are cases of duplicate posts
        index = expanded_link[0].rfind("?")
        if index >= 0:
            expanded_link[0] = expanded_link[0][:index]

        # Rewrite only the scheme prefix: str.replace() would also
        # rewrite any later "https://" occurrence inside the path.
        if expanded_link[0].startswith("https://"):
            expanded_link[0] = "http://" + expanded_link[0][len("https://"):]

        logging.info("grok_tweets: process link %s", expanded_link[0])
        if SETTINGS["dry"]:
            logging.warning("grok_tweets: would have saved: \"%s\"", tweet)
            continue
        save_tweet(student, expanded_link[0])
def shorten(url):
    """ Shorten URLs using bit.ly.

        Returns the shortened URL on success, or None when the
        credentials are missing, the HTTP request fails, or the
        response is not the expected JSON document. """
    authdata = readconf()
    if not authdata:
        return
    orig_url = url
    bodyvec = []
    headers = {}
    # The long URL travels inside the query string, so it must be
    # fully percent-encoded (safe="" escapes "/" as well).
    url = urllib.quote(url, safe="")
    path = "/v3/shorten?login=%s&apiKey=%s&longUrl=%s" % (
        authdata["login"], authdata["api_key"], url)
    result = subr_http.retrieve("GET", "https", "api-ssl.bitly.com",
                                path, bodyvec, [], headers)
    if result != 200:
        logging.warning("subr_bitly.py: can't shorten %s", orig_url)
        return
    body = "".join(bodyvec)
    ctype = headers.get("content-type")
    if not ctype:
        logging.warning("subr_bitly.py: no content type")
        return
    ctype, encoding = subr_http.parse_content_type(ctype)
    if ctype != "application/json":
        logging.warning("subr_bitly.py: bad content type")
        return
    if encoding:
        body = body.decode(encoding)
    dictionary = json.loads(body)
    # Idiomatic `not in` (was `not "data" in ...`).
    if "data" not in dictionary or "url" not in dictionary["data"]:
        logging.warning("subr_bitly.py: invalid dictionary")
        return
    return dictionary["data"]["url"]