Example #1
import urlparse  # Python 2; use urllib.parse on Python 3

def getLockUri(ouri):
    '''
    Gets the lockfile uri from the given ouri
    @param ouri: osaka-uri to wrap with the lock
    '''
    # INTERLOCK_NAME_TEMPLATE is a module-level format string defined elsewhere in the source project.
    parsed = urlparse.urlparse(ouri)
    parsed = parsed._replace(path=INTERLOCK_NAME_TEMPLATE.format(parsed.path.rstrip("/")))
    return parsed.geturl()
Example #2
    # Method of a request-signing auth class; relies on urlparse/urlunparse
    # (urllib.parse in Python 3) and a module-level logger.
    def make_data_to_sign(self, r, auth_header):
        if self.testurl:
            testparts = urlparse(self.testurl)
            requestparts = urlparse(r.url)
            url = urlunparse(testparts[0:2] + requestparts[2:])
        else:
            url = r.url

        parsed_url = urlparse(url)
        data_to_sign = '\t'.join([
            r.method,
            parsed_url.scheme,
            parsed_url.netloc,
            # Note: relative URL constraints are handled by requests when it sets up 'r'
            parsed_url.path + ('?' + parsed_url.query if parsed_url.query else ""),
            self.canonicalize_headers(r),
            self.make_content_hash(r),
            auth_header
        ])
        # Split on real tabs and re-join with a literal '\t' so the field
        # separators are visible in the debug log.
        logger.debug('data to sign: %s', '\\t'.join(data_to_sign.split('\t')))
        return data_to_sign
Example #3
    def online_testcase(self):
        """This test is not intended to run as part of the regular project test suite.
        If you want to run it, invoke this method specifically.
        """
        google_search = GoogleSearch('essanpupil')
        google_search.start_search()
        for item in google_search.search_result:
            url = urlparse(item)
            # urlparse() returns an empty string, not None, for a missing scheme.
            if url.scheme:
                self.assertIn('http', url.scheme)
            else:
                self.fail('Parsing failed!')
Example #4
def _guess_atta(fb_atta, base_text):
    # Builds extra markdown text from a Facebook attachment dict: shared-video
    # links contribute the target URL (the 'u' query parameter), image
    # attachments contribute markdown image tags.
    text = ''
    if 'type' in fb_atta and fb_atta['type'].startswith('video'):
        parsed_url = urlparse(fb_atta['url'])
        parsed_qs = parse_qs(parsed_url.query)
        if parsed_qs['u'][0] not in base_text:
            text += '\n\n' + parsed_qs['u'][0]
    elif 'subattachments' in fb_atta:
        for fb_atta_ in fb_atta['subattachments']['data']:
            image_url = fb_atta_['media']['image']['src']
            text += "\n\n![](%s)" % image_url
    elif 'media' in fb_atta:
        if 'image' in fb_atta['media']:
            image_url = fb_atta['media']['image']['src']
            text += "\n![](%s)" % image_url
    return text
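A rough usage sketch with a made-up attachment dict shaped the way the code expects (the keys and URL are hypothetical):

    _guess_atta({'media': {'image': {'src': 'http://example.com/pic.jpg'}}}, '')
    # -> '\n![](http://example.com/pic.jpg)'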
Example #5
def urlallowed(url):
    # CONFIG, ROBOTS_TXT_BLACKLIST_DOMAINS and rpcache are module-level objects
    # in the source project; urlparse, robotparser and re are the Python 2 modules.
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None

        setdefaulttimeout(timeout)

    # Fall back to "disallowed" when robots.txt could not be read.
    return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
Example #6
def urlallowed(url):
    # Near-duplicate of the previous example; the substantive difference is
    # the final fallback, which treats an unreadable robots.txt as allowed.
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            rp = None

        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else True
Example #7
    def is_fetch_allowed_by_robots_txt(self, url):
        scheme, netloc, path, _, _, _ = urlparse(url)
        if path == '/robots.txt':
            return True
        robots_txt_url = '%s://%s/robots.txt' % (scheme, netloc)
        checker = self.robotcheckers.get(robots_txt_url)
        if checker is None:
            doc = self.fetch(robots_txt_url)
            robots_txt = doc.content if doc.status == 200 else ''
            checker = robotparser.RobotFileParser()
            checker.set_url(robots_txt_url)
            # RobotFileParser.parse() expects an iterable of lines, not one string.
            checker.parse(robots_txt.splitlines())
            self.robotcheckers[robots_txt_url] = checker

        # Work around a RobotFileParser bug which makes it crash when
        # a URL contains non-ASCII characters, even when they are perfectly
        # escaped. (The library seems to make a hard-coded assumption that
        # URLs are encoded in ISO 8859-1 instead of UTF-8 before being escaped;
        # this had been true in the very early days of the web, but not
        # anymore.) To work around this bug, we double-encode the URL
        # for the purpose of robots checking; this prevents the crash.
        # Note: urlencode() is the project's own helper (see the next example),
        # not urllib's urlencode().
        return checker.can_fetch(useragent=self.useragent_for_robots_txt,
                                 url=urlencode(url))
Example #8
import urllib
from urlparse import urlparse, urlunparse  # Python 2

def urlencode(url):
    # Percent-encodes a possibly non-ASCII URL: the hostname is IDNA-encoded,
    # the remaining components are UTF-8 encoded and percent-escaped.
    p = list(urlparse(url))
    p[1] = p[1].encode('idna')
    for i in range(2, len(p)):
        p[i] = urllib.quote(p[i].encode('utf-8'))
    return urlunparse(p).encode('ascii')
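A rough usage sketch (Python 2, assuming the imports above; the hostname is made up): the host is IDNA-encoded and the path is percent-escaped as UTF-8.

    urlencode(u'http://bücher.example/straße')
    # -> 'http://xn--bcher-kva.example/stra%C3%9Fe'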
Example #9
def urlpath(url):
    "'http://example.org/foo/bar.html?baz#qux' --> '/foo/bar.hml'"
    return urlparse(url)[2]

def set_url_arg(url, arg, val):
    # Requires `import urlparse` in Python 2; urlencode() here is urllib's
    # dict-to-query-string helper, not the URL escaper from the previous example.
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update({arg: str(val)})
    url_parts[4] = urlencode(query)
    return urlparse.urlunparse(url_parts)
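A rough usage sketch (hypothetical URL); note that the rebuilt query comes from a dict, so the parameter order is not guaranteed:

    set_url_arg('http://example.org/search?q=python', 'page', 2)
    # -> 'http://example.org/search?q=python&page=2' (order may vary)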