Example no. 1
 def extract_links_from_a_href(self, url, href):
     """
     scheme should be in [http, https]
     Links can be:
     1. full urls (e.g. http://google.com), or
     2. references to local items (e.g. /docs), which should translate to http://python.org/docs, or
     3. permalinks to items on the same page, e.g. #downloads. These are skipped here.
     """
     valid_schemes   = ['http', 'https']
     url             = url.strip('/')
     href            = href.strip('/')
     parsed_href     = urlparse.urlsplit(href)
     skip_base       = False
     parsed_base_url = urlparse.urlsplit(url)
     temp_url        = ''
     valid_urls      = []
     if parsed_href.scheme:
         temp_url = href if (parsed_href.scheme in valid_schemes) else ''
         if parsed_href.netloc == parsed_base_url.netloc:
             skip_base = True # no need to include base url in the results. it's already in the repo
     else:
         if parsed_href.path or parsed_href.query:
             temp_url = urlparse.urljoin(url, href)
             skip_base = True
     # at this point, we have something like 'http://python.org/assets/docs/about.html#somediv'
     # temp_url is one valid url.
     # now let's get some more
     if temp_url:
         valid_urls = self._find_isolated_resources(temp_url, skip_base)
     return valid_urls
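The three cases in the docstring come down to how urlparse.urljoin and urlparse.urlsplit treat a href. A minimal sketch (the URLs are only illustrative):

import urlparse

base = 'http://python.org'
print urlparse.urljoin(base, '/docs')              # -> http://python.org/docs (case 2)
print urlparse.urljoin(base, 'http://google.com')  # absolute hrefs pass through (case 1)
print urlparse.urlsplit('#downloads').scheme       # '' -> fragment-only link, skipped (case 3)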
Example no. 2
    def find_pingback_urls(self, urls):
        """Find the pingback URL of each URL."""
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                if 'text/' not in headers.get('Content-Type', '').lower():
                    continue

                server_url = headers.get('X-Pingback')
                if not server_url:
                    server_url = self.find_pingback_href(page.read())

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
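When X-Pingback comes back as a relative path, the loop above rebuilds an absolute endpoint from the page's own scheme and netloc. A small hedged illustration with made-up URLs:

from urlparse import urlsplit

page_url = 'http://blog.example.com/2013/05/some-post/'
server_url = '/xmlrpc.php'  # relative X-Pingback value
if not urlsplit(server_url).netloc:
    parts = urlsplit(page_url)
    server_url = '%s://%s%s' % (parts.scheme, parts.netloc, server_url)
print server_url  # http://blog.example.com/xmlrpc.php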
Example no. 3
 def _update_link_prefix(self, orig_url, prefix):
     if not prefix:
         return orig_url
     url_parts = list(urlparse.urlsplit(orig_url))
     prefix_parts = list(urlparse.urlsplit(prefix))
     url_parts[0:2] = prefix_parts[0:2]
     return urlparse.urlunsplit(url_parts)
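The slice assignment replaces only the scheme and netloc of the original URL, keeping path, query and fragment. A quick check with assumed values:

import urlparse

parts = list(urlparse.urlsplit('http://internal-host:8080/v2/images?limit=5'))
parts[0:2] = list(urlparse.urlsplit('https://public.example.com'))[0:2]
print urlparse.urlunsplit(parts)  # https://public.example.com/v2/images?limit=5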
Example no. 4
	def get_suggestions(self, keywords, keyword_confidence):
		stackoverflow_query = keywords + " error stackoverflow"
		askubuntu_query = keywords + " error askubuntu"
		suggestions = []
		question_ids = []
		for url in search(stackoverflow_query, tld='es', lang='en', stop=5):
			hostname = urlparse.urlparse(url).hostname
			if(hostname == "stackoverflow.com"):
				path = urlparse.urlsplit(url).path
				pathx = str(path).split('/')
				question_ids.append(pathx[2])
		if len(question_ids)!=0:
			print  "#DRAK : Fetched Stackoverflow Questions\n#DRAK : Fetching answers" 
			suggestions.extend(self.so.get_suggestions(question_ids))
			print "#DRAK : Answers fetched successfully" 
		question_ids = []
		for url in search(askubuntu_query, tld='es', lang='en', stop=5):
			hostname = urlparse.urlparse(url).hostname
			if(hostname == "askubuntu.com"):
				path = urlparse.urlsplit(url).path
				pathx = str(path).split('/')
				question_ids.append(pathx[2])
		if len(question_ids)!=0:
			print  "#DRAK : Fetched AskUbuntu Questions\n#DRAK : Fetching answers" 
			suggestions.extend(self.au.get_suggestions(question_ids))
			print  "#DRAK : Answers fetched successfully" 
		
		for suggestion in suggestions:
			suggestion.keyword_confidence = keyword_confidence
		return suggestions
Example no. 5
    def test_password_reset(self):
        """
        Tests the forgotten/reset password workflow.
        """

        c = Client()

        resp = c.get(reverse('password_reset'))
        self.assertEqual(resp.status_code, 200)

        resp = c.post(reverse('password_reset'), data={'email': '*****@*****.**'})
        self.assertEqual(resp.status_code, 302)

        self.assertEqual(len(mail.outbox), 1)

        token = resp.context[0]['token']
        uid = resp.context[0]['uid']

        # Grab the token and uidb64 so that we can hit the reset url
        resp = c.get(reverse('password_reset_confirm', kwargs={'token': token, 'uidb64': uid}))
        self.assertEqual(resp.status_code, 200)
        self.assertTrue(resp.template_name.endswith('password_reset_confirm.html'))

        resp = c.post(reverse('password_reset_confirm', kwargs={'token': token, 'uidb64': uid}),
                      {'new_password1': 'mynewpassword', 'new_password2': 'mynewpassword'})
        self.assertEqual(resp.status_code, 302)
        self.assertEqual(resolve(urlsplit(resp.url).path).url_name, 'password_reset_complete')

        resp = c.post(reverse('login'), {'username': '******', 'password': '******'})

        # User is returned to the login page on error vs redirected by default
        self.assertEqual(resp.status_code, 302)
        self.assertNotEqual(resolve(urlsplit(resp.url).path).url_name, 'login')
Example no. 6
    def prepare_files(self):
        if urlparse.urlsplit(self.inurl)[0] == 'file':
            self.infname = urllib.url2pathname(urlparse.urlsplit(self.inurl)[2])
            self.infd = open(self.infname)
        else:
            # not a file url. download it.
            source = urllib.urlopen(self.inurl)
            self.infd, self.infname = tempfile.mkstemp(prefix="transcode-in-",
                suffix="." + self.inext)
            self._files_to_clean_up_on_success.append((self.infd, self.infname))
            self._files_to_clean_up_on_error.append((self.infd, self.infname))
            while True:
                chunk = source.read(1024 * 64)
                if not chunk:
                    break
                os.write(self.infd, chunk)
            os.lseek(self.infd, 0, 0)

        self.outfd, self.outfname = tempfile.mkstemp(prefix="transcode-out-",
            suffix="." + self.tofmt)
        self._files_to_clean_up_on_error.append((self.outfd, self.outfname))

        self.errfh, self.errfname = tempfile.mkstemp(prefix="transcode-",
            suffix=".log")
        self.outurl = urlparse.urlunsplit(
            ["file", None, self.outfname, None, None])
        self._files_to_clean_up_on_success.append((self.errfh, self.errfname))
        log.debug("Reading from " + self.infname + " (" + self.inurl + ")")
        log.debug("Outputting to " + self.outfname + " (" + self.outurl + ")")
        log.debug("Errors to " + self.errfname)
Example no. 7
    def __init__(self, enc_password=""):
        if enc_password == "":
            print "MtGoxHMAC: Enter your API key file encryption password."
            enc_password = getpass.getpass()  # raw_input()
        try:
            f = open("./config/salt.txt", "r")
            salt = f.read()
            f.close()
            hash_pass = hashlib.sha256(enc_password + salt).digest()

            f = open("./config/api_key.txt")
            ciphertext = f.read()
            f.close()
            decryptor = AES.new(hash_pass, AES.MODE_CBC, ciphertext[: AES.block_size])
            plaintext = decryptor.decrypt(ciphertext[AES.block_size :])
            d = json.loads(plaintext)
            self.key = d["key"]
            self.secret = d["secret"]
        except:
            print "\n\n\nError: you may have entered an invalid password or the encrypted api key file doesn't exist"
            print "If you haven't yet generated the encrypted key file, run the encrypt_api_key.py script."
            # deliberately hang here so the user sees the message above
            while 1:
                pass

        self.buff = ""
        self.timeout = 15
        self.__url_parts = urlparse.urlsplit("https://mtgox.com/api/0/")
        self.__url_parts_1 = urlparse.urlsplit("https://mtgox.com/api/1/")
        self.clock_window = time.time()
        self.clock = time.time()
        self.query_count = 0
        self.query_limit_per_time_slice = 5
        self.query_time_slice = 10
Example no. 8
 def validate_next(self, field):
     if field.data:
         url_next = urlsplit(field.data)
         url_base = urlsplit(request.host_url)
         if url_next.netloc and url_next.netloc != url_base.netloc:
             field.data = ''
             raise ValidationError(get_message('INVALID_REDIRECT')[0])
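The check only compares netlocs, so relative "next" values and same-host absolute URLs pass while foreign hosts are rejected. A stand-alone sketch of the same rule (the host_url default is an assumption):

from urlparse import urlsplit

def is_safe_next(next_url, host_url='http://app.example.com/'):
    nxt, base = urlsplit(next_url), urlsplit(host_url)
    return not nxt.netloc or nxt.netloc == base.netloc

print is_safe_next('/account/settings')         # True  (relative)
print is_safe_next('http://evil.example.org/')  # False (foreign netloc)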
Example no. 9
def logout_client():
    """
    Client-initiated logout
    """
    client = Client.query.filter_by(key=request.args['client_id']).first()
    if client is None:
        # No such client. Possible CSRF. Don't logout and don't send them back
        flash(logout_errormsg, 'error')
        return redirect(url_for('index'))
    if client.trusted:
        # This is a trusted client. Does the referring domain match?
        clienthost = urlparse.urlsplit(client.redirect_uri).hostname
        if request.referrer:
            if clienthost != urlparse.urlsplit(request.referrer).hostname:
                # Doesn't. Don't logout and don't send back
                flash(logout_errormsg, 'error')
                return redirect(url_for('index'))
        # else: no referrer? Either stripped out by browser or a proxy, or this is a direct link.
        # We can't do anything about that, so assume it's a legit case.
        #
        # If there is a next destination, is it in the same domain?
        if 'next' in request.args:
            if clienthost != urlparse.urlsplit(request.args['next']).hostname:
                # Doesn't. Assume CSRF and redirect to index without logout
                flash(logout_errormsg, 'error')
                return redirect(url_for('index'))
        # All good. Log them out and send them back
        logout_internal()
        return redirect(get_next_url(external=True))
    else:
        # We know this client, but it's not trusted. Send back without logout.
        return redirect(get_next_url(external=True))
Example no. 10
    def extract_credentials(self, url):
        """
        Extracts user/password from a url.

        Returns a tuple:
            (url-without-auth, username, password)
        """
        if isinstance(url, urllib2.Request):
            result = urlparse.urlsplit(url.get_full_url())
        else:
            result = urlparse.urlsplit(url)
        scheme, netloc, path, query, frag = result

        username, password = self.parse_credentials(netloc)
        if username is None:
            return url, None, None
        elif password is None and self.prompting:
            # remove the auth credentials from the url part
            netloc = netloc.replace('%s@' % username, '', 1)
            # prompt for the password
            prompt = 'Password for %s@%s: ' % (username, netloc)
            password = urllib.quote(getpass.getpass(prompt))
        else:
            # remove the auth credentials from the url part
            netloc = netloc.replace('%s:%s@' % (username, password), '', 1)

        target_url = urlparse.urlunsplit((scheme, netloc, path, query, frag))
        return target_url, username, password
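For reference, the credentials live in the netloc component, which is why the method can strip them with a simple replace before rebuilding the URL. Illustrative only:

import urlparse

scheme, netloc, path, query, frag = urlparse.urlsplit('https://alice:s3cret@pypi.example.org/simple/')
print netloc  # alice:s3cret@pypi.example.org
netloc = netloc.replace('alice:s3cret@', '', 1)
print urlparse.urlunsplit((scheme, netloc, path, query, frag))  # https://pypi.example.org/simple/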
Example no. 11
def create_yaml_profile(json_data):
    data = []
    filename = None
    if 'image_data' in json_data:
        for k, v in json_data['image_data'].items():
            filename = os.path.basename(urlparse.urlsplit(v['uri'])[2])
            abs_path = os.path.join(json_data['output'], filename)
            stdout = execute('gunzip', '-ql', abs_path)[0]
            try:
                size = int(stdout.split()[1])
            except (ValueError, IndexError):
                size = None
            stdout = execute('gunzip', '-qc', abs_path, '|', 'md5sum')[0]
            try:
                md5 = stdout.split()[0]
            except (ValueError, IndexError):
                md5 = None
            if not md5 or not size:
                raise Exception("Either md5 or size of %s couldn't be "
                                "calculated" % abs_path)
            data.append({k: {
                'md5': md5,
                'size': size,
                'filename': filename,
                'container': v['container'],
                'format': v['format']}})
        data.append({'repos': json_data['repos']})
    else:
        raise Exception("Couldn't find any information about images")
    filename = os.path.basename(
        urlparse.urlsplit(json_data['image_data']
                                   ['/']['uri'])[2]).split('.')[0]
    with open(os.path.join(json_data['output'], filename + '.yaml'), 'w') as f:
        f.write(yaml.dump(data))
Example no. 12
def downloadImage(url,imgTag,downloads):
		
	imgSrc = imgTag["src"]
	netloc = urlsplit(url).netloc
	scheme = urlsplit(url).scheme
	path = urlsplit(url).path
	
	print("[+] Downloading image {0}...".format(imgSrc))
	
	try:
		imgContent = urllib2.urlopen(imgSrc).read()
	except:
		if "html" in urllib2.os.path.split(path)[-1]:
			root_path = urllib2.os.path.split(path)[0].lstrip("/")
		else:
			root_path = path.lstrip("/")
		
		imgUrl = urllib2.os.path.join("{0}://{1}".format(scheme,netloc),root_path,imgSrc.lstrip("/"))
		imgContent = urllib2.urlopen(imgUrl).read()
		
	finally:
		
		imgFileName = basename(urlsplit(imgSrc)[2])
		imgFile = open(os.path.join(downloads,netloc,imgFileName),"wb")
		imgFile.write(imgContent)
		imgFile.close()

		return imgFileName
Example no. 13
 def __init__(self, uri=None, path=None):
     if uri is None:
         uri = get_request().site
     parts = list(urlparse.urlsplit(uri))
     if path is not None:
         parts[2] = urlparse.urlsplit(path)[2]
     self.uri = urlparse.urlunsplit(parts)
Example no. 14
    def __call__(self, uri):
        (scheme, netloc, path, query, frag) = urlparse.urlsplit(uri)
        # urlparse doesn't understand file URLs and stuffs everything into path
        (scheme, netloc, path, query, frag) = urlparse.urlsplit("http:" + path)
        path = os.path.normpath(path)
        schema_xml = self.schema_xml_template
        schema = ZConfig.loadSchemaFile(StringIO(schema_xml))
        config, handler = ZConfig.loadConfig(schema, path)
        for config_item in config.databases + config.storages:
            if not frag:
                # use the first defined in the file
                break
            elif frag == config_item.name:
                # match found
                break
        else:
            raise KeyError("No storage or database named %s found" % frag)

        if isinstance(config_item, ZODBDatabase):
            config = config_item.config
            factory = config.storage
            dbkw = {"connection_cache_size": config.cache_size, "connection_pool_size": config.pool_size}
            if config.database_name:
                dbkw["database_name"] = config.database_name
        else:
            factory = config_item
            dbkw = dict(cgi.parse_qsl(query))

        return factory.open, dbkw
Example no. 15
    def get_download_links(self):
        self.get_links()
        # for state, link in self.links.iteritems():
        #     if state in self.Meta.states:
        #         self.download_links[state] = link
        for state in self.states:
            self.download_links[state] = {}
            try:
                self.download_links[state]['link'] = self.links[state]
            except:
                print "failed to set link for state %s" % (state)
                self.download_links[state]['link'] = ['NA']
            try:
                self.download_links[state]['file_name'] = os.path.basename(
                                            urlsplit(self.links[state]).path)
            except:
                print "failed to set file_name for %s " % (state)
                self.download_links[state]['file_name'] = []
            try:
                self.download_links[state]['file_type'] = os.path.splitext(
                            os.path.basename(urlsplit(self.links[state]).path)
                                                                      )[1]
            except:
                print "couldn't find a type for file"


        return self.download_links
Example no. 16
 def getSource(self, url, form_data="", referer="", xml=False, mobile=False):
     url = self.fixurl(url)
     if referer is None or len(referer) < 1:
         referer = 'http://' + urlparse.urlsplit(url).hostname
     if 'arenavision.in' in urlparse.urlsplit(url).netloc:
         self.s.headers.update({'Cookie' : 'beget=begetok'})
     if 'pushpublish' in urlparse.urlsplit(url).netloc:
         del self.s.headers['Accept-Encoding']
         
     if not referer:
         referer = url
     else:
         referer = self.fixurl(referer)
     
     headers = {'Referer': referer}
     if mobile:
         self.s.headers.update({'User-Agent' : 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'})
         
     if xml:
         headers['X-Requested-With'] = 'XMLHttpRequest'
     
     if form_data:
         r = self.s.post(url, headers=headers, data=form_data, timeout=20)
         response  = r.text
     else:
         try:
             r = self.s.get(url, headers=headers, timeout=20)
             response  = r.text
         except (requests.exceptions.MissingSchema):
             response  = 'pass'
     
     if len(response) > 10:
         if self.cookie_file:
             self.save_cookies_lwp(self.s.cookies, self.cookie_file)
     return HTMLParser().unescape(response)
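Note the mix of .hostname and .netloc above: netloc keeps the port (and any credentials) while hostname is the bare, lower-cased host. With an assumed URL:

import urlparse

u = 'http://ArenaVision.in:8000/path'
print urlparse.urlsplit(u).netloc    # ArenaVision.in:8000
print urlparse.urlsplit(u).hostname  # arenavision.in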
Example no. 17
    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example no. 18
def normalize_url(url, domain_canonical=None):
    """
    Ensure we have a valid url - raise exception if not.
    
    If given, we convert the domain to a domain_canonical
    """
    url = url.strip()
    rgURL = list(urlparse.urlsplit(url))
    if rgURL[split.scheme] == '':
        url = r"http://%s" % url
        rgURL = list(urlparse.urlsplit(url))
    
    # Invalid protocol
    if rgURL[split.scheme] != "http" and rgURL[split.scheme] != "https":
        raise reqfilter.Error("Invalid protocol: %s" % rgURL[split.scheme]) 

    if domain_canonical is not None:
        rgURL[split.domain] = domain_canonical
    
    if rgURL[split.domain]:
        rgURL[split.domain] = rgURL[split.domain].lower()
    
    if not rgURL[split.domain] or not regDomain.search(rgURL[split.domain]) or len(rgURL[split.domain]) > 255:
        raise reqfilter.Error("Invalid URL: %s" % urlparse.urlunsplit(rgURL))

    # Always end naked domains with a trailing slash as canonical
    if rgURL[split.path] == '':
        rgURL[split.path] = '/'

    return urlparse.urlunsplit(rgURL)
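The split.* constants come from elsewhere in that project; presumably they are just the positions of the urlsplit 5-tuple, along the lines of:

class split(object):
    scheme, domain, path, query, fragment = range(5)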
Example no. 19
    def urljoin(self, base_url, url):
        if not url:
            return base_url

        urlsp = urlparse.urlsplit(url)
        if urlsp.scheme or not base_url:
            return url

        basesp = urlparse.urlsplit(base_url)
        if basesp.scheme in ("keep", "arvwf"):
            if not basesp.path:
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            baseparts = basesp.path.split("/")
            urlparts = urlsp.path.split("/") if urlsp.path else []

            pdh = baseparts.pop(0)

            if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            if urlsp.path.startswith("/"):
                baseparts = []
                urlparts.pop(0)

            if baseparts and urlsp.path:
                baseparts.pop()

            path = "/".join([pdh] + baseparts + urlparts)
            return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))

        return super(CollectionFetcher, self).urljoin(base_url, url)
Example no. 20
    def generate_temp_url(self, url=None, path=None, duration=300, method='GET'):

        settings = getUtility(IRegistry).forInterface(ISwiftSettings)
        # storage_url contains version and account
        storage_url = getattr(settings, 'storage_url', os.environ.get('OS_STORAGE_URL'))
        storage_url = urlsplit(storage_url)

        if url:
            url = urlsplit(url)
            if storage_url.netloc.rsplit(':', 1)[0] != url.netloc.rsplit(':', 1)[0]:
                # not our swift store
                return url.geturl()
        elif path:
            # build storage url; path contains container name and starts with /
            url = '{0}://{1}{2}{3}'.format(
                storage_url.scheme, storage_url.netloc,
                storage_url.path, path
            )
            url = urlsplit(url)
        else:
            raise Exception('Need either path or url')
        # build temp_url
        key = getattr(settings, 'temp_url_key', os.environ.get('OS_TEMP_URL_KEY'))
        if not key:
            return url.geturl()
        expires = int(time() + duration)
        hmac_body = u"\n".join((method.upper().encode(),
                               str(expires),
                               url.path))
        sig = hmac.new(key.encode('utf-8'),
                       hmac_body.encode('utf-8'),
                       hashlib.sha1).hexdigest()
        temp_url = u"{url}?temp_url_sig={sig}&temp_url_expires={expires}".format(
            url=url.geturl(), sig=sig, expires=expires)
        return temp_url
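Outside of Plone, the same signature can be computed directly; this is the usual Swift TempURL recipe (key, path and expiry below are made up):

import hashlib
import hmac
import time

key = 'secret-temp-url-key'
path = '/v1/AUTH_account/container/object.pdf'
expires = int(time.time() + 300)
hmac_body = '\n'.join(('GET', str(expires), path))
sig = hmac.new(key, hmac_body, hashlib.sha1).hexdigest()
print '%s?temp_url_sig=%s&temp_url_expires=%d' % (path, sig, expires)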
Example no. 21
  def enumerate_links(self, input_url):
    self.__browser.open(input_url)

    chapter_name = self.__browser.title().split(' - ', 1)[0]
    chapter_dir = os.path.join(DEST_PATH, chapter_name)
    try:
      os.mkdir(chapter_dir)
    except OSError:
      pass

    soup = BeautifulSoup.BeautifulSoup(self.__browser.response().read())
    o = urlparse.urlsplit(input_url)
    href_re = re.compile('/'.join(o.path.split('/')[2:-1]))
    links = soup('a', href=href_re)

    o = urlparse.urlsplit(links[1]['href'])
    start_dir_path, start_page = o.path.rsplit('/', 1)
    start_page = int(start_page)
    o = urlparse.urlsplit(links[-3]['href'])
    end_dir_path, end_page = o.path.rsplit('/', 1)
    end_page = int(end_page)
    assert start_dir_path == end_dir_path, 'Start and end dir paths differ'
    dir_path = start_dir_path

    tasks = []
    for page_idx in xrange(start_page, end_page + 1):
      page_url = urlparse.urlunsplit((o[0], o[1],
          '/'.join((dir_path, str(page_idx))), o[3], o[4]))
      tasks.append((page_url, chapter_dir))

    self.zipit(chapter_name, chapter_dir, tasks)

    print '[*] Finished'
Example no. 22
    def assertRedirects(self, response, expected_url, status_code=302, target_status_code=200, host=None):
        """Asserts that a response redirected to a specific URL, and that the
        redirect URL can be loaded.

        Note that assertRedirects won't work for external links since it uses
        TestClient to do a request.
        """
        self.assertEqual(
            response.status_code,
            status_code,
            (
                "Response didn't redirect as expected: Response code was %d"
                " (expected %d)" % (response.status_code, status_code)
            ),
        )
        url = response["Location"]
        scheme, netloc, path, query, fragment = urlsplit(url)
        e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
        if not (e_scheme or e_netloc):
            expected_url = urlunsplit(("http", host or "testserver", e_path, e_query, e_fragment))
        self.assertEqual(url, expected_url, "Response redirected to '%s', expected '%s'" % (url, expected_url))

        # Get the redirection page, using the same client that was used
        # to obtain the original response.
        redirect_response = response.client.get(path, QueryDict(query))
        self.assertEqual(
            redirect_response.status_code,
            target_status_code,
            ("Couldn't retrieve redirection page '%s': response code was %d" " (expected %d)")
            % (path, redirect_response.status_code, target_status_code),
        )
Example no. 23
def getsession(url, user, password, domain="default"):
    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    # prepend a scheme if the caller passed a bare host
    if not scheme and not location:
        url = "http://" + url
    if url[-1] != "/":
        url += "/"
    url += "xmlrpc.php"
    sp = xmlrpclib.ServerProxy(url)
    res = sp.system.login({"username": user, "password": password, "domain": domain})
    if "sessionid" not in res or "kp3" not in res:
        raise Exception("Invalid username or password")
    # embed the session credentials in the netloc of the xmlrpc endpoint
    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    if location.find("@") >= 0:
        location = location[location.find("@") + 1:]
    newurl = urlparse.urlunsplit((scheme, "%s:%s@%s" % (res["sessionid"], res["kp3"], location), path, query, fragment))
    return Session(xmlrpclib.ServerProxy(newurl), res)
Example no. 24
def get_from_form(str_page, response, opener=None):
    page = lxml.html.document_fromstring(str_page)
    if len(page.forms) == 1: form = page.forms[0]
    else: form = page.forms[1] # for Yandex at the permissions-confirmation step :(
    # Collect the form parameters
    key_value = {}
    for inpt in form.inputs:
        value = inpt.value
        name = inpt.name
        if None not in [name, value]: key_value[name] = value.encode('utf-8')
    #if key_value.has_key(None): del key_value[None] # The button usually has no name.
    # Extract the form's submit address
    action_url = form.action
    if action_url == None: action_url = response.geturl()

    parts = urlparse.urlsplit(action_url)
    # if the form action is a relative address...
    if parts.scheme == '' and parts.netloc == '':
        # relative to the server root
        if action_url[0] == '/':
            netloc = urlparse.urlsplit(response.geturl()).netloc
            action_url = 'https://' + netloc + action_url
        # relative to the current page's address
        else: action_url = response.geturl() +'/'+ action_url
        #print 'action url after parse: ', action_url

    # check for a captcha (for vk.com only)
    if key_value.has_key('captcha_key'):
        img = form.cssselect('img.captcha_img')[0]
        captcha_url = img.attrib['src']
        captcha_img = opener.open(captcha_url).read()
        dataMngt.write('oauth/logs/captcha.jpg', captcha_img, 'wb')
        captcha_key = raw_input('Input the captcha number:')
        key_value['captcha_key'] = captcha_key
    return key_value, action_url
Example no. 25
    def clean(self):
        from django.core.validators import URLValidator, validate_ipv46_address

        port_re = "(:[0-9]{1,5}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])"
        cleaned_data = super(AddEndpointForm, self).clean()

        if 'endpoint' in cleaned_data and 'product' in cleaned_data:
            endpoint = cleaned_data['endpoint']
            product = cleaned_data['product']
            if isinstance(product, Product):
                self.product = product
            else:
                self.product = Product.objects.get(id=int(product))
        else:
            raise forms.ValidationError('Please enter a valid URL or IP address.',
                                        code='invalid')

        endpoints = endpoint.split()
        count = 0
        error = False
        for endpoint in endpoints:
            try:
                url_validator = URLValidator()
                url_validator(endpoint)
                protocol, host, path, query, fragment = urlsplit(endpoint)
                self.endpoints_to_process.append([protocol, host, path, query, fragment])
            except forms.ValidationError:
                try:
                    # do we have a port number?
                    host = endpoint
                    regex = re.compile(port_re)
                    if regex.findall(endpoint):
                        for g in regex.findall(endpoint):
                            host = re.sub(port_re, '', host)
                    validate_ipv46_address(host)
                    protocol, host, path, query, fragment = ("", endpoint, "", "", "")
                    self.endpoints_to_process.append([protocol, host, path, query, fragment])
                except forms.ValidationError:
                    try:
                        regex = re.compile(
                            r'^(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'  # domain...
                            r'localhost|'  # localhost...
                            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
                            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
                            r'(?::\d+)?'  # optional port
                            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
                        validate_hostname = RegexValidator(regex=regex)
                        validate_hostname(host)
                        protocol, host, path, query, fragment = (None, host, None, None, None)
                        if "/" in host or "?" in host or "#" in host:
                            # add a fake protocol just to join, wont use in update to database
                            host_with_protocol = "http://" + host
                            p, host, path, query, fragment = urlsplit(host_with_protocol)
                        self.endpoints_to_process.append([protocol, host, path, query, fragment])
                    except forms.ValidationError:
                        raise forms.ValidationError(
                            'Please check items entered, one or more do not appear to be a valid URL or IP address.',
                            code='invalid')

        return cleaned_data
Example no. 26
def spider(client, url, domain_whitelist=None, pool=None, threadpool=None, tested=None):
    client.send_status('Spidering {url}...'.format(url=url))

    domain_whitelist = domain_whitelist or (urlsplit(url).netloc,)
    threadpool = threadpool or ThreadPool(4) # for lxml - 4 workers
    pool = pool or Pool() # maximum number of concurrent HTTP requests
    tested = tested or set([url])

    with timer() as timed:
        response = requests.get(url)

    result = dict(
        status_code = response.status_code,
        length = len(response.text),
        headers = response.headers,
        url = url,
        duration = timed.result(),
    )
    client.send_result(result)

    html = threadpool.apply(fromstring, [response.text])
    for link in html.cssselect('a'):
        href = link.attrib.get('href', '').split('#')[0].strip()
        if not href:
            continue
        url = urljoin(response.url, href)
        parts = urlsplit(url)
        if parts.netloc not in domain_whitelist:
            continue
        if url in tested:
            continue
        tested.add(url)
        pool.spawn(spider, client, url, domain_whitelist, pool, threadpool, tested)
    return pool
Example no. 27
    def _resolveLocation(self, requestURI, location):
        from twisted.web.client import _urljoin
        from urlparse import urlparse, urlsplit
        old_url = urlsplit(requestURI)[1].split(":")
        go_to = urlsplit(location)[1].split(":")

        if self._onRedirect == "sticky":
            location = location.replace(go_to[0], old_url[0])
        elif self._onRedirect == "stickyport":
            def _preparePort(url):
                urlsplited = urlsplit(url)[1].split(":")
                scheme = urlsplit(url).scheme \
                    if urlsplit(url).scheme else "http"
                if scheme == "http":
                    url = url.replace(urlsplited[0], urlsplited[0]+":80")
                elif scheme == "https":
                    url = url.replace(urlsplited[0], urlsplited[0]+":443")
                return url

            if len(old_url) != 2:
                requestURI = _preparePort(requestURI)
                old_url = urlsplit(requestURI)[1].split(":")
            if len(go_to) != 2:
                location = _preparePort(location)
                go_to = urlsplit(location)[1].split(":")
            if not self._proxy:
                location = location.replace(go_to[1], str(self._port))
            else:
                location = location.replace(go_to[1], old_url[1])
        location = _urljoin(requestURI, location)
        log.debug("Locating to URL: %s" % location)
        return location
Example no. 28
def get_correctedFiles(path, save, url, img):

    if not os.path.exists(save):
        os.makedirs(save)

    for f in os.listdir(path):
        print "correcting file %s" % f
        infile = open(os.path.join(path, f)).read()
        
        soup = BeautifulSoup(infile, "html5lib")
        for tag in soup.find_all(lambda t: 'href' in t.attrs or 'src' in t.attrs):
            if 'href' in tag.attrs:
                url_parts = urlparse.urlsplit(tag.attrs["href"])
                full_path = tag.attrs["href"]
                hrefpath = url_parts.path
                if full_path[0:4] != "http" and full_path[0:5] != " http":
                    # for wiki conversion (moin moin wikis)
                    # hrefpath = hrefpath.replace("/", "|")
                    if hrefpath[0:6] == "|wiki|":
                        hrefpath = hrefpath[6:]
                    tag.attrs["href"] = urlparse.urljoin(url, hrefpath)
            else:
                url_parts = urlparse.urlsplit(tag.attrs["src"])
                srcpath = url_parts.path
                srcparts = srcpath.split("/")
                srcpath = srcparts[len(srcparts) -1]
                tag.attrs["src"] = urlparse.urljoin(img, srcpath)

        
        outfile = open(os.path.join(save, f), "w")
        outfile.write(soup.encode("ascii", "xmlcharrefreplace"))
        outfile.close()
Example no. 29
    def filename(self, pdf_url0):
        pdf_url = str(pdf_url0)

        CurrentDir=os.path.dirname(os.path.realpath(__file__)).replace('\\','/')
        if re.findall('/', pdf_url):
            self.suffix = os.path.splitext(pdf_url)[1]
            self.file_name_decode = urllib2.unquote(pdf_url).decode('utf8').split('/')[-1]
            self.filename = urlparse.urlsplit(pdf_url).path.split('/')[-1]
            if self.filename.endswith('.jsp'):
                self.filename=(self.suffix).split('arnumber=')[1]+'.pdf'

            # self.filename=(pdf_url).split('id=')[1].split('&')[0]+'.pdf'
            # self.pdf_Folder_filename = CurrentDir + "/"+self.PDF_Files_Dir+"/" + self.filename
            # self.W_pdf_Folder_filename = CurrentDir + "/"+self.Watermarked_PDF_Dir+"/" + self.filename
            self.pdf_Folder_filename = self.PDF_Files_Dir+"/" + self.filename
            self.W_pdf_Folder_filename =self.Watermarked_PDF_Dir+"/" + self.filename
            self.chdir=CurrentDir
        else:
            self.filename = urlparse.urlsplit(pdf_url).path.split('\\')[-1]
            self.chdir=CurrentDir
            # self.pdf_Folder_filename = CurrentDir+ "/"+self.PDF_Files_Dir+"/" + self.filename
            # self.W_pdf_Folder_filename = CurrentDir + "/"+self.Watermarked_PDF_Dir+"/" + self.filename
            self.pdf_Folder_filename =self.PDF_Files_Dir+"/" + self.filename
            self.W_pdf_Folder_filename =self.Watermarked_PDF_Dir+"/" + self.filename


        return self
Example no. 30
 def build_url (self):
     """
     Construct self.url and self.urlparts out of the given base
     url information self.base_url, self.parent_url and self.base_ref.
     """
     # norm base url - can raise UnicodeError from url.idna_encode()
     base_url, is_idn = url_norm(self.base_url, self.encoding)
     # make url absolute
     if self.base_ref:
         # use base reference as parent url
         if ":" not in self.base_ref:
             # some websites have a relative base reference
             self.base_ref = urljoin(self.parent_url, self.base_ref)
         self.url = urljoin(self.base_ref, base_url)
     elif self.parent_url:
         # strip the parent url query and anchor
         urlparts = list(urlparse.urlsplit(self.parent_url))
         urlparts[4] = ""
         parent_url = urlutil.urlunsplit(urlparts)
         self.url = urljoin(parent_url, base_url)
     else:
         self.url = base_url
     # urljoin can unnorm the url path, so norm it again
     urlparts = list(urlparse.urlsplit(self.url))
     if urlparts[2]:
         urlparts[2] = urlutil.collapse_segments(urlparts[2])
     self.url = urlutil.urlunsplit(urlparts)
     # split into (modifiable) list
     self.urlparts = strformat.url_unicode_split(self.url)
     # and unsplit again
     self.url = urlutil.urlunsplit(self.urlparts)
     self.build_url_parts()
Example no. 31
def process_url(raw_url):
 if ' ' not in raw_url[-1]:
     raw_url=raw_url.replace(' ','%20')
     return raw_url
 elif ' ' in raw_url[-1]:
     raw_url=raw_url[:-1]
     raw_url=raw_url.replace(' ','%20')
     return raw_url
 
url='' ## give the url here
parse_object=urlparse(url)
dirname=basename(parse_object.path)
if not os.path.exists('images'):
    os.mkdir("images")
os.mkdir("images/"+dirname)
os.chdir("images/"+dirname)
 
urlcontent=urllib2.urlopen(url).read()
imgurls=re.findall('img .*?src="(.*?)"',urlcontent)
for imgurl in imgurls:
 try:
     imgurl=process_url(imgurl)
     imgdata=urllib2.urlopen(imgurl).read()
     filename=basename(urlsplit(imgurl)[2])
     output=open(filename,'wb')
     output.write(imgdata)
     output.close()
 except:
     pass
Example no. 32
    # when a proxy is needed, make urllib2 use it
    proxy = repo.proxy
    proxy_username = repo.proxy_username
    proxy_password = repo.proxy_password

    if not proxy:
        proxy = get_proxy_for(repo.baseurl[0])

    handlers = []
    auth_handler = u2.HTTPBasicAuthHandler(
        u2.HTTPPasswordMgrWithDefaultRealm())
    u2opener = None
    if proxy:
        if proxy_username:
            proxy_netloc = urlparse.urlsplit(proxy).netloc
            if proxy_password:
                proxy_url = 'http://%s:%s@%s' % (proxy_username,
                                                 proxy_password, proxy_netloc)
            else:
                proxy_url = 'http://%s@%s' % (proxy_username, proxy_netloc)
        else:
            proxy_url = proxy

        proxy_support = u2.ProxyHandler({
            'http': proxy_url,
            'https': proxy_url,
            'ftp': proxy_url
        })
        handlers.append(proxy_support)
Example no. 33
status = 302
redirectCount = 0

url = service
if gnu:
    url = url + '?out=gnu'
else:
    url = url + '?out=text'

if errorsOnly:
    url = url + '&level=error'

while (status == 302 or status == 301 or status == 307) and redirectCount < 10:
    if redirectCount > 0:
        url = response.getheader('Location')
    parsed = urlparse.urlsplit(url)
    if parsed[0] != 'http':
        sys.stderr.write('URI scheme %s not supported.\n' % parsed[0])
        sys.exit(7)
    if redirectCount > 0:
        connection.close()  # previous connection
        print 'Redirecting to %s' % url
        print 'Please press enter to continue or type "stop" followed by enter to stop.'
        if raw_input() != "":
            sys.exit(0)
    connection = httplib.HTTPConnection(parsed[1])
    connection.connect()
    connection.putrequest("POST",
                          "%s?%s" % (parsed[2], parsed[3]),
                          skip_accept_encoding=1)
    connection.putheader("Accept-Encoding", 'gzip')
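For reference, the numeric indexes used above are the urlsplit positions (scheme, netloc, path, query); with an illustrative URL:

import urlparse

parsed = urlparse.urlsplit('http://validator.example.org/check?out=gnu&level=error')
print parsed[0], parsed[1], parsed[2], parsed[3]
# http validator.example.org /check out=gnu&level=error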
Example no. 34
 def get_prefix_source(cls):
     """Return the prefix source, by default derived from site."""
     if hasattr(cls, '_prefix_source'):
         return cls._prefix_source
     else:
         return urlparse.urlsplit(cls.site)[2]
Example no. 35
File: views.py Project: Kewtt/hue
    try:
        attempt_index = int(attempt_index)
        attempt = job.job_attempts['jobAttempt'][attempt_index]
        log_link = attempt['logsLink']
    except (KeyError, RestException), e:
        raise KeyError(
            _("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)

    link = '/%s/' % name
    params = {}
    if offset and int(offset) >= 0:
        params['start'] = offset

    root = Resource(get_log_client(log_link),
                    urlparse.urlsplit(log_link)[2],
                    urlencode=False)

    try:
        response = root.get(link, params=params)
        log = html.fromstring(response).xpath(
            '/html/body/table/tbody/tr/td[2]')[0].text_content()
    except Exception, e:
        log = _('Failed to retrieve log: %s') % e

    response = {'log': log}

    return HttpResponse(json.dumps(response), mimetype="application/json")


@check_job_permission
Example no. 36
def update_scm_url(scm_type,
                   url,
                   username=True,
                   password=True,
                   check_special_cases=True,
                   scp_format=False):
    '''
    Update the given SCM URL to add/replace/remove the username/password. When
    username/password is True, preserve existing username/password, when
    False (None, '', etc.), remove any existing username/password, otherwise
    replace username/password. Also validates the given URL.
    '''
    # Handle all of the URL formats supported by the SCM systems:
    # git: https://www.kernel.org/pub/software/scm/git/docs/git-clone.html#URLS
    # hg: http://www.selenic.com/mercurial/hg.1.html#url-paths
    # svn: http://svnbook.red-bean.com/en/1.7/svn-book.html#svn.advanced.reposurls
    if scm_type not in ('git', 'hg', 'svn', 'insights'):
        raise ValueError(_('Unsupported SCM type "%s"') % str(scm_type))
    if not url.strip():
        return ''
    parts = urlparse.urlsplit(url)
    try:
        parts.port
    except ValueError:
        raise ValueError(_('Invalid %s URL') % scm_type)
    if parts.scheme == 'git+ssh' and not scp_format:
        raise ValueError(_('Unsupported %s URL') % scm_type)

    if '://' not in url:
        # Handle SCP-style URLs for git (e.g. [user@]host.xz:path/to/repo.git/).
        if scm_type == 'git' and ':' in url:
            if '@' in url:
                userpass, hostpath = url.split('@', 1)
            else:
                userpass, hostpath = '', url
            if hostpath.count(':') > 1:
                raise ValueError(_('Invalid %s URL') % scm_type)
            host, path = hostpath.split(':', 1)
            #if not path.startswith('/') and not path.startswith('~/'):
            #    path = '~/%s' % path
            #if path.startswith('/'):
            #    path = path.lstrip('/')
            hostpath = '/'.join([host, path])
            modified_url = '@'.join(filter(None, [userpass, hostpath]))
            # git+ssh scheme identifies URLs that should be converted back to
            # SCP style before passed to git module.
            parts = urlparse.urlsplit('git+ssh://%s' % modified_url)
        # Handle local paths specified without file scheme (e.g. /path/to/foo).
        # Only supported by git and hg.
        elif scm_type in ('git', 'hg'):
            if not url.startswith('/'):
                parts = urlparse.urlsplit('file:///%s' % url)
            else:
                parts = urlparse.urlsplit('file://%s' % url)
        else:
            raise ValueError(_('Invalid %s URL') % scm_type)

    # Validate that scheme is valid for given scm_type.
    scm_type_schemes = {
        'git':
        ('ssh', 'git', 'git+ssh', 'http', 'https', 'ftp', 'ftps', 'file'),
        'hg': ('http', 'https', 'ssh', 'file'),
        'svn': ('http', 'https', 'svn', 'svn+ssh', 'file'),
        'insights': ('http', 'https')
    }
    if parts.scheme not in scm_type_schemes.get(scm_type, ()):
        raise ValueError(_('Unsupported %s URL') % scm_type)
    if parts.scheme == 'file' and parts.netloc not in ('', 'localhost'):
        raise ValueError(
            _('Unsupported host "%s" for file:// URL') % (parts.netloc))
    elif parts.scheme != 'file' and not parts.netloc:
        raise ValueError(_('Host is required for %s URL') % parts.scheme)
    if username is True:
        netloc_username = parts.username or ''
    elif username:
        netloc_username = username
    else:
        netloc_username = ''
    if password is True:
        netloc_password = parts.password or ''
    elif password:
        netloc_password = password
    else:
        netloc_password = ''

    # Special handling for github/bitbucket SSH URLs.
    if check_special_cases:
        special_git_hosts = ('github.com', 'bitbucket.org',
                             'altssh.bitbucket.org')
        if scm_type == 'git' and parts.scheme.endswith(
                'ssh'
        ) and parts.hostname in special_git_hosts and netloc_username != 'git':
            raise ValueError(
                _('Username must be "git" for SSH access to %s.') %
                parts.hostname)
        if scm_type == 'git' and parts.scheme.endswith(
                'ssh'
        ) and parts.hostname in special_git_hosts and netloc_password:
            #raise ValueError('Password not allowed for SSH access to %s.' % parts.hostname)
            netloc_password = ''
        special_hg_hosts = ('bitbucket.org', 'altssh.bitbucket.org')
        if scm_type == 'hg' and parts.scheme == 'ssh' and parts.hostname in special_hg_hosts and netloc_username != 'hg':
            raise ValueError(
                _('Username must be "hg" for SSH access to %s.') %
                parts.hostname)
        if scm_type == 'hg' and parts.scheme == 'ssh' and netloc_password:
            #raise ValueError('Password not supported for SSH with Mercurial.')
            netloc_password = ''

    if netloc_username and parts.scheme != 'file' and scm_type != "insights":
        netloc = u':'.join([
            urllib.quote(x, safe='')
            for x in (netloc_username, netloc_password) if x
        ])
    else:
        netloc = u''
    netloc = u'@'.join(filter(None, [netloc, parts.hostname]))
    if parts.port:
        netloc = u':'.join([netloc, unicode(parts.port)])
    new_url = urlparse.urlunsplit(
        [parts.scheme, netloc, parts.path, parts.query, parts.fragment])
    if scp_format and parts.scheme == 'git+ssh':
        new_url = new_url.replace('git+ssh://', '', 1).replace('/', ':', 1)
    return new_url
Example no. 37
    def __fetch_url(self, params):
        # Skip existing file if exists and matches checksum
        if not self.parent.force:
            if self.__is_file_done(local_path=params['target_file'],
                                   checksum_type=params['checksum_type'],
                                   checksum=params['checksum']):
                return True

        opts = URLGrabberOptions(ssl_ca_cert=params['ssl_ca_cert'],
                                 ssl_cert=params['ssl_client_cert'],
                                 ssl_key=params['ssl_client_key'],
                                 range=params['bytes_range'],
                                 proxy=params['proxy'],
                                 username=params['proxy_username'],
                                 password=params['proxy_password'],
                                 proxies=params['proxies'],
                                 http_headers=tuple(
                                     params['http_headers'].items()))
        mirrors = len(params['urls'])
        for retry in range(max(self.parent.retries, mirrors)):
            fo = None
            url = urlparse.urljoin(params['urls'][self.mirror],
                                   params['relative_path'])
            ## BEWARE: This hack is introduced in order to support SUSE SCC channels
            ## This also needs a patched urlgrabber AFAIK
            if 'authtoken' in params and params['authtoken']:
                (scheme, netloc, path, query,
                 _) = urlparse.urlsplit(params['urls'][self.mirror])
                url = "%s://%s%s/%s?%s" % (scheme, netloc, path,
                                           params['relative_path'],
                                           query.rstrip('/'))
            try:
                try:
                    fo = PyCurlFileObjectThread(url, params['target_file'],
                                                opts, self.curl, self.parent)
                    # Check target file
                    if not self.__is_file_done(
                            file_obj=fo,
                            checksum_type=params['checksum_type'],
                            checksum=params['checksum']):
                        raise FailedDownloadError(
                            "Target file isn't valid. Checksum should be %s (%s)."
                            % (params['checksum'], params['checksum_type']))
                    break
                except (FailedDownloadError, URLGrabError):
                    e = sys.exc_info()[1]
                    # urlgrabber-3.10.1-9 throws URLGrabError for both
                    # 'HTTP Error 404 - Not Found' and 'No space left on device',
                    # so the workaround is to check the error message:
                    if 'No space left on device' in str(e):
                        self.parent.fail_download(e)
                        return False

                    if not self.__can_retry(retry, mirrors, opts, url, e):
                        return False
                    self.__next_mirror(mirrors)
                # RHEL 6 urlgrabber raises KeyboardInterrupt for example when there is no space left
                # but handle also other fatal exceptions
                except (KeyboardInterrupt, Exception):  # pylint: disable=W0703
                    e = sys.exc_info()[1]
                    self.parent.fail_download(e)
                    return False
            finally:
                if fo:
                    fo.close()
                # Delete failed download file
                elif os.path.isfile(params['target_file']):
                    os.unlink(params['target_file'])

        return True
Example no. 38
def load_tool(argsworkflow,
              updateonly,
              strict,
              makeTool,
              debug,
              print_pre=False,
              print_rdf=False,
              print_dot=False,
              print_deps=False,
              relative_deps=False,
              rdf_serializer=None,
              stdout=sys.stdout,
              urifrag=None):
    # type: (Union[str,unicode,dict[unicode,Any]], bool, bool, Callable[...,Process], bool, bool, bool, bool, bool, bool, Any, Any, Any) -> Any
    (document_loader, avsc_names, schema_metadata) = process.get_schema()

    if isinstance(avsc_names, Exception):
        raise avsc_names

    jobobj = None
    uri = None  # type: str
    workflowobj = None  # type: Dict[unicode, Any]
    if isinstance(argsworkflow, (basestring)):
        split = urlparse.urlsplit(cast(str, argsworkflow))
        if split.scheme:
            uri = cast(str, argsworkflow)
        else:
            uri = "file://" + os.path.abspath(cast(str, argsworkflow))
        fileuri, urifrag = urlparse.urldefrag(uri)
        workflowobj = document_loader.fetch(fileuri)
    elif isinstance(argsworkflow, dict):
        workflowobj = argsworkflow
        uri = urifrag
        fileuri = "#"
    else:
        raise schema_salad.validate.ValidationException("Must be URI or dict")

    if "cwl:tool" in workflowobj:
        jobobj = workflowobj
        uri = urlparse.urljoin(uri, jobobj["cwl:tool"])
        fileuri, urifrag = urlparse.urldefrag(uri)
        workflowobj = document_loader.fetch(fileuri)
        del jobobj["cwl:tool"]

    if isinstance(workflowobj, list):
        # bare list without a version must be treated as draft-2
        workflowobj = {
            "cwlVersion": "https://w3id.org/cwl/cwl#draft-2",
            "id": fileuri,
            "@graph": workflowobj
        }

    workflowobj = update.update(workflowobj, document_loader, fileuri)
    document_loader.idx.clear()

    if updateonly:
        stdout.write(json.dumps(workflowobj, indent=4))
        return 0

    if print_deps:
        printdeps(workflowobj, document_loader, stdout, relative_deps)
        return 0

    try:
        processobj, metadata = schema_salad.schema.load_and_validate(
            document_loader, avsc_names, workflowobj, strict)
    except (schema_salad.validate.ValidationException, RuntimeError) as e:
        _logger.error(u"Tool definition failed validation:\n%s",
                      e,
                      exc_info=(e if debug else False))
        return 1

    if print_pre:
        stdout.write(json.dumps(processobj, indent=4))
        return 0

    if print_rdf:
        printrdf(str(argsworkflow), processobj, document_loader.ctx,
                 rdf_serializer, stdout)
        return 0

    if print_dot:
        printdot(str(argsworkflow), processobj, document_loader.ctx, stdout)
        return 0

    if urifrag:
        processobj, _ = document_loader.resolve_ref(uri)
    elif isinstance(processobj, list):
        if 1 == len(processobj):
            processobj = processobj[0]
        else:
            _logger.error(
                u"Tool file contains graph of multiple objects, must specify one of #%s",
                ", #".join(
                    urlparse.urldefrag(i["id"])[1] for i in processobj
                    if "id" in i))
            return 1

    try:
        t = makeTool(processobj,
                     strict=strict,
                     makeTool=makeTool,
                     loader=document_loader,
                     avsc_names=avsc_names)
    except (schema_salad.validate.ValidationException) as e:
        _logger.error(u"Tool definition failed validation:\n%s",
                      e,
                      exc_info=(e if debug else False))
        return 1
    except (RuntimeError, workflow.WorkflowException) as e:
        _logger.error(u"Tool definition failed initialization:\n%s",
                      e,
                      exc_info=(e if debug else False))
        return 1

    if jobobj:
        for inp in t.tool["inputs"]:
            if shortname(inp["id"]) in jobobj:
                inp["default"] = jobobj[shortname(inp["id"])]

    if metadata:
        t.metadata = metadata
    else:
        t.metadata = {
            "$namespaces": t.tool.get("$namespaces", {}),
            "$schemas": t.tool.get("$schemas", [])
        }

    return t
Example no. 39
 def check_file(self, fn):  # type: (unicode) -> bool
     if fn.startswith("file://"):
         u = urlparse.urlsplit(fn)
         return os.path.exists(u.path)
     else:
         return False
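A quick check of the file:// handling above (the path is illustrative): urlsplit puts the local path in .path, which os.path.exists can test directly.

import os
import urlparse

u = urlparse.urlsplit('file:///tmp/workflow.cwl')
print u.path                  # /tmp/workflow.cwl
print os.path.exists(u.path)  # True only if the file really exists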
Example no. 40
def _curl_setup_request(curl, request, buffer, headers):
    curl.setopt(pycurl.URL, str(request.url)) # xxx: cannot send in unicode strings?

    # libcurl's magic "Expect: 100-continue" behavior causes delays
    # with servers that don't support it (which include, among others,
    # Google's OpenID endpoint).  Additionally, this behavior has
    # a bug in conjunction with the curl_multi_socket_action API
    # (https://sourceforge.net/tracker/?func=detail&atid=100976&aid=3039744&group_id=976),
    # which increases the delays.  It's more trouble than it's worth,
    # so just turn off the feature (yes, setting Expect: to an empty
    # value is the official way to disable this)
    if "Expect" not in request.headers:
        request.headers["Expect"] = ""

    # libcurl adds Pragma: no-cache by default; disable that too
    if "Pragma" not in request.headers:
        request.headers["Pragma"] = ""

    # Request headers may be either a regular dict or HTTPHeaders object
    if isinstance(request.headers, httputil.HTTPHeaders):
        curl.setopt(pycurl.HTTPHEADER,
                    [utf8("%s: %s" % i) for i in request.headers.get_all()])
    else:
        curl.setopt(pycurl.HTTPHEADER,
                    [utf8("%s: %s" % i) for i in request.headers.iteritems()])

    if request.header_callback:
        curl.setopt(pycurl.HEADERFUNCTION, request.header_callback)
    else:
        curl.setopt(pycurl.HEADERFUNCTION,
                    lambda line: _curl_header_callback(headers, line))
    if request.streaming_callback:
        curl.setopt(pycurl.WRITEFUNCTION, request.streaming_callback)
    else:
        curl.setopt(pycurl.WRITEFUNCTION, buffer.write)
    if urlparse.urlsplit(_unicode(request.url)).scheme == 'https' and not request.validate_cert:
        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    curl.setopt(pycurl.FOLLOWLOCATION, request.follow_redirects)
    curl.setopt(pycurl.MAXREDIRS, request.max_redirects)
    curl.setopt(pycurl.CONNECTTIMEOUT, int(request.connect_timeout))
    curl.setopt(pycurl.TIMEOUT, int(request.request_timeout))
    if request.user_agent:
        curl.setopt(pycurl.USERAGENT, utf8(request.user_agent))
    else:
        curl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)")
    if request.network_interface:
        curl.setopt(pycurl.INTERFACE, request.network_interface)
    if request.use_gzip:
        curl.setopt(pycurl.ENCODING, "gzip,deflate")
    else:
        curl.setopt(pycurl.ENCODING, "none")
    if request.proxy_host and request.proxy_port:
        curl.setopt(pycurl.PROXY, request.proxy_host)
        curl.setopt(pycurl.PROXYPORT, request.proxy_port)
        if request.proxy_username:
            credentials = '%s:%s' % (request.proxy_username,
                    request.proxy_password)
            curl.setopt(pycurl.PROXYUSERPWD, credentials)
    else:
        curl.setopt(pycurl.PROXY, '')
    if request.validate_cert:
        curl.setopt(pycurl.SSL_VERIFYPEER, 1)
        curl.setopt(pycurl.SSL_VERIFYHOST, 2)
    else:
        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    if request.ca_certs is not None:
        curl.setopt(pycurl.CAINFO, request.ca_certs)
    else:
        # There is no way to restore pycurl.CAINFO to its default value
        # (Using unsetopt makes it reject all certificates).
        # I don't see any way to read the default value from python so it
        # can be restored later.  We'll have to just leave CAINFO untouched
        # if no ca_certs file was specified, and require that if any
        # request uses a custom ca_certs file, they all must.
        pass

    if request.allow_ipv6 is False:
        # Curl behaves reasonably when DNS resolution gives an ipv6 address
        # that we can't reach, so allow ipv6 unless the user asks to disable.
        # (but see version check in _process_queue above)
        curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)

    # Set the request method through curl's awkward interface, which makes
    # up its own name for almost every single method
    curl_options = {
        "GET": pycurl.HTTPGET,
        "POST": pycurl.POST,
        "PUT": pycurl.UPLOAD,
        "HEAD": pycurl.NOBODY,
    }
    custom_methods = set(["DELETE"])
    for o in curl_options.values():
        curl.setopt(o, False)
    if request.method in curl_options:
        curl.unsetopt(pycurl.CUSTOMREQUEST)
        curl.setopt(curl_options[request.method], True)
    elif request.allow_nonstandard_methods or request.method in custom_methods:
        curl.setopt(pycurl.CUSTOMREQUEST, request.method)
    else:
        raise KeyError('unknown method ' + request.method)

    # Handle curl's cryptic options for every individual HTTP method
    if request.method in ("POST", "PUT"):
        request_buffer = cStringIO.StringIO(utf8(request.body))
        curl.setopt(pycurl.READFUNCTION, request_buffer.read)
        if request.method == "POST":
            def ioctl(cmd):
                if cmd == curl.IOCMD_RESTARTREAD:
                    request_buffer.seek(0)
            curl.setopt(pycurl.IOCTLFUNCTION, ioctl)
            curl.setopt(pycurl.POSTFIELDSIZE, len(request.body))
        else:
            curl.setopt(pycurl.INFILESIZE, len(request.body))

    logmethod = 'info' if request.log_request else 'debug'

    if request.auth_username and request.auth_password:
        userpwd = "%s:%s" % (request.auth_username, request.auth_password)
        curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
        curl.setopt(pycurl.USERPWD, userpwd)
        getattr(logging, logmethod)(
            "%s %s (username: %r)", request.method, request.url,
            request.auth_username)
    else:
        curl.unsetopt(pycurl.USERPWD)
        getattr(logging, logmethod)("%s %s", request.method, request.url)

    if request.client_key is not None or request.client_cert is not None:
        raise ValueError("Client certificate not supported with curl_httpclient")

    if threading.activeCount() > 1:
        # libcurl/pycurl is not thread-safe by default.  When multiple threads
        # are used, signals should be disabled.  This has the side effect
        # of disabling DNS timeouts in some environments (when libcurl is
        # not linked against ares), so we don't do it when there is only one
        # thread.  Applications that use many short-lived threads may need
        # to set NOSIGNAL manually in a prepare_curl_callback since
        # there may not be any other threads running at the time we call
        # threading.activeCount.
        curl.setopt(pycurl.NOSIGNAL, 1)
    if request.prepare_curl_callback is not None:
        request.prepare_curl_callback(curl)
Esempio n. 41
0
 def url(self, url):
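     # Note: urlsplit() returns an empty string (not None) when a URL has no
     # netloc, so the check below never fires as written.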
     if urlparse.urlsplit(url).netloc is None:
         return self.url(url)
     body = {"url": url}
     return self.send_command("POST", "url", body)
Esempio n. 42
0
def getCookies(cookieName, url):
    domain = urlparse.urlsplit(url).netloc
    return common.getCookies(cookieName, domain)
Esempio n. 43
0
    def decode(self, content, url):
        result = content
        if not ALWAYS_CHAR_DETECT and self.encoding:  # try the previously used encoding first
            try:
                result = content.decode(self.encoding)
            except UnicodeDecodeError:  # decoding failed, fall back to auto-detected encoding
                encoding = chardet.detect(content)['encoding']
                try:
                    result = content.decode(encoding)
                except UnicodeDecodeError:  # still failing, return the content unconverted
                    self.encoding = None
                    result = content
                else:  # remember it for next time to save work
                    self.encoding = encoding
                    # also persist it to the database
                    netloc = urlparse.urlsplit(url)[1]
                    urlenc = UrlEncoding.all().filter('netloc = ',
                                                      netloc).get()
                    if urlenc:
                        enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
                        if enc != encoding:
                            if self.isfeed:
                                urlenc.feedenc = encoding
                            else:
                                urlenc.pageenc = encoding
                            urlenc.put()
                    elif self.isfeed:
                        UrlEncoding(netloc=netloc, feedenc=encoding).put()
                    else:
                        UrlEncoding(netloc=netloc, pageenc=encoding).put()
        else:  # no previous encoding information yet
            netloc = urlparse.urlsplit(url)[1]
            urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
            if urlenc:  # check the database first
                enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
                if enc:
                    try:
                        result = content.decode(enc)
                    except UnicodeDecodeError:  # failed, re-detect the encoding
                        self.encoding = chardet.detect(content)['encoding']
                    else:
                        self.encoding = enc
                        return result
                else:  # nothing in the database yet
                    self.encoding = chardet.detect(content)['encoding']
            else:
                self.encoding = chardet.detect(content)['encoding']

            # decode using the detected encoding
            try:
                result = content.decode(self.encoding)
            except UnicodeDecodeError:  # decoding failed, return the content unconverted
                result = content
            else:
                # save to the database
                newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
                if self.isfeed:
                    newurlenc.feedenc = self.encoding
                else:
                    newurlenc.pageenc = self.encoding
                newurlenc.put()
        return result
Esempio n. 44
0
def finish_conversation(self):

    if not (check_duplicate_url(self.host, self.uri)):
        if check_duplicate_uri(self.uri):
            self.uri = create_next_uri(self.uri)

        obj_num = len(conversations)
        conversations.append(namedtuple('Conv',
            ['id','server_ip', 'uri','req_head','res_body','res_head','res_num','res_type','host','referer', \
            'filename','method','redirect_to','req_microsec', 'res_len','magic_name', 'magic_ext']))
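        # Note: the namedtuple *class* itself is appended here; the assignments
        # below then set attributes on that class rather than on an instance.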

        host_tuple = (self.host, str(self.remote_host[0]) + ":" +
                      str(self.remote_host[1]))
        # hosts list
        if (hosts.has_key(host_tuple)):
            hosts[host_tuple].append(self.uri + "   [" + str(obj_num) + "]")
        else:
            hosts[host_tuple] = [self.uri + "   [" + str(obj_num) + "]"]

        # convs list
        conversations[obj_num].id = obj_num
        conversations[obj_num].server_ip = str(
            self.remote_host[0]) + ":" + str(self.remote_host[1])
        conversations[obj_num].uri = self.uri
        conversations[obj_num].redirect_to = self.redirect_to
        conversations[obj_num].short_uri = getShortURI(self.uri)
        conversations[obj_num].req_head = self.req_head
        conversations[obj_num].res_body = self.res_body
        add_object("body", self.res_body)

        try:
            # FindMagic
            mgc_name = ""
            mgc_ext = ""
            mgc_name, mgc_ext = WhatypeMagic.identify_buffer(self.res_body)
        except:
            pass

        conversations[obj_num].magic_name = mgc_name.rstrip()
        conversations[obj_num].magic_ext = mgc_ext.rstrip()

        conversations[obj_num].orig_chunked_resp = self.orig_chunked_resp
        conversations[obj_num].orig_resp = self.orig_resp
        conversations[obj_num].res_head = self.res_head
        conversations[obj_num].res_num = self.res_num

        if ";" in self.res_type:
            conversations[obj_num].res_type = self.res_type[:self.res_type.
                                                            find(";")]
        else:
            conversations[obj_num].res_type = self.res_type

        conversations[obj_num].host = self.host
        conversations[obj_num].referer = self.referer
        conversations[obj_num].filename = self.filename
        conversations[obj_num].method = self.method
        conversations[obj_num].req_microsec = str(self.time)[:10]

        # In case no filename was given from the server, split by URI
        if (conversations[obj_num].filename == ""):
            uri_name = urlparse.urlsplit(str(conversations[obj_num].uri)).path
            conversations[obj_num].filename = uri_name.split('/')[-1]

            if (str(conversations[obj_num].filename).find('?') > 0):
                conversations[obj_num].filename = \
                    conversations[obj_num].filename[:str(conversations[obj_num].filename).find('?')]

            if (str(conversations[obj_num].filename).find('&') > 0):
                conversations[obj_num].filename = \
                    conversations[obj_num].filename[:str(conversations[obj_num].filename).find('&')]

        # In case the URI was '/' then this is still empty
        if (conversations[obj_num].filename == ""):
            conversations[obj_num].filename = str(obj_num) + ".html"

        objects[obj_num].name = conversations[obj_num].filename
        conversations[obj_num].res_len = self.res_len
Esempio n. 45
0
def get_domain(task_id):
    task = Task.objects.get(id=task_id)
    _ = urlparse.urlsplit(task.start_url)
    domain = "%s://%s%s" % (_.scheme, _.netloc, task.base)
    return domain
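get_domain depends on a Django Task model, but the urlsplit recomposition can be seen in isolation; a minimal sketch, assuming Python 2's urlparse module and made-up values for task.start_url and task.base:

import urlparse

start_url = "https://shop.example.com/catalog/item?id=7"   # hypothetical task.start_url
base = "/catalog/"                                         # hypothetical task.base
parts = urlparse.urlsplit(start_url)
print("%s://%s%s" % (parts.scheme, parts.netloc, base))    # https://shop.example.com/catalog/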
Esempio n. 46
0
def geoserver_proxy(request,
                    proxy_path,
                    downstream_path,
                    workspace=None,
                    layername=None):
    """
    WARNING: Decorators are applied in the order they appear in the source.
    """
    # AF: No need to authenticate first. We will check if "access_token" is present
    # or not on session

    # @dismissed
    # if not request.user.is_authenticated():
    #     return HttpResponse(
    #         "You must be logged in to access GeoServer",
    #         content_type="text/plain",
    #         status=401)

    def strip_prefix(path, prefix):
        assert prefix in path
        prefix_idx = path.index(prefix)
        _prefix = path[:prefix_idx] + prefix
        full_prefix = "%s/%s/%s" % (
            _prefix, layername, downstream_path) if layername else _prefix
        return path[len(full_prefix):]

    path = strip_prefix(request.get_full_path(), proxy_path)

    raw_url = str(
        "".join([ogc_server_settings.LOCATION, downstream_path, path]))

    if settings.DEFAULT_WORKSPACE or workspace:
        ws = (workspace or settings.DEFAULT_WORKSPACE)
        if ws and ws in path:
            # Strip out WS from PATH
            try:
                path = "/%s" % strip_prefix(path, "/%s:" % (ws))
            except BaseException:
                pass

        if proxy_path == '/gs/%s' % settings.DEFAULT_WORKSPACE and layername:
            import posixpath
            raw_url = urljoin(ogc_server_settings.LOCATION,
                              posixpath.join(workspace, layername, downstream_path, path))

        if downstream_path in ('rest/styles') and len(request.body) > 0:
            if ws:
                # Lets try
                # http://localhost:8080/geoserver/rest/workspaces/<ws>/styles/<style>.xml
                _url = str("".join([ogc_server_settings.LOCATION,
                                    'rest/workspaces/', ws, '/styles',
                                    path]))
            else:
                _url = str("".join([ogc_server_settings.LOCATION,
                                    'rest/styles',
                                    path]))
            raw_url = _url

    if downstream_path in 'ows' and (
        'rest' in path or
            re.match(r'/(w.*s).*$', path, re.IGNORECASE) or
            re.match(r'/(ows).*$', path, re.IGNORECASE)):
        _url = str("".join([ogc_server_settings.LOCATION, '', path[1:]]))
        raw_url = _url
    url = urlsplit(raw_url)
    affected_layers = None

    if '%s/layers' % ws in path:
        downstream_path = 'rest/layers'
    elif '%s/styles' % ws in path:
        downstream_path = 'rest/styles'

    if request.method in ("POST", "PUT", "DELETE"):
        if downstream_path in ('rest/styles', 'rest/layers',
                               'rest/workspaces'):
            if not style_change_check(request, downstream_path):
                return HttpResponse(
                    _(
                        "You don't have permissions to change style for this layer"),
                    content_type="text/plain",
                    status=401)
            elif downstream_path == 'rest/styles':
                logger.info(
                    "[geoserver_proxy] Updating Style ---> url %s" %
                    url.geturl())
                affected_layers = style_update(request, raw_url)
            elif downstream_path == 'rest/layers':
                logger.debug(
                    "[geoserver_proxy] Updating Layer ---> url %s" %
                    url.geturl())
                try:
                    _layer_name = os.path.splitext(os.path.basename(request.path))[0]
                    _layer = Layer.objects.get(name__icontains=_layer_name)
                    affected_layers = [_layer]
                except BaseException:
                    logger.warn("Could not find any Layer %s on DB" % os.path.basename(request.path))

    kwargs = {'affected_layers': affected_layers}
    import urllib
    raw_url = urllib.unquote(raw_url).decode('utf8')
    timeout = getattr(ogc_server_settings, 'TIMEOUT') or 10
    allowed_hosts = [urlsplit(ogc_server_settings.public_url).hostname, ]
    return proxy(request, url=raw_url, response_callback=_response_callback,
                 timeout=timeout, allowed_hosts=allowed_hosts, **kwargs)
Esempio n. 47
0
def getFilenameFromURL(url):
    '''Gets the filename from a URL'''
    (unused_scheme, unused_netloc,
        path, unused_query, unused_fragment) = urlparse.urlsplit(url)
    return os.path.basename(path)
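A minimal sketch of what the snippet above does (the URL is made up): urlsplit isolates the path component, so the query string and fragment never leak into the basename.

import os
import urlparse

url = "http://example.com/downloads/report.pdf?session=42#page3"   # hypothetical URL
path = urlparse.urlsplit(url).path      # '/downloads/report.pdf'
print(os.path.basename(path))           # 'report.pdf'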
Esempio n. 48
0
 def separate_url(url):
     s = urlparse.urlsplit(url)
     if '@' not in s.netloc:
         parser.error('merged url netloc must contain an "@"')
     userpass, new_netloc = s.netloc.rsplit('@', 1)
     return urlparse.urlunsplit(s._replace(netloc=new_netloc)), userpass
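A hedged illustration of the split performed above (host and credentials are invented): _replace works because urlsplit returns a SplitResult namedtuple, so only the netloc is swapped out before reassembly.

import urlparse

merged = "https://alice:s3cret@repo.example.com/simple/"   # hypothetical merged URL
s = urlparse.urlsplit(merged)
userpass, netloc = s.netloc.rsplit('@', 1)
print(urlparse.urlunsplit(s._replace(netloc=netloc)))   # https://repo.example.com/simple/
print(userpass)                                         # alice:s3cret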
Esempio n. 49
0
def subscribe(service, action='subscribe'):
    """
    send a subscribe/renewal/unsubscribe request to a service
    return the device response
    """
    log_category = "event_protocol"
    log.info(log_category, "event.subscribe, action: %r", action)

    _,host_port,path,_,_ = urlsplit(service.get_base_url())
    if host_port.find(':') != -1:
        host,port = tuple(host_port.split(':'))
        port = int(port)
    else:
        host = host_port
        port = 80

    def send_request(p, action):
        log.info(log_category, "event.subscribe.send_request %r, action: %r %r",
                 p, action, service.get_event_sub_url())
        _,_,event_path,_,_ = urlsplit(service.get_event_sub_url())
        if action == 'subscribe':
            timeout = service.timeout
            if timeout == 0:
                timeout = 1800
            request = ["SUBSCRIBE %s HTTP/1.1" % event_path,
                        "HOST: %s:%d" % (host, port),
                        "TIMEOUT: Second-%d" % timeout,
                        ]
            service.event_connection = p
        else:
            request = ["UNSUBSCRIBE %s HTTP/1.1" % event_path,
                        "HOST: %s:%d" % (host, port),
                        ]

        if service.get_sid():
            request.append("SID: %s" % service.get_sid())
        else:
            # XXX use address and port set in the coherence instance
            #ip_address = p.transport.getHost().host
            global hostname, web_server_port
            #print hostname, web_server_port
            url = 'http://%s:%d/events' % (hostname, web_server_port)
            request.append("CALLBACK: <%s>" % url)
            request.append("NT: upnp:event")

        request.append('Date: %s' % datetimeToString())
        request.append("Content-Length: 0")
        request.append("")
        request.append("")
        request = '\r\n'.join(request)
        log.debug(log_category, "event.subscribe.send_request %r %r", request, p)
        try:
            p.transport.writeSomeData(request)
        except AttributeError:
            log.info(log_category, "transport for event %r already gone", action)
       # print "event.subscribe.send_request", d
        #return d

    def got_error(failure, action):
        log.info(log_category, "error on %s request with %s" % (action,service.get_base_url()))
        log.debug(log_category, failure)

    def teardown_connection(c, d):
        log.info(log_category, "event.subscribe.teardown_connection")
        del d
        del c

    def prepare_connection( service, action):
        log.info(log_category, "event.subscribe.prepare_connection action: %r %r",
                 action, service.event_connection)
        if service.event_connection == None:
            c = ClientCreator(reactor, EventProtocol, service=service, action=action)
            log.info(log_category, "event.subscribe.prepare_connection: %r %r",
                     host, port)
            d = c.connectTCP(host, port)
            d.addCallback(send_request, action=action)
            d.addErrback(got_error, action)
            #reactor.callLater(3, teardown_connection, c, d)
        else:
            d = defer.Deferred()
            d.addCallback(send_request, action=action)
            d.callback(service.event_connection)
            #send_request(service.event_connection, action)
        return d

    """ FIXME:
        we need to find a way to be sure that our unsubscribe calls get through
        on shutdown
        reactor.addSystemEventTrigger( 'before', 'shutdown', prepare_connection, service, action)
    """

    return prepare_connection(service, action)
Esempio n. 50
0
def post_url(url, fields, files=[]):
    urlparts = urlparse.urlsplit(url)
    return post_multipart(urlparts[1], urlparts[2], fields, files)
Esempio n. 51
0
    def __init__(self, params, status, calib_data, service_name):
        """

        :param params: pywws configuration.

        :type params: :class:`pywws.DataStore.params`
        
        :param status: pywws status store.

        :type status: :class:`pywws.DataStore.status`
        
        :param calib_data: 'calibrated' data.

        :type calib_data: :class:`pywws.DataStore.calib_store`

        :param service_name: name of service to upload to.

        :type service_name: string
    
        """
        self.logger = logging.getLogger('pywws.ToService(%s)' % service_name)
        self.params = params
        self.status = status
        self.data = calib_data
        self.service_name = service_name
        # 'derived' services such as 'underground_rf' share their
        # parent's config and templates
        config_section = self.service_name.split('_')[0]
        if config_section == self.service_name:
            self.parent = None
        else:
            self.parent = config_section
        self.old_response = None
        self.old_ex = None
        # set default socket timeout, so urlopen calls don't hang forever
        socket.setdefaulttimeout(30)
        # open params file
        service_params = SafeConfigParser()
        service_params.optionxform = str
        service_params.readfp(
            pkg_resources.resource_stream(
                'pywws', 'services/%s.ini' % (self.service_name)))
        # get URL
        self.server = service_params.get('config', 'url')
        parsed_url = urlparse.urlsplit(self.server)
        if parsed_url.scheme == 'aprs':
            self.send_data = self.aprs_send_data
            server, port = parsed_url.netloc.split(':')
            self.server = (server, int(port))
        elif parsed_url.scheme == 'mqtt':
            self.send_data = self.mqtt_send_data
        else:
            self.send_data = self.http_send_data
            self.use_get = eval(service_params.get('config', 'use get'))
        # get fixed part of upload data
        self.fixed_data = dict()
        for name, value in service_params.items('fixed'):
            if value[0] == '*':
                value = self.params.get(config_section, value[1:], 'unknown')
            self.fixed_data[name] = value
        # create templater
        self.templater = Template.Template(self.params,
                                           self.status,
                                           self.data,
                                           self.data,
                                           None,
                                           None,
                                           use_locale=False)
        template_name = 'services/%s_template_%s.txt' % (
            config_section, self.params.get('config', 'ws type'))
        if not pkg_resources.resource_exists('pywws', template_name):
            template_name = 'services/%s_template_1080.txt' % (config_section)
        self.template_file = pkg_resources.resource_stream(
            'pywws', template_name)
        # get other parameters
        self.auth_type = service_params.get('config', 'auth_type')
        if self.auth_type == 'basic':
            user = self.params.get(config_section, 'user', 'unknown')
            password = self.params.get(config_section, 'password', 'unknown')
            self.auth = 'Basic %s' % base64.b64encode('%s:%s' %
                                                      (user, password))
        self.catchup = eval(service_params.get('config', 'catchup'))
        self.expected_result = eval(service_params.get('config', 'result'))
        self.interval = eval(service_params.get('config', 'interval'))
        self.interval = max(self.interval, 40)
        self.interval = timedelta(seconds=self.interval)
        # move 'last update' from params to status
        last_update = self.params.get_datetime(self.service_name,
                                               'last update')
        if last_update:
            self.params.unset(self.service_name, 'last update')
            self.status.set('last update', self.service_name,
                            last_update.isoformat(' '))
        # set timestamp of first data to upload
        self.next_update = datetime.utcnow() - max(
            timedelta(days=self.catchup), self.interval)
Esempio n. 52
0
def delete(uri, filepath, depth="infinity"):
    """
    Perform a X{DELETE} operation on the given URI, which is backed by the given
    filepath.
    @param filepath: the L{FilePath} to delete.
    @param depth: the recursion X{Depth} for the X{DELETE} operation, which must
        be "infinity".
    @raise HTTPError: (containing a response with a status code of
        L{responsecode.BAD_REQUEST}) if C{depth} is not "infinity".
    @raise HTTPError: (containing an appropriate response) if the
        delete operation fails.  If C{filepath} is a directory, the response
        will be a L{MultiStatusResponse}.
    @return: a deferred response with a status code of L{responsecode.NO_CONTENT}
        if the X{DELETE} operation succeeds.
    """
    #
    # Remove the file(s)
    #
    # FIXME: defer
    if filepath.isdir():
        #
        # RFC 2518, section 8.6 says that we must act as if the Depth header is
        # set to infinity, and that the client must omit the Depth header or set
        # it to infinity, meaning that for collections, we will delete all
        # members.
        #
        # This seems somewhat at odds with the notion that a bad request should
        # be rejected outright; if the client sends a bad depth header, the
        # client is broken, and RFC 2518, section 8 suggests that a bad request
        # should be rejected...
        #
        # Let's play it safe for now and ignore broken clients.
        #

        if depth != "infinity":
            msg = ("Client sent illegal depth header value for DELETE: %s" %
                   (depth, ))
            log.error(msg)
            raise HTTPError(StatusResponse(responsecode.BAD_REQUEST, msg))

        #
        # Recursive delete
        #
        # RFC 2518, section 8.6 says that if we get an error deleting a resource
        # other than the collection in the request-URI, that we must respond
        # with a multi-status response containing error statuses for each
        # resource that we fail to delete.  It also says we should not return
        # no-content (success) status, which means that we should continue after
        # errors, rather than aborting right away.  This is interesting in that
        # it's different from how most operating system tools act (e.g. rm) when
        # recursive filesystem deletes fail.
        #

        uri_path = urllib.unquote(urlsplit(uri)[2])
        if uri_path[-1] == "/":
            uri_path = uri_path[:-1]

        log.info("Deleting directory %s" % (filepath.path, ))

        # NOTE: len(uri_path) is wrong if os.sep is not one byte long... meh.
        request_basename = filepath.path[:-len(uri_path)]

        errors = ResponseQueue(request_basename, "DELETE",
                               responsecode.NO_CONTENT)

        # FIXME: defer this
        for dir, subdirs, files in os.walk(filepath.path, topdown=False):
            for filename in files:
                path = os.path.join(dir, filename)
                try:
                    os.remove(path)
                except:
                    errors.add(path, Failure())

            for subdir in subdirs:
                path = os.path.join(dir, subdir)
                if os.path.islink(path):
                    try:
                        os.remove(path)
                    except:
                        errors.add(path, Failure())
                else:
                    try:
                        os.rmdir(path)
                    except:
                        errors.add(path, Failure())

        try:
            os.rmdir(filepath.path)
        except:
            raise HTTPError(
                statusForFailure(Failure(),
                                 "deleting directory: %s" % (filepath.path, )))

        response = errors.response()

    else:
        #
        # Delete a file; much simpler, eh?
        #
        log.info("Deleting file %s" % (filepath.path, ))
        try:
            os.remove(filepath.path)
        except:
            raise HTTPError(
                statusForFailure(Failure(),
                                 "deleting file: %s" % (filepath.path, )))

        response = responsecode.NO_CONTENT

    # Remove stat info for filepath since we deleted the backing file
    filepath.changed()

    return succeed(response)
Esempio n. 53
0
def parse_detail_wiki(url, title, desc):
    """
    解析API明细页面
    :param url: wiki地址
    :param desc: api描述
    :return: void
    """
    print "parsing detail api wiki page...."
    print url

    content = requests.get(url).text
    soup = BeautifulSoup(content, "html.parser")

    api_dict = dict()

    # find basic information
    api_node = soup.find('a', class_='external free')
    if not api_node:
        print url + " does not contain an API path"
        return
    api_url = api_node["href"]
    api_path = urlparse.urlparse(api_url).path
    api_name = api_url.rsplit('/', 1)[1]
    api_category = api_path.rsplit('/')[2]
    api_category = api_category.replace('.', '').title()
    api_signature = api_name.rsplit('.', 1)[0].title().replace('_', "")
    api_dict["url"] = api_url
    api_dict["description"] = desc
    api_dict["method"] = "GET"
    api_dict["signature"] = api_signature
    api_dict["category"] = api_category
    api_dict["path"] = api_path
    api_dict["title"] = title

    console = soup.find('a', text='API测试工具')
    if console:
        console_url = console["href"]
        api_dict["console"] = console_url
        params = dict(urlparse.parse_qsl(urlparse.urlsplit(console_url).query))
        http_method = params.get('httpmethod')
        if http_method == 'None':
            api_dict["method"] = 'GET'
        else:
            api_dict["method"] = http_method

    # parse parameters and responses
    tables = soup.find_all('table', class_='parameters')
    if len(tables) == 2:
        parameters, needs_auth = parse_request_parameters(tables[0])
        api_dict["parameters"] = parameters
        api_dict["authorize"] = needs_auth
        response = parse_response_model(tables[1])
        api_dict["response"] = response

    elif len(tables) == 1:
        parameters, needs_auth = parse_request_parameters(tables[0])
        api_dict["parameters"] = parameters
        api_dict["authorize"] = needs_auth
        api_dict["response"] = []
    else:
        print "!!!!!!!!!! detail page error"
        return

    filename = api_url.rsplit('/', 1)[1]
    if not filename.endswith('json'):
        return

    file_path = builds_dir + "/" + api_category + "_" + filename
    with open(file_path, mode='w') as f:
        f.write(
            json.dumps(api_dict,
                       encoding='UTF-8',
                       ensure_ascii=False,
                       indent=4))
    print "parsing detail page done ====== "
Esempio n. 54
0
    def __cleanup_requests(self):
        """Cleanup handles that have finished their request.
                Return the handles to the freelist.  Generate any
                relevant error information."""

        count, good, bad = self.__mhandle.info_read()
        failures = self.__failures
        done_handles = []
        ex_to_raise = None

        for h, en, em in bad:

            # Get statistics for each handle.
            repostats = self.__xport.stats[h.repourl]
            repostats.record_tx()
            bytes = h.getinfo(pycurl.SIZE_DOWNLOAD)
            seconds = h.getinfo(pycurl.TOTAL_TIME)
            repostats.record_progress(bytes, seconds)

            httpcode = h.getinfo(pycurl.RESPONSE_CODE)
            url = h.url
            urlstem = h.repourl
            proto = urlparse.urlsplit(url)[0]

            # All of these are errors
            repostats.record_error()

            # If we were cancelled, raise an API error.
            # Otherwise fall through to transport's exception
            # generation.
            if en == pycurl.E_ABORTED_BY_CALLBACK:
                ex = None
                ex_to_raise = api_errors.CanceledException
            elif en == pycurl.E_HTTP_RETURNED_ERROR:
                ex = tx.TransportProtoError(proto,
                                            httpcode,
                                            url,
                                            repourl=urlstem)
            else:
                ex = tx.TransportFrameworkError(en, url, em, repourl=urlstem)

            if ex and ex.retryable:
                failures.append(ex)
            elif ex and not ex_to_raise:
                ex_to_raise = ex

            done_handles.append(h)

        for h in good:
            # Get statistics for each handle.
            repostats = self.__xport.stats[h.repourl]
            repostats.record_tx()
            bytes = h.getinfo(pycurl.SIZE_DOWNLOAD)
            seconds = h.getinfo(pycurl.TOTAL_TIME)
            h.filetime = h.getinfo(pycurl.INFO_FILETIME)
            repostats.record_progress(bytes, seconds)

            httpcode = h.getinfo(pycurl.RESPONSE_CODE)
            url = h.url
            urlstem = h.repourl
            proto = urlparse.urlsplit(url)[0]

            if httpcode == httplib.OK:
                h.success = True
            else:
                ex = tx.TransportProtoError(proto,
                                            httpcode,
                                            url,
                                            repourl=urlstem)

                # If code >= 400, record this as an error.
                # Handlers above the engine get to decide
                # for 200/300 codes that aren't OK
                if httpcode >= 400:
                    repostats.record_error()
                # If code == 0, libcurl failed to read
                # any HTTP status.  Response is almost
                # certainly corrupted.
                elif httpcode == 0:
                    reason = "Invalid HTTP status code " \
                        "from server"
                    ex = tx.TransportProtoError(proto,
                                                url=url,
                                                reason=reason,
                                                repourl=urlstem)
                    ex.retryable = True

                # Stash retryable failures, arrange
                # to raise first fatal error after
                # cleanup.
                if ex.retryable:
                    failures.append(ex)
                elif not ex_to_raise:
                    ex_to_raise = ex

            done_handles.append(h)

        # Call to remove_handle must be separate from info_read()
        for h in done_handles:
            self.__mhandle.remove_handle(h)
            self.__teardown_handle(h)
            self.__freehandles.append(h)

        self.__failures = failures

        if ex_to_raise:
            raise ex_to_raise
Esempio n. 55
0
    def render_template(self, template_name, output_name, context):
        data = self.template_system.render_template(template_name, None,
                                                    context,
                                                    self.GLOBAL_CONTEXT)

        assert output_name.startswith(self.config["OUTPUT_FOLDER"])
        url_part = output_name[len(self.config["OUTPUT_FOLDER"]) + 1:]

        # This is to support windows paths
        url_part = "/".join(url_part.split(os.sep))

        src = urlparse.urljoin(self.config["BLOG_URL"], url_part)

        parsed_src = urlparse.urlsplit(src)
        src_elems = parsed_src.path.split('/')[1:]

        def replacer(dst):
            # Refuse to replace links that are full URLs.
            dst_url = urlparse.urlparse(dst)
            if dst_url.netloc:
                if dst_url.scheme == 'link':  # Magic link
                    dst = self.link(dst_url.netloc, dst_url.path.lstrip('/'),
                                    context['lang'])
                else:
                    return dst

            # Normalize
            dst = urlparse.urljoin(src, dst)
            # Avoid empty links.
            if src == dst:
                return "#"
            # Check that link can be made relative, otherwise return dest
            parsed_dst = urlparse.urlsplit(dst)
            if parsed_src[:2] != parsed_dst[:2]:
                return dst

            # Now both paths are on the same site and absolute
            dst_elems = parsed_dst.path.split('/')[1:]

            i = 0
            for (i, s), d in zip(enumerate(src_elems), dst_elems):
                if s != d:
                    break
            # Now i is the longest common prefix
            result = '/'.join(['..'] * (len(src_elems) - i - 1) +
                              dst_elems[i:])

            if not result:
                result = "."

            # Don't forget the fragment (anchor) part of the link
            if parsed_dst.fragment:
                result += "#" + parsed_dst.fragment

            assert result, (src, dst, i, src_elems, dst_elems)

            return result

        try:
            os.makedirs(os.path.dirname(output_name))
        except:
            pass
        doc = lxml.html.document_fromstring(data)
        doc.rewrite_links(replacer)
        data = '<!DOCTYPE html>' + lxml.html.tostring(doc, encoding='utf8')
        with open(output_name, "w+") as post_file:
            post_file.write(data)
Esempio n. 56
0
 def thumbvid(self):
     vid = urlparse.parse_qs(urlparse.urlsplit(self.url).query)['v'][0]
     embed = '<iframe width="120" height="90"'
     embed += 'src="http://www.youtube.com/embed/%s"' % (vid, )
     embed += 'frameborder="0" allowfullscreen></iframe>'
     return embed
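The query-string lookup in thumbvid can be exercised on its own; a small sketch with an invented watch URL, showing that parse_qs maps each parameter to a list of values:

import urlparse

watch_url = "http://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42"   # hypothetical URL
query = urlparse.urlsplit(watch_url).query     # 'v=dQw4w9WgXcQ&t=42'
print(urlparse.parse_qs(query)['v'][0])        # 'dQw4w9WgXcQ'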
Esempio n. 57
0
def make_url_protocol_relative(url):
    if not url or url.startswith("//"):
        return url

    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    return urlparse.urlunsplit((None, netloc, path, query, fragment))
Esempio n. 58
0
    def __call__(self, env):
        is_https = (env['REQUEST_METHOD'] == 'CONNECT')

        # for non-https requests, check non-proxy urls
        if not is_https:
            url = env['REL_REQUEST_URI']

            if not url.startswith(('http://', 'https://')):
                return None

            env['pywb.proxy_scheme'] = 'http'

        route = None
        coll = None
        matcher = None
        response = None
        ts = None

        # check resolver, for pre connect resolve
        if self.resolver.pre_connect:
            route, coll, matcher, ts, response = self.resolver.resolve(env)
            if response:
                return response

        # do connect, then get updated url
        if is_https:
            response = self.handle_connect(env)
            if response:
                return response

            url = env['REL_REQUEST_URI']
        else:
            parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
            hostport = parts.netloc.split(':', 1)
            env['pywb.proxy_host'] = hostport[0]
            env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
            env['pywb.proxy_req_uri'] = parts.path
            if parts.query:
                env['pywb.proxy_req_uri'] += '?' + parts.query

        env['pywb_proxy_magic'] = self.magic_name

        # route (static) and other resources to archival replay
        if env['pywb.proxy_host'] == self.magic_name:
            env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']

            # special case for proxy install
            response = self.handle_cert_install(env)
            if response:
                return response

            return None

        # check resolver, post connect
        if not self.resolver.pre_connect:
            route, coll, matcher, ts, response = self.resolver.resolve(env)
            if response:
                return response

        host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
        rel_prefix = ''

        # special case for proxy calendar
        if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
            url = env['pywb.proxy_req_uri'][1:]
            rel_prefix = '/'

        if ts is not None:
            url = ts + '/' + url

        wbrequest = route.request_class(
            env,
            request_uri=url,
            wb_url_str=url,
            coll=coll,
            host_prefix=host_prefix,
            rel_prefix=rel_prefix,
            wburl_class=route.handler.get_wburl_type(),
            urlrewriter_class=HttpsUrlRewriter,
            use_abs_prefix=False,
            is_proxy=True)

        if matcher:
            route.apply_filters(wbrequest, matcher)

        if self.unaltered:
            wbrequest.wb_url.mod = 'id_'
        elif is_https:
            wbrequest.wb_url.mod = 'bn_'

        response = route.handler(wbrequest)

        if wbrequest.wb_url and wbrequest.wb_url.is_replay():
            response.status_headers.replace_headers(self.extra_headers)

        return response
Esempio n. 59
0
def getOrigin(url):
    origin = urlparse.urlsplit(url)
    origin = urlparse.urlunparse(
        (origin.scheme, origin.netloc, '', '', '', ''))
    return origin
Esempio n. 60
0
def make_url_https(url):
    if not url or url.startswith("https://"):
        return url

    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    return urlparse.urlunsplit(("https", netloc, path, query, fragment))
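A short hedged check of make_url_https above and make_url_protocol_relative from Esempio n. 57 (the URL is illustrative): swapping the scheme for None or "https" in urlunsplit leaves netloc, path, query and fragment untouched.

import urlparse

url = "http://example.com/a/b?x=1#top"   # hypothetical URL
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
print(urlparse.urlunsplit((None, netloc, path, query, fragment)))      # //example.com/a/b?x=1#top
print(urlparse.urlunsplit(("https", netloc, path, query, fragment)))   # https://example.com/a/b?x=1#top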