Example #1
0
    def find_pingback_urls(self, urls):
        """Find the pingback urls of each urls"""
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                if 'text/' not in headers.get('Content-Type', '').lower():
                    continue

                server_url = headers.get('X-Pingback')
                if not server_url:
                    server_url = self.find_pingback_href(page.read())

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
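A minimal sketch of the resolution step above, assuming hypothetical URLs: when the advertised pingback endpoint has no host of its own, urlsplit is used to borrow the scheme and netloc of the page that advertised it.

from urllib.parse import urlsplit

def resolve_pingback(page_url, server_url):
    # If the endpoint is only a path, rebuild it against the page's origin.
    if not urlsplit(server_url).netloc:
        page = urlsplit(page_url)
        server_url = '%s://%s%s' % (page.scheme, page.netloc, server_url)
    return server_url

print(resolve_pingback('https://example.com/post/1', '/xmlrpc.php'))
# -> https://example.com/xmlrpc.php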
Example #2
0
    def __init__(
        self,
        url=DEFAULT_URI,
        name=None,
        ssl_required=False,
        verbose=False,
        pedantic=False,
        socket_keepalive=False
    ):
        self._connect_timeout = None
        self._socket_keepalive = socket_keepalive
        self._socket = None
        self._socket_file = None
        self._subscriptions = {}
        self._next_sid = 1
        self._server = None
        self._server_index = 0

        if isinstance(url, (list, tuple)):
            urls = [urlparse.urlsplit(x) for x in url]
        else:
            urls = [urlparse.urlsplit(url)]

        self._options = {
            'url': urls,
            'name': name,
            'ssl_required': ssl_required,
            'verbose': verbose,
            'pedantic': pedantic
        }
Example #3
    def __init__(self, registry, url="", auth=None, verify=False,
                 api_timeout=None):
        # Registry ip:port
        self.registry = urlsplit(registry).netloc
        # Service url, ip:port
        self.url = url
        # Authentication (user, password) or None. Used by request to do
        # basicauth
        self.auth = auth
        # Timeout for HTTP request
        self.api_timeout = api_timeout

        # Desired scope is the scope needed for the next operation on the
        # registry
        self.desired_scope = ""
        # Scope of the token we have
        self.scope = ""
        # Token used to authenticate
        self.token = ""
        # Boolean to enforce https checks. Used by request
        self.verify = verify

        # If we have no url then tokens are not required; get_new_token will not
        # be called.
        if url:
            split = urlsplit(url)
            # user in url will take precedence over the given username
            if split.username and split.password:
                self.auth = (split.username, split.password)

            self.token_required = True
        else:
            self.token_required = False
Example #4
0
 def extract_password_row(self, row):
     res = ''
     hostname_split = urlparse.urlsplit(row[0])
     website = urlparse.urlunsplit((hostname_split.scheme, hostname_split.netloc, "", "", "")).strip('\n')
     username = ''
     password = ''
     form_url = ''
     user_field = ''
     pass_field = ''
     form_url_split = urlparse.urlsplit(row[1])
     form_url = urlparse.urlunsplit((form_url_split.scheme, form_url_split.netloc, "", "", "")).strip('\n')
     # (a debug print of the raw username/password was redacted in the source)
     try:
         username = row[3]
         password = self.decode_password(row[5])
         self.num_passwords += 1
     except Exception:
         # non-password entries (blacklists) are ignored
         print('ERROR - non password entry (blacklists - ignoring)')
     res = self.format_list_csv([website, username, form_url, user_field, pass_field, password])
     return res
Example #5
0
def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
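The same-host filter above boils down to comparing netloc values; a standalone sketch with made-up hosts:

from urllib.parse import urlsplit

start_url = 'https://example.com/index.html'
url_filter = lambda url: urlsplit(url).netloc == urlsplit(start_url).netloc

print(url_filter('https://example.com/about'))  # True  - same host
print(url_filter('https://other.org/about'))    # False - different host
print(url_filter('/relative/path'))             # False - no netloc at all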
Example #6
0
 def zoom_article(self, ticket_id, article_id):
     art_descr = self.__db.article_description(article_id)
     if art_descr[4] & ART_TEXT:
         return eval(self.__db.article_message(article_id))
     self.echo("Zoom article:", ticket_id, article_id)
     url_beg = urlsplit(self.runtime.get("site"))[:3]
     params = (
         ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"),
         ("TicketID", ticket_id), ("ArticleID", article_id),
         ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"]))
     url = urlunsplit(url_beg + (urlencode(params), ""))
     pg = TicketsPage(self.core)
     page = pg.load(url)
     if page is None:
         return
     mail_header = page.get("mail_header", [])
     if "mail_src" in page:
         url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:])
         self.echo("Get message:", url)
         pg = MessagePage(self.core)
         try:
             mail_text = pg.load(url)
         except LoginError:
             mail_text = pg.login()
     else:
         mail_text = page["message_text"]
     if mail_header:
         mail_text.insert(0, ("\n",))
     for i in reversed(mail_header):
         mail_text.insert(0, ("%s\t%s\n" % i,))
     shrink_tupled_text(mail_text)
     self.__db.article_message(article_id, repr(mail_text))
     return mail_text
Example #7
0
def get_fetcher(url=None, *, item=dict()):
	RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'}
	
	url = item.get("url", url)
	if urlsplit(url).scheme in RTMP_PROTOCOLS:
		return RtmpFetcher(url, live=True)
	
	auth = comm.get_auth()
	protocol = urlsplit(auth['server']).scheme
	if protocol in RTMP_PROTOCOLS:
		(url, ext) = url.rsplit('.', 1) # strip the extension (.flv or .mp4)
		url = auth['playpath_prefix'] + url

		if ext == 'mp4':
			url = 'mp4:' + url

		rtmp_url = auth['rtmp_url']
		token = auth.get('token')
		if token:
		    # Cannot use urljoin() because
		    # the RTMP scheme would have to be added to its whitelist
		    rtmp_url += '?auth=' + token
		
		return RtmpFetcher(rtmp_url, playpath=url)
	else:
		return HdsFetcher(url, auth)
Example #8
0
    def https_open(self, request):
        """
        Send an HTTP request, which can be either GET or POST,
        depending on req.has_data()

        Args:
            request - instance of urllib2.Request
        """
        full_url = request.get_full_url()
        url_parts = parse.urlsplit(full_url)
        robo = None
        if url_parts.netloc in self.robots:
            robo = self.robots[url_parts.netloc]
        else:
            # Getting request url, for checking robots.txt
            host = parse.urlsplit(full_url)[1]
            rurl = parse.urlunsplit(("http", host, "/robots.txt", "", ""))
            robo = reppy.cache.RobotsCache()
            robo.fetch(rurl, self.agent_name)
            self.robots[url_parts.netloc] = robo

        # Is the URL allowed for this crawler by robots.txt?
        if robo.allowed(full_url, self.agent_name):
            # Delegate to the parent handler (assumed to subclass
            # urllib.request.HTTPSHandler) and return its response.
            return super().https_open(request)
        else:
            raise RuntimeError('Forbidden by robots.txt')
Example #9
0
    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
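The '?#' branch exists so that a font URL written as url(myfont.eot?#iefix) keeps its empty query once the fragment is re-attached. A rough, standalone illustration of that urlsplit/urlunsplit round trip (the hashed file name is made up):

from urllib.parse import urlsplit, urlunsplit

name = 'myfont.eot?#iefix'
final_url = '/static/myfont.abc123.eot'  # pretend this is the hashed URL
fragment = 'iefix'

parts = list(urlsplit(final_url))
if fragment and not parts[4]:
    parts[4] = fragment      # restore the fragment
if '?#' in name and not parts[3]:
    parts[2] += '?'          # keep the empty query marker before the fragment
print(urlunsplit(parts))     # -> /static/myfont.abc123.eot?#iefix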
Example #10
0
    def test_flow(self):
        url = self.sp.make_auth_req()
        status, headers, _ = self.getPage(url)
        assert status == '303 See Other'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLRequest' in req
        assert 'RelayState' in req

        action, body = self.idp.handle_auth_req(req['SAMLRequest'][0],
                                                req['RelayState'][0],
                                                BINDING_HTTP_REDIRECT,
                                                'test1')
        status, headers, body = self.getPage(action, method='POST',
                                             body=urlencode(body))
        assert status == '302 Found'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLResponse' in req
        assert 'RelayState' in req
        resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0],
                                                    BINDING_HTTP_REDIRECT)
        identity = resp.ava
        assert identity["displayName"][0] == "Test1"
        assert identity["sn"][0] == "test1@valueA"
        assert identity['o'][0] == "Small university"
Example #11
def forwards(apps, schema_editor):
    MenuItem = apps.get_model('external_services', 'MenuItem')
    items = (MenuItem.objects.all()
             .exclude(service=None)
             .exclude(menu_url=None)
             .exclude(menu_url=''))
    errors = []
    for item in items:
        uri1 = urlsplit(item.menu_url)
        uri2 = urlsplit(item.service.url)
        if uri1.netloc and uri1.netloc != uri2.netloc:
            errors.append(item)
    if errors:
        print()
        msg = ['Database is in inconsistent state.']
        for item in errors:
            msg.append("  MenuItem(pk=%s): %s <> %s" % (item.pk, item.menu_url, item.service.url))
        msg.append("For above menuitems, domain in MenuItem.menu_url doesn't match domain in MenuItem.service.url.")
        msg.append("Database is in inconsistent state. Manual fixing is required.")
        raise RuntimeError('\n'.join(msg))
    for item in items:
        uri = urlsplit(item.menu_url)
        url = uri._replace(scheme='', netloc='').geturl()
        item.menu_url = url
        item.save(update_fields=['menu_url'])
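The relativizing trick at the end works because SplitResult is a namedtuple: _replace() blanks the scheme and host, and geturl() reassembles what is left. A small sketch with an invented URL:

from urllib.parse import urlsplit

menu_url = 'https://lms.example.org/services/menu/?lang=en'
print(urlsplit(menu_url)._replace(scheme='', netloc='').geturl())
# -> /services/menu/?lang=en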
Example #12
0
    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example #13
0
def clean_url(value):
    """
    Taken from Django's URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid URL is passed.

    Example:

    >>> clean_url("www.google.com")
    "http://www.google.com"

    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit((value)))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, that the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit((urlunsplit(url_fields))))
        if not url_fields[2]:
            # the path portion may need to be added before query params
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value
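The netloc-fixing branch can look surprising; this is roughly what happens, step by step, for the docstring's scheme-less input:

from urllib.parse import urlsplit, urlunsplit

fields = list(urlsplit('www.google.com'))
fields[0] = 'http'                    # no scheme given, assume http
fields[1], fields[2] = fields[2], ''  # the "path" was really the host
fields = list(urlsplit(urlunsplit(fields)))
if not fields[2]:
    fields[2] = '/'                   # make sure a path precedes any query
print(urlunsplit(fields))             # -> http://www.google.com/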
Example #14
0
    def assertRedirects(self, response, expected_url, status_code=302,
                        target_status_code=200, host=None):
        """Asserts that a response redirected to a specific URL, and that the
        redirect URL can be loaded.

        Note that assertRedirects won't work for external links since it uses
        TestClient to do a request.
        """
        self.assertEqual(response.status_code, status_code,
            ("Response didn't redirect as expected: Response code was %d"
             " (expected %d)" % (response.status_code, status_code)))
        url = response['Location']
        scheme, netloc, path, query, fragment = urlsplit(url)
        e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
        if not (e_scheme or e_netloc):
            expected_url = urlunsplit(('http', host or 'testserver', e_path,
                    e_query, e_fragment))
        self.assertEqual(url, expected_url,
            "Response redirected to '%s', expected '%s'" % (url, expected_url))

        # Get the redirection page, using the same client that was used
        # to obtain the original response.
        redirect_response = response.client.get(path, QueryDict(query))
        self.assertEqual(redirect_response.status_code, target_status_code,
            ("Couldn't retrieve redirection page '%s': response code was %d"
             " (expected %d)") %
                 (path, redirect_response.status_code, target_status_code))
Example #15
0
    def sitelinks(self, html_page, url):
        """Finds all links in the provided html page"""
        bs = BeautifulSoup(html_page)
        links = set()
        urlpart = urlsplit(url)

        try:
            for anchor in bs.find_all('a'):
                linkpart = list(urlsplit(anchor['href']))
                linkpart[4] = '' #remove the fragment

                if linkpart[0] == '':
                    linkpart[0] = urlpart.scheme

                if linkpart[1] == '':
                    linkpart[1] = urlpart.netloc

                if linkpart[0] == urlpart.scheme and linkpart[1] == urlpart.netloc:
                    if linkpart[2].startswith('/'):
                        links.add(urlunsplit(linkpart))
                    elif linkpart[2] != '':
                        #relative URL.
                        links.add(urljoin(url, linkpart[2]))
        except KeyError:
            pass

        return links
Example #16
0
 def hashed_name(self, name, content=None, filename=None):
     # `filename` is the name of file to hash if `content` isn't given.
     # `name` is the base name to construct the new hashed filename from.
     parsed_name = urlsplit(unquote(name))
     clean_name = parsed_name.path.strip()
     filename = (filename and urlsplit(unquote(filename)).path.strip()) or clean_name
     opened = content is None
     if opened:
         if not self.exists(filename):
             raise ValueError("The file '%s' could not be found with %r." % (filename, self))
         try:
             content = self.open(filename)
         except IOError:
             # Handle directory paths and fragments
             return name
     try:
         file_hash = self.file_hash(clean_name, content)
     finally:
         if opened:
             content.close()
     path, filename = os.path.split(clean_name)
     root, ext = os.path.splitext(filename)
     if file_hash is not None:
         file_hash = ".%s" % file_hash
     hashed_name = os.path.join(path, "%s%s%s" %
                                (root, file_hash, ext))
     unparsed_name = list(parsed_name)
     unparsed_name[2] = hashed_name
     # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
     # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
     if '?#' in name and not unparsed_name[3]:
         unparsed_name[2] += '?'
     return urlunsplit(unparsed_name)
Example #17
0
File: infra.py Project: lachm/fbbot
 def __form_data(text, formid, params, soup=None, form_url=None):
     if type(params) is not dict:
         raise TypeError('Params must be a dict')
     if soup is None:
         soup = BeautifulSoup(text, 'html.parser')
     form = soup.find('form', attrs={'id': formid})
     action = form.attrs.get('action')
     if not urlsplit(action).netloc:
         if form_url is None or not urlsplit(form_url).netloc:
             raise ValueError('kwarg form_url must be specified if form '
                              'action lacks a host')
         action = urljoin(form_url, action)
     inputs = form.find_all('input') + form.find_all('textarea')
     for i in inputs:
         try:
             name = i.attrs['name']
             type_ = i.attrs['type']
             value = params.get(name)
             if type_ == 'submit':
                 continue
             elif type_ == 'hidden':
                 value = i.attrs['value'] if value is None else value
             elif value is None:
                 raise ValueError('kwarg params dictionary is missing a '
                                  'value for a non-hidden field')
         except KeyError:
             pass
         else:
             params[name] = value
     return Session.FormInfo(params=params, post_url=action)
Example #18
0
 def oauth(self, req, credentials = None, params = {}):
     #NOTE: While flickr supports HTTPS in its oauth endpoints, flickr
     #thinks that the HTTPS endpoints are being accessed via HTTP, and thus
     #constructs the signature base string accordingly, which
     #will hence not match the signature base string generated by
     #pyoauth1client. We solve this by replacing HTTPS with HTTP
     #when generating the signature base string, and then revert the change
     #after the base string is generated. This way the signature
     #base string will match the one generated by flickr even though
     #we are accessing the endpoints via HTTPS for ADDED SECURITY!!!111one
     x = urlsplit(req.url)
     if x.scheme == "https":
         #Remove the HTTPS Scheme
         https = True
         x = x._replace(scheme = "http")
         req = req._replace(url = urlunsplit(x))
     else:
         https = False
     y = super().oauth(req, credentials, params)
     if https:
         #Add back the HTTPS scheme
         x = urlsplit(y.url)
         x = x._replace(scheme = "https")
         y = y._replace(url = urlunsplit(x))
     return y
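The scheme swap described in the comment is just a _replace() on the SplitResult; for instance (the endpoint URL is illustrative):

from urllib.parse import urlsplit, urlunsplit

url = 'https://www.flickr.com/services/oauth/request_token'
print(urlunsplit(urlsplit(url)._replace(scheme='http')))
# -> http://www.flickr.com/services/oauth/request_token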
Example #19
0
    def run(self):
        while True:
            # grabs url from queue
            level, u = self.input_q.get()

            main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u))

            # fetching urls
            if level < MAX_URL_LEVEL:
                html = _get_content(u)
                if not isinstance(html, list):
                    soup = bs(html)
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        
                        if not href or len(href) < 2:
                            continue

                        # Check if URL is relative
                        elif not urlsplit(href)[0] and not urlsplit(href)[1]:
                            self.output_q.put((level+1, _url_discard(urljoin(u, href))))
                        
                        elif href.startswith(main):
                            self.output_q.put((level+1, _url_discard(href)))
                else:
                    # Place for possible error logs (:
                    pass

            # signals to queue job is done
            self.input_q.task_done()
Example #20
0
def _main():
    base_url = sys.argv[1]
    soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252")
    index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")]
    for index_url in index_urls:
        try:
            resp = urlopen(index_url)
        except HTTPError as err:
            print(err, err.url, file=sys.stderr)
            print("Skipping..", file=sys.stderr)
            continue
        index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1")
        index_path = urlsplit(index_url).path
        index_filepath = os.path.normpath("." + index_path)
        try:
            os.makedirs(os.path.dirname(index_filepath))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise e
        for issue_url in iter_issue_urls(index_soup):
            issue_url = urljoin(index_url, issue_url)
            try:
                resp = urlopen(issue_url)
            except HTTPError as err:
                print(err, err.url, file=sys.stderr)
                print("Skipping..", file=sys.stderr)
                continue
            issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252")
            issue_path = urlsplit(issue_url).path
            issue_filepath = os.path.normpath("." + issue_path)
            with open(issue_filepath, "w") as f:
                print(klupu.clean_soup(issue_soup), file=f)
        with open(index_filepath, "w") as f:
            print(klupu.clean_soup(index_soup), file=f)
Example #21
0
def main(GET):
	global mail,error,error_list
	parser = argparse.ArgumentParser(description='Scrape a simple site.')
	parser.add_argument('url', help='the URL at which to begin')
	start_url = parser.parse_args().url
	starting_netloc = urlsplit(start_url).netloc
	url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
	scrape((GET, start_url), url_filter)
	print ("\n\nresult--------------------------------\nerror:%d" %(error))
	count = 1;
	for url in error_list:
		print(url)
	print("\n")
	for url in mail:
		print("[%d]url:%s" %(count,url))
		data = mail[url][0]
		if data:
			tmp = []
			for val in data:
				
				if not val in tmp:
					print (val)
				tmp.append(val)
			
		else:
			print("None")
		print ("")
		count+=1
Example #22
0
def parse_url(link):
    """Say Website Title information in channel"""
    baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
    path = urlsplit(link).path
    query = '?{uri.query}'.format(uri=urlsplit(link))
    try:
        headers = {'Accept-Encoding': 'utf-8',
                   'User-Agent': 'Mozilla/5.0'}
        response = get(baseurl + path + query, headers=headers)
    except:
        return
    if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]:
        try:
            URL = BeautifulSoup(response.text, "html.parser")
        except:
            return
        if not URL.title:
            return
        if URL.title.string is None:
            return
        if len(URL.title.string) > 250:
            title=URL.title.string[0:250] + '…'
        else:
            title=URL.title.string
        return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")"
    else:
        return
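Recombining the pieces by hand, as parse_url does, keeps the query but drops the fragment; a quick sketch with a made-up link:

from urllib.parse import urlsplit

link = 'https://example.org/article?id=42#comments'
uri = urlsplit(link)
baseurl = '{0.scheme}://{0.netloc}'.format(uri)
print(baseurl + uri.path + '?' + uri.query)
# -> https://example.org/article?id=42   (the #comments fragment is gone)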
Example #23
0
    def find_pingback_urls(self, urls):
        """
        Find the pingback URL for each URL.
        """
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                server_url = headers.get('X-Pingback')

                if not server_url:
                    content_type = headers.get('Content-Type', '').split(
                        ';')[0].strip().lower()
                    if content_type in ['text/html', 'application/xhtml+xml']:
                        server_url = self.find_pingback_href(
                            page.read(5 * 1024))

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
Example #24
0
        def sendall(self, data, *args, **kw):

            self._sent_data.append(data)
            hostnames = [getattr(i.info, 'hostname', None) for i in HTTPretty._entries.keys()]
            self.fd.seek(0)
            try:
                requestline, _ = data.split(b'\r\n', 1)
                method, path, version = parse_requestline(requestline)
                is_parsing_headers = True
            except ValueError:
                is_parsing_headers = False

            if not is_parsing_headers:
                if len(self._sent_data) > 1:
                    headers, body = map(utf8, self._sent_data[-2:])

                    method, path, version = parse_requestline(headers)
                    split_url = urlsplit(path)

                    info = URIInfo(hostname=self._host, port=self._port,
                                   path=split_url.path,
                                   query=split_url.query)

                    # If we are sending more data to a dynamic response entry,
                    # we need to call the method again.
                    if self._entry and self._entry.dynamic_response:
                        self._entry.body(info, method, body, headers)

                    try:
                        return HTTPretty.historify_request(headers, body, False)

                    except Exception as e:
                        logging.error(traceback.format_exc(e))
                        return self._true_sendall(data, *args, **kw)

            # path might come with
            s = urlsplit(path)
            POTENTIAL_HTTP_PORTS.append(int(s.port or 80))
            headers, body = map(utf8, data.split(b'\r\n\r\n', 1))

            request = HTTPretty.historify_request(headers, body)

            info = URIInfo(hostname=self._host, port=self._port,
                           path=s.path,
                           query=s.query,
                           last_request=request)

            entries = []

            for matcher, value in HTTPretty._entries.items():
                if matcher.matches(info):
                    entries = value
                    break

            if not entries:
                self._true_sendall(data)
                return

            self._entry = matcher.get_next_entry(method)
            self._request = (info, body, headers)
Example #25
0
 def is_external_url(self, url, site_url):
     """
     Check if the URL is an external URL.
     """
     url_splitted = urlsplit(url)
     if not url_splitted.netloc:
         return False
     return url_splitted.netloc != urlsplit(site_url).netloc
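Written as a standalone function with invented hosts, the check reads like this; relative links (no netloc) are never considered external:

from urllib.parse import urlsplit

def is_external_url(url, site_url):
    parts = urlsplit(url)
    return bool(parts.netloc) and parts.netloc != urlsplit(site_url).netloc

print(is_external_url('/about/', 'https://example.com'))                # False
print(is_external_url('https://example.com/x', 'https://example.com'))  # False
print(is_external_url('https://other.net/x', 'https://example.com'))    # True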
Example #26
0
def urlFileName(url: str) -> str:
    from os import path
    r = path.basename(parse.urlsplit(url).path)
    if r:
        return r
    r = path.basename(parse.urlsplit(url).query)
    assert r
    return r
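A couple of hypothetical calls showing why the query-string fallback exists: when the path ends in a slash its basename is empty, so the file name has to come from the query instead.

from os import path
from urllib.parse import urlsplit

print(path.basename(urlsplit('https://host.example/files/report.pdf').path))
# -> report.pdf
print(path.basename(urlsplit('https://host.example/download/?archive.tar.gz').query))
# -> archive.tar.gz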
Example #27
0
    def assertRedirects(self, response, expected_url, status_code=302,
                        target_status_code=200, host=None, msg_prefix=''):
        """Asserts that a response redirected to a specific URL, and that the
        redirect URL can be loaded.

        Note that assertRedirects won't work for external links since it uses
        TestClient to do a request.
        """
        if msg_prefix:
            msg_prefix += ": "

        if hasattr(response, 'redirect_chain'):
            # The request was a followed redirect
            self.assertTrue(len(response.redirect_chain) > 0,
                msg_prefix + "Response didn't redirect as expected: Response"
                " code was %d (expected %d)" %
                    (response.status_code, status_code))

            self.assertEqual(response.redirect_chain[0][1], status_code,
                msg_prefix + "Initial response didn't redirect as expected:"
                " Response code was %d (expected %d)" %
                    (response.redirect_chain[0][1], status_code))

            url, status_code = response.redirect_chain[-1]

            self.assertEqual(response.status_code, target_status_code,
                msg_prefix + "Response didn't redirect as expected: Final"
                " Response code was %d (expected %d)" %
                    (response.status_code, target_status_code))

        else:
            # Not a followed redirect
            self.assertEqual(response.status_code, status_code,
                msg_prefix + "Response didn't redirect as expected: Response"
                " code was %d (expected %d)" %
                    (response.status_code, status_code))

            url = response.url
            scheme, netloc, path, query, fragment = urlsplit(url)

            redirect_response = response.client.get(path, QueryDict(query))

            # Get the redirection page, using the same client that was used
            # to obtain the original response.
            self.assertEqual(redirect_response.status_code, target_status_code,
                msg_prefix + "Couldn't retrieve redirection page '%s':"
                " response code was %d (expected %d)" %
                    (path, redirect_response.status_code, target_status_code))

        e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(
                                                              expected_url)
        if not (e_scheme or e_netloc):
            expected_url = urlunsplit(('http', host or 'testserver', e_path,
                e_query, e_fragment))

        self.assertEqual(url, expected_url,
            msg_prefix + "Response redirected to '%s', expected '%s'" %
                (url, expected_url))
Example #28
0
File: conf.py Project: mapleoin/osc
def parse_apisrv_url(scheme, apisrv):
    if apisrv.startswith('http://') or apisrv.startswith('https://'):
        return urlsplit(apisrv)[0:2]
    elif scheme is not None:
        # the split/join is needed to get a proper url (e.g. without a trailing slash)
        return urlsplit(urljoin(scheme, apisrv))[0:2]
    else:
        msg = 'invalid apiurl \'%s\' (specify the protocol (http:// or https://))' % apisrv
        raise URLError(msg)
Example #29
def link_clean(link_row):
    link=link_row['links']
    try:
        urlsplit(link)
    except ValueError:
        clean_link='BROKEN'
    else:
        clean_link=link.decode("utf-8")
    return clean_link
Example #30
0
def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    parser.add_argument("-n", "--number", type=int, help="the number of reachable website", default=15)
    numEXECUTE = parser.parse_args().number
    #print (numEXECUTE)
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter, numEXECUTE)
Example #31
0
 def set_base_url(self):
     self.base_url = urlsplit(self.start_url)._replace(path="", query="").geturl()
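What that one-liner produces for a typical start URL (the URL is hypothetical):

from urllib.parse import urlsplit

start_url = 'https://example.com/search?q=python'
print(urlsplit(start_url)._replace(path='', query='').geturl())
# -> https://example.com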
Example #32
0
 def to_ip(url):
     value = parse.urlsplit(url)
     if value.netloc != "":
         return FQDN().to_ip(value.netloc)
     return None
Example #33
0
 def _request_favicon(self):
     scheme, netloc, *_ = urlsplit(self.reply_url)
     favicon_response = urllib.request.urlopen(
         f"{scheme}://{netloc}/favicon.ico")
     assert favicon_response.read() == b"Favicon is not provided."
Example #34
0
    def handle(self):
        if self.args.languages and not self.args.locale_dir:
            self.subparser.error(
                '--locale-dir is required if --languages is set.')

        if self.args.versions_dir:
            versions_directory = Path(self.args.versions_dir)

        data = {}
        languages = {'en'}
        localedir = self.args.locale_dir
        headers = ['Title', 'Description', 'Extension']

        if localedir:
            available_translations = [
                n for n in os.listdir(localedir)
                if os.path.isdir(os.path.join(localedir, n))
            ]
            if self.args.languages:
                for language in self.args.languages.split(','):
                    if language in available_translations:
                        languages.add(language)
                    else:
                        self.subparser.error(
                            f'translations to {language} are not available')
            else:
                languages.update(available_translations)

        for version in self.versions():
            public_download_url = version.download_url
            if self.args.versions_dir:
                version.download_url = (versions_directory / version.id /
                                        version.version).as_uri()

            # Add the extension's data.
            data.setdefault(
                version.id, {
                    'id': version.id,
                    'category': version.category,
                    'core': version.core,
                    'name': {},
                    'description': {},
                    'latest_version': None,
                    'versions': {},
                })

            # Add the version's metadata.
            version_data = {
                'id': version.id,
                'date': version.date,
                'version': version.version,
                'base_url': version.base_url,
                'download_url': public_download_url,
                'publisher': {
                    'name': version.repository_user,
                    'url': version.repository_user_page,
                },
                'metadata': version.metadata,
                'schemas': {},
                'codelists': {},
                'readme': {},
            }

            parsed = urlsplit(version_data['publisher']['url'])
            if parsed.netloc == 'github.com' and 'OCDS_GITHUB_ACCESS_TOKEN' in os.environ:
                api_url = f"https://api.github.com/users/{version_data['publisher']['name']}"
                headers = {
                    'Authorization':
                    f"token {os.getenv('OCDS_GITHUB_ACCESS_TOKEN')}"
                }
                version_data['publisher']['name'] = session.get(
                    api_url, headers=headers).json()['name']

            for language in sorted(languages):
                # Update the version's metadata and add the version's schema.
                translator = _translator(version, 'schema', localedir,
                                         language)

                translation = translate_extension_metadata_data(
                    version.metadata, translator, lang=language)
                for key in TRANSLATABLE_EXTENSION_METADATA_KEYWORDS:
                    version_data['metadata'][key][language] = translation[key][
                        language]

                for name in ('record-package-schema.json',
                             'release-package-schema.json',
                             'release-schema.json'):
                    version_data['schemas'].setdefault(name, {})

                    if name in version.schemas:
                        translation = translate_schema_data(
                            version.schemas[name], translator)
                        version_data['schemas'][name][language] = translation

                # Add the version's codelists.
                if version.codelists:
                    translator = _translator(version, 'codelists', localedir,
                                             language)
                    for name in sorted(version.codelists):
                        version_data['codelists'].setdefault(name, {})

                        codelist = version.codelists[name]
                        version_data['codelists'][name][language] = {}

                        translation = [
                            translator.gettext(fieldname)
                            for fieldname in codelist.fieldnames
                        ]
                        version_data['codelists'][name][language][
                            'fieldnames'] = translation

                        translation = translate_codelist_data(
                            codelist, translator, headers)
                        version_data['codelists'][name][language][
                            'rows'] = translation

                # Add the version's readme and documentation.
                translator = _translator(version, 'docs', localedir, language)

                translation = translate_markdown_data(
                    'README.md', version.remote('README.md'), translator)
                version_data['readme'][language] = translation

            data[version.id]['versions'][version.version] = version_data

        for _id in data:
            # Determine the latest version. See ocdsextensionregistry.util.get_latest_version().
            versions = data[_id]['versions']
            if len(versions) == 1:
                latest_version = list(versions)[0]
            elif 'master' in versions:
                latest_version = 'master'
            elif default_minor_version in versions:
                latest_version = default_minor_version
            else:
                dated = [kv for kv in versions.items() if kv[1]['date']]
                if dated:
                    latest_version = max(dated,
                                         key=lambda kv: kv[1]['date'])[0]
                else:
                    raise CommandError(
                        f"Couldn't determine latest version of {_id}")

            # Apply the latest version.
            data[_id]['latest_version'] = latest_version
            for field in ('name', 'description'):
                data[_id][field] = data[_id]['versions'][latest_version][
                    'metadata'][field]

        json_dump(data, sys.stdout)
Example #35
def get_ftp_date(url):
    import ftputil
    pr = parse.urlsplit(url)
    with ftputil.FTPHost(pr.netloc, 'anonymous', '') as host:
        return host.path.getmtime(pr.path)
Example #36
def scrape():
    browser = init_browser()
    mars_facts_data = {}

    nasa = "https://mars.nasa.gov/news/"
    browser.visit(nasa)
    time.sleep(2)

    html = browser.html
    soup = bs(html,"html.parser")

    #scraping the latest news about Mars from NASA
    news_title = soup.find("div",class_="content_title").text
    news_paragraph = soup.find("div", class_="article_teaser_body").text
    mars_facts_data['news_title'] = news_title
    mars_facts_data['news_paragraph'] = news_paragraph 
    
    #Mars Featured Image
    nasa_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=featured#submit"
    browser.visit(nasa_image)
    time.sleep(2)

    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(nasa_image))
    
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"

    #Use splinter to click on the mars featured image
    #to bring the full resolution image
    results = browser.find_by_xpath(xpath)
    img = results[0]
    img.click()
    time.sleep(2)
    
    #get image url using BeautifulSoup
    html_image = browser.html
    soup = bs(html_image, "html.parser")
    img_url = soup.find("img", class_="fancybox-image")["src"]
    full_img_url = base_url + img_url
    mars_facts_data["featured_image"] = full_img_url
      
    # #### Mars Weather

    #get mars weather's latest tweet from the website
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html_weather = browser.html
    soup = bs(html_weather, "html.parser")
    mars_weather = soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text
    mars_facts_data["mars_weather"] = mars_weather

    # #### Mars Facts
    url_facts = "https://space-facts.com/mars/"
    time.sleep(2)
    table = pd.read_html(url_facts)

    df_mars_facts = table[0]
    df_mars_facts.columns = ["Parameter", "Values"]
    clean_table = df_mars_facts.set_index(["Parameter"])
    mars_html_table = clean_table.to_html()
    mars_html_table = mars_html_table.replace("\n", "")
    mars_facts_data["mars_facts_table"] = mars_html_table

    # #### Mars Hemispheres
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemisphere)

    #Getting the base url
    hemisphere_base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_hemisphere))
    hemisphere_img_urls = []

    #1 Hemisphere
    results = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[1]/a/img").click()
    time.sleep(2)
    cerberus_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click()
    time.sleep(1)
    cerberus_image = browser.html
    soup = bs(cerberus_image, "html.parser")
    cerberus_url = soup.find("img", class_="wide-image")["src"]
    cerberus_img_url = hemisphere_base_url + cerberus_url
    #print(cerberus_img_url)
    cerberus_title = soup.find("h2",class_="title").text
    #print(cerberus_title)
    back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click()
    cerberus = {"image title":cerberus_title, "image url": cerberus_img_url}
    hemisphere_img_urls.append(cerberus)

    #2 Hemisphere    
    results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[2]/a/img").click()
    time.sleep(2)
    schiaparelli_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click()
    time.sleep(1)
    schiaparelli_image = browser.html
    soup = bs(schiaparelli_image, "html.parser")
    schiaparelli_url = soup.find("img", class_="wide-image")["src"]
    schiaparelli_img_url = hemisphere_base_url + schiaparelli_url
    #print(schiaparelli_img_url)
    schiaparelli_title = soup.find("h2",class_="title").text
    #print(schiaparelli_title)
    back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click()
    schiaparelli = {"image title":schiaparelli_title, "image url": schiaparelli_img_url}
    hemisphere_img_urls.append(schiaparelli)

    #3 Hemisphere
    results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[3]/a/img").click()
    time.sleep(2)
    syrtis_major_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click()
    time.sleep(1)
    syrtis_major_image = browser.html
    soup = bs(syrtis_major_image, "html.parser")
    syrtis_major_url = soup.find("img", class_="wide-image")["src"]
    syrtis_major_img_url = hemisphere_base_url + syrtis_major_url
    #print(syrtis_major_img_url)
    syrtis_major_title = soup.find("h2",class_="title").text
    #print(syrtis_major_title)
    back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click()
    syrtis_major = {"image title":syrtis_major_title, "image url": syrtis_major_img_url}
    hemisphere_img_urls.append(syrtis_major)

    #4 Hemisphere
    results1 = browser.find_by_xpath( "//*[@id='product-section']/div[2]/div[4]/a/img").click()
    time.sleep(2)
    valles_marineris_open_click = browser.find_by_xpath( "//*[@id='wide-image-toggle']").click()
    time.sleep(1)
    valles_marineris_image = browser.html
    soup = bs(valles_marineris_image, "html.parser")
    valles_marineris_url = soup.find("img", class_="wide-image")["src"]
    valles_marineris_img_url = hemisphere_base_url + valles_marineris_url
    #print(valles_marineris_img_url)
    valles_marineris_title = soup.find("h2",class_="title").text
    #print(valles_marineris_title)
    back_button = browser.find_by_xpath("//*[@id='splashy']/div[1]/div[1]/div[3]/section/a").click()
    valles_marineris = {"image title":valles_marineris_title, "image url": valles_marineris_img_url}
    hemisphere_img_urls.append(valles_marineris)

    mars_facts_data["hemisphere_img_url"] = hemisphere_img_urls

    return mars_facts_data
Example #37
0
def url_to_repo_name(url):
    """@todo"""
    path = urlsplit(url).path
    path = path[1:]
    (base, ext) = os.path.splitext(path)
    return base
Example #38
0
 def _get_base_path(self) -> str:
     return cast(str, urlsplit(self.location).path)
Example #39
0
 def base_path(self) -> str:
     if self.base_url:
         return urlsplit(self.base_url).path
     return self._get_base_path()
Example #40
def UnpackUserTarball():
    tarballs = []
    userFiles = []
    if len(sys.argv) > 1:
        tarballs = sys.argv[1].split(',')
    if len(sys.argv) > 2:
        userFiles = sys.argv[2].split(',')

    jobDir = os.environ['WMAGENTJOBDIR']

    for tarball in tarballs:
        splitResult = urlsplit(tarball)
        tarFile = os.path.join(jobDir, os.path.basename(tarball))

        # Is it a URL or a file that exists in the jobDir?
        if splitResult[0] in ['xrootd', 'root']:
            logging.info("Fetching tarball %s through xrootd", tarball)
            try:
                subprocess.check_call(
                    ['xrdcp', '-d', '1', '-f', tarball, 'TEMP_TARBALL.tgz'])
                subprocess.check_call(['tar', 'xf', 'TEMP_TARBALL.tgz'])
            except subprocess.CalledProcessError:
                logging.error("Couldn't retrieve/extract file from xrootd")
                raise
            finally:
                if os.path.exists('TEMP_TARBALL.tgz'):
                    os.unlink('TEMP_TARBALL.tgz')

        elif splitResult[0] in ['http', 'https'] and splitResult[1]:
            retriever = getRetriever(splitResult[0])
            with tempfile.NamedTemporaryFile() as tempFile:
                if setHttpProxy(tarball):
                    try:
                        logging.info(
                            'Fetching URL tarball %s through proxy server',
                            tarball)
                        fileName, headers = retriever(tarball, tempFile.name)
                    except (RuntimeError, IOError):
                        del os.environ['http_proxy']
                        logging.warning(
                            'Fetching URL tarball %s after proxy server failure',
                            tarball)
                        fileName, headers = retriever(tarball, tempFile.name)
                else:
                    logging.info(
                        'Fetching URL tarball %s without proxy server',
                        tarball)
                    fileName, headers = retriever(tarball, tempFile.name)

                try:
                    subprocess.check_call(['tar', 'xf', fileName])
                except subprocess.CalledProcessError:
                    raise RuntimeError('Error extracting %s' % tarball)
        elif os.path.isfile(tarFile):
            logging.info("Untarring %s", tarFile)
            subprocess.check_call(['tar', 'xf', tarFile])
        else:
            raise IOError('%s does not exist' % tarFile)

    for userFile in userFiles:
        if userFile:
            logging.info("Moving '%s' to execution directory.", userFile)
            shutil.move(userFile, '..')

    return 0
Example #41
0
    def _enrich_layer_metadata(self, geonode_layer):
        workspace, layername = geonode_layer.name.split(
            ":") if ":" in geonode_layer.name else (None, geonode_layer.name)
        url = urlsplit(self.url)
        base_url = '%s://%s/' % (url.scheme, url.netloc)
        response = requests.get('%sapi/layers/?name=%s' %
                                (base_url, layername), {},
                                timeout=10,
                                verify=False)
        content = response.content
        status = response.status_code
        content_type = response.headers['Content-Type']

        if status == 200 and 'application/json' == content_type:
            try:
                if isinstance(content, bytes):
                    content = content.decode('UTF-8')
                _json_obj = json.loads(content)
                if _json_obj['meta']['total_count'] == 1:
                    _layer = _json_obj['objects'][0]
                    if _layer:
                        r_fields = {}

                        # Update plain fields
                        for field in GeoNodeServiceHandler.LAYER_FIELDS:
                            if field in _layer and _layer[field]:
                                r_fields[field] = _layer[field]
                        if r_fields:
                            Layer.objects.filter(id=geonode_layer.id).update(
                                **r_fields)
                            geonode_layer.refresh_from_db()

                        # Update Thumbnail
                        if "thumbnail_url" in _layer and _layer[
                                "thumbnail_url"]:
                            thumbnail_remote_url = _layer["thumbnail_url"]
                            _url = urlsplit(thumbnail_remote_url)
                            if not _url.scheme:
                                thumbnail_remote_url = "{}{}".format(
                                    geonode_layer.remote_service.service_url,
                                    _url.path)
                            resp, image = http_client.request(
                                thumbnail_remote_url)
                            if 'ServiceException' in str(image) or \
                               resp.status_code < 200 or resp.status_code > 299:
                                msg = 'Unable to obtain thumbnail: %s' % image
                                logger.debug(msg)

                                # Replace error message with None.
                                image = None

                            if image is not None:
                                thumbnail_name = 'layer-%s-thumb.png' % geonode_layer.uuid
                                geonode_layer.save_thumbnail(thumbnail_name,
                                                             image=image)
                            else:
                                self._create_layer_thumbnail(geonode_layer)
                        else:
                            self._create_layer_thumbnail(geonode_layer)

                        # Add Keywords
                        if "keywords" in _layer and _layer["keywords"]:
                            keywords = _layer["keywords"]
                            if keywords:
                                geonode_layer.keywords.clear()
                                geonode_layer.keywords.add(*keywords)

                        # Add Regions
                        if "regions" in _layer and _layer["regions"]:
                            (regions_resolved,
                             regions_unresolved) = resolve_regions(
                                 _layer["regions"])
                            if regions_resolved:
                                geonode_layer.regions.clear()
                                geonode_layer.regions.add(*regions_resolved)

                        # Add Topic Category
                        if "category__gn_description" in _layer and _layer[
                                "category__gn_description"]:
                            try:
                                categories = TopicCategory.objects.filter(
                                    Q(gn_description__iexact=_layer[
                                        "category__gn_description"]))
                                if categories:
                                    geonode_layer.category = categories[0]
                            except BaseException:
                                traceback.print_exc()
            except BaseException:
                traceback.print_exc()
            finally:
                geonode_layer.save()
Example #42
0
def crawlLinks(links):
    articlesContent = pd.DataFrame()

    for link in tqdm(list(links)):
        try:    
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")

                if page.find({'class': 'article-post'}):
                    body = page.select('.article-post')[0]
                    headline = body.select('h1')[0].text if len(body.select('h1')) else ''
                    subtitle = None

                    #metadata
                    location = body.select('.location')[0].text if len(body.select('.location')) else ''
                    articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else ''
                    views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else ''
                    comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag['a'].text for tag in body.select('.tags').select('li')])
                else: 
                    headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else ''
                    subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else ''

                    #metadata
                    simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else ''
                    li = simpleShare.find_all('li')
                    location = li[0].text if len(li) > 0 else ''
                    articleDate = li[1].text if len(li) > 1 else ''
                    views = li[2].text if len(li) > 2 else ''
                    views = views.split(" ")[0] if views != '' else ''
                    comments = li[3].text if len(li) > 3 else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]]) if len(page.select('.tags-widget')) > 0 else ''

                # 30 Дек. 2019, 16:13
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    if month_name is not None:
                        month_name = month_name.group(1)
                        articleDate = articleDate.replace(month_name, replace_month_with_digit(month_name))
                        articleDate = pd.to_datetime(articleDate, format='%d %m %Y,  %H:%M')

                article_text = clean_text(page.select('.post-content')[0].select('div')[2].text) if len(page.select('.post-content')) > 0 else ''

                articlesContent = articlesContent.append({'link': link,
                                                          'title': clean_text(headline),
                                                          'subtitle': clean_text(subtitle),
                                                          'location': clean_text(location),
                                                          'comments': clean_text(comments),
                                                          'date': articleDate,
                                                          'views': clean_text(views),
                                                          'category': category,
                                                          'tags': clean_text(tags),
                                                          'article_text': article_text},
                                                         ignore_index=True)
        except Exception:
            # skip links that fail to download or parse
            continue

    return articlesContent
Example #43
0
def determine_file_path(asset_url, site_directory):
    folder = site_directory + aass.determine_storage_location(asset_url)
    filename = urlsplit(asset_url).path
    file_path = folder + '\\' + basename(filename)
    create_directory(folder)
    return file_path
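# For a hypothetical asset_url 'https://cdn.example.com/img/logo.png',
# urlsplit(asset_url).path is '/img/logo.png' and basename(...) is 'logo.png';
# the '\\' join assumes a Windows-style site_directory, and
# aass.determine_storage_location / create_directory are external helpers.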
Example #44
0
 def to_domain_name(url):
     value = parse.urlsplit(url)
     if value.netloc != "" and not IPAddress.is_valid(value.netloc):
         return value.netloc
     return None
Example #45
0
 def prepare_url(self, product):
     search = quote(product).replace('%20', '-')
     url = urljoin(self.base_url, search)
     return urlsplit(url)._replace(query="").geturl()
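# Hypothetical usage: with self.base_url = 'https://example.com/search/',
# prepare_url('red shoes') quotes to 'red%20shoes', rewrites it to 'red-shoes',
# urljoin gives 'https://example.com/search/red-shoes', and _replace(query="")
# drops any query string before geturl().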
Example #46
0
    def _get_conn(self):
        '''Obtain connection to server and authentication token'''

        log.debug('started')

        if 'no-ssl' in self.options:
            ssl_context = None
        else:
            ssl_context = self.ssl_context

        headers = CaseInsensitiveDict()
        headers['X-Auth-User'] = self.login
        headers['X-Auth-Key'] = self.password

        with HTTPConnection(self.hostname,
                            self.port,
                            proxy=self.proxy,
                            ssl_context=ssl_context) as conn:
            conn.timeout = int(self.options.get('tcp-timeout', 20))

            for auth_path in ('/v1.0', '/auth/v1.0'):
                log.debug('GET %s', auth_path)
                conn.send_request('GET', auth_path, headers=headers)
                resp = conn.read_response()

                if resp.status in (404, 412):
                    log.debug('auth to %s failed, trying next path', auth_path)
                    conn.discard()
                    continue

                elif resp.status == 401:
                    raise AuthorizationError(resp.reason)

                elif resp.status > 299 or resp.status < 200:
                    raise HTTPError(resp.status, resp.reason, resp.headers)

                # Pylint can't infer SplitResult Types
                #pylint: disable=E1103
                self.auth_token = resp.headers['X-Auth-Token']
                o = urlsplit(resp.headers['X-Storage-Url'])
                self.auth_prefix = urllib.parse.unquote(o.path)
                if o.scheme == 'https':
                    ssl_context = self.ssl_context
                elif o.scheme == 'http':
                    ssl_context = None
                else:
                    # fall through to scheme used for authentication
                    pass

                # mock server can only handle one connection at a time
                # so we explicitly disconnect this connection before
                # opening the feature detection connection
                # (mock server handles both - storage and authentication)
                conn.disconnect()

                self._detect_features(o.hostname, o.port, ssl_context)

                conn = HTTPConnection(o.hostname,
                                      o.port,
                                      proxy=self.proxy,
                                      ssl_context=ssl_context)
                conn.timeout = int(self.options.get('tcp-timeout', 20))
                return conn

            raise RuntimeError('No valid authentication path found')
Example #47
0
#   Created on 2018-08-14  15:53

"""
    urlparse 和 urlsplit 使分割url

    区别在于urlsplit有params, params不常用
"""

from urllib import parse

url = "https://www.baidu.com/p?wd=123&s=abc#a"  # 举栗子
url2 = "https://www.baidu.com/p;AAA?wd=123&s=abc#a"

result_0 = parse.urlparse(url)
result_1 = parse.urlsplit(url)

print("--- urlparse ---", parse.urlparse(url2), end="\n\n")
print("--- urlparse ---", result_0, end="\n\n")
print("--- urlsplit ---", result_1, end="\n\n")

print("类型:", type(result_1), end="\n\n")

print("--------------- urlparse ---------------")
print("scheme: ", result_0.scheme)
print("netloc: ", result_0.netloc)
print("path: ", result_0.path)
print("params: ", result_0.params)
print("query: ", result_0.query)
print("fragment:", result_0.fragment)
Example #48
0
if __name__ == "__main__":
    urlsplit_fuzzer = GrammarCoverageFuzzer(webbrowser_grammar,
                                            start_symbol="<urlsplit>")
    for i in range(5):
        print(urlsplit_fuzzer.fuzz())

from urllib.parse import urlsplit

if __package__ is None or __package__ == "":
    from Timer import Timer
else:
    from .Timer import Timer

if __name__ == "__main__":
    with Timer() as urlsplit_timer:
        urlsplit('http://www.fuzzingbook.org/', 'http', True)
    urlsplit_timer.elapsed_time()

if __name__ == "__main__":
    with Timer() as webbrowser_timer:
        webbrowser("http://www.fuzzingbook.org")
    webbrowser_timer.elapsed_time()

if __name__ == "__main__":
    webbrowser_timer.elapsed_time() / urlsplit_timer.elapsed_time()

# ## Synopsis

if __name__ == "__main__":
    print('\n## Synopsis')
Example #49
0
def url_join(base, *args):
    """
    Helper function to join an arbitrary number of url segments together.
    """
    # Python2 urlsplit can't handle bytearray (TypeError: unhashable type)
    if isinstance(base, bytearray):
        base = bytes(base)

    if isinstance(base, bytes):
        needbytes = True
    else:
        needbytes = False

    try:
        scheme, netloc, path, query, fragment = urlsplit(base)
    except UnicodeDecodeError:
        # PY3 urlsplit uses implicit (ASCII) encoding to decode bytes, but we
        # use latin1 since re-encoding after urlsplit exactly reverses decode
        # for any ASCII superset (needed  for posixpath.join to work,
        # since EBCDIC codes / as \x61 [a]).  This "trick" allows use of ASCII
        # supersets for bytes URL in the original base.
        base = base.decode('latin1')
        scheme, netloc, path, query, fragment = (x.encode('latin1')
                                                 for x in urlsplit(base))
    if not len(path):
        if needbytes:
            path = b('/')
        else:
            path = u('/')
    newargs = []
    try:
        for x in args:
            if needbytes:
                # Although they don't need conversion, bytes args must be ASCII
                # as we cannot know they use the same encoding as base URL.
                if isinstance(x, bytes) or isinstance(x, bytearray):
                    newargs.append(x.decode('ascii').encode('ascii'))
                else:
                    if not isinstance(x, text_type):
                        x = '%s' % x
                    newargs.append(x.encode('ascii'))
            else:
                if isinstance(x, bytes) or isinstance(x, bytearray):
                    newargs.append(x.decode('ascii'))
                else:
                    if not isinstance(x, text_type):
                        x = '%s' % x
                    newargs.append(x)
        path = posixpath.join(path, *newargs)
        if PY3 and needbytes:
            # PY3 urlunsplit uses implicit (ASCII) encoding to decode bytes,
            # but we use latin1 since re-encoding after urlunsplit exactly
            # reverses decode for any ASCII superset (needed for posixpath.join
            # to work, since EBCDIC codes / as \x61 [a]).  This "trick" allows
            # use of ASCII supersets for bytes URLs (but not args, for safety).
            return urlunsplit([x.decode('latin1') for x in
                               [scheme, netloc, path, query, fragment]]
                              ).encode('latin1')
    except UnicodeError:
        raise TypeError("Can't mix non-ASCII bytes and strings in URL paths.")
    return urlunsplit([scheme, netloc, path, query, fragment])
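# A minimal sketch (not part of url_join) of why latin1 is a safe round-trip
# codec here: every byte value 0-255 maps to one code point and back
# unchanged, so re-encoding after urlsplit/urlunsplit exactly reverses the
# decode:
#
#     raw = bytes(range(256))
#     assert raw.decode('latin1').encode('latin1') == raw
#
# Hypothetical usage of url_join itself (assuming the module's PY3/text_type
# compatibility helpers):
#
#     url_join('http://example.com', 'a', 'b', 3)   # 'http://example.com/a/b/3'
#     url_join(b'http://example.com', 'a', 'b')     # b'http://example.com/a/b'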
Example #50
0
    def run(self, pool_size):
        """
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files) and for idna encoding issues (tricky!)

		then the page list is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there
		"""

        # the list of url MUST be in the page_lists directory!
        try:
            url_list = open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/../page_lists/' + self.pages_file_name, 'r')
        except:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % self.pages_file_name)
            exit()

        # set up sql connection used to determine if items are already in the db
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)

        # this list gets mapped to the Pool, very important!
        urls_to_process = set()

        # simple counter used solely for updates to CLI
        count = 0

        print('\t------------------------')
        print('\t Building List of Pages ')
        print('\t------------------------')

        for url in url_list:
            # skip lines that are comments
            if "#" in url[0]: continue

            count += 1

            # only do lines starting with https?://
            if not (re.match('^https?://.+', url)):
                print("\t\t%s | %-50s Not a valid address, Skipping." %
                      (count, url[:50]))
                continue

            # non-ascii domains will crash phantomjs, so we need to convert them to
            # 	idna/ascii/utf-8
            # this requires splitting apart the url, converting the domain to idna,
            #	and pasting it all back together

            split_url = urlsplit(url.strip())
            idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
            url = urlunsplit(
                (split_url.scheme, idna_fixed_netloc, split_url.path,
                 split_url.query, split_url.fragment))
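            # e.g. a hypothetical 'http://münchen.example/seite' becomes
            # 'http://xn--mnchen-3ya.example/seite' after the idna conversion above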

            # if it is a m$ office or other doc, skip
            if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
                print("\t\t%s | %-50s Not an HTML document, Skipping." %
                      (count, url[:50]))
                continue

            # skip if in db already unless we are doing a timeseries
            if self.allow_timeseries == False:
                if sql_driver.page_exists(url):
                    print("\t\t%s | %-50s Exists in DB, Skipping." %
                          (count, url[:50]))
                    continue

            # only add if not in list already
            if url not in urls_to_process:
                print("\t\t%s | %-50s Adding." % (count, url[:50]))
                urls_to_process.add(url)
            else:
                print("\t\t%s | %-50s Already queued, Skipping." %
                      (count, url[:50]))

        # close the db connection
        sql_driver.close()

        print('\t----------------------------------')
        print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
        print('\t\tBrowser(s) are %s' % self.browser_types)
        print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
        print('\t\t...you can go take a walk. ;-)')
        print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	this is essentially voodoo to ward off evil spirits which
        #	appear when large pool sizes are used on macOS
        # get_start_method must be set to 'allow_none', otherwise upon
        #	checking the method it gets set (!) - and if we then get/set again
        #	we get an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')
        myPool = multiprocessing.Pool(pool_size)
        myPool.map(self.process_url, urls_to_process)

        # FYI
        self.print_runtime()
Example #51
0
 def url(self, url):
     if urlparse.urlsplit(url).netloc is None:
         return self.url(url)
     body = {"url": url}
     return self.send_session_command("POST", "url", body)
Example #52
0
print(result)

result = urlparse('http://www.baidu.com/index.html#comment',
                  allow_fragments=False)
print(result)

result = urlparse('http://www.baidu.com/index.html#comment',
                  allow_fragments=False)
print(result.scheme, result[0], result.netloc, result[1], sep='\n')

# url unparse
data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=6', 'comment']
print(urlunparse(data))

# url split
result = urlsplit('www.baidu.com/index.html;user?id=5#comment')
print(result)

# url unsplit
data = ['http', 'www.baidu.com', 'index.html', 'id=6', 'comment']
print(urlunsplit(data))
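# Expected output for the two calls above: urlsplit on a string without '//'
# puts everything before '?' into the path, i.e.
#   SplitResult(scheme='', netloc='', path='www.baidu.com/index.html;user',
#               query='id=5', fragment='comment')
# and urlunsplit(data) yields 'http://www.baidu.com/index.html?id=6#comment'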

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(
    urljoin('http://www.baidu.com/about.html',
            'https://cuiqingcai.com/FAQ.html'))
print(
    urljoin('http://www.baidu.com/about.html',
            'https://cuiqingcai.com/FAQ.html?question=2'))
print(
Example #53
0
# set of already crawled urls for email
processed_urls = set()

# a set of fetched emails
emails = set()

# process urls one by one from unprocessed_url queue until queue is empty
while len(unprocessed_urls):

    # move next url from the queue to the set of processed urls
    url = unprocessed_urls.popleft()
    processed_urls.add(url)

    # extract base url to resolve relative links
    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind("/") + 1] if "/" in parts.path else url
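    # e.g. for a hypothetical url 'http://example.com/team/contact.html' this
    # gives base_url 'http://example.com' and path 'http://example.com/team/'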

    # get url's content
    print("Crawling URL %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema,
            requests.exceptions.ConnectionError):
        # ignore pages with errors and continue with next url
        continue

    # extract all email addresses and add them into the resulting set
    # You may edit the regular expression as per your requirement
    new_emails = set(
Example #54
0
        body=db_patch_body,
        instance=INSTANCE_NAME,
        database=DB_NAME,
        task_id='sql_db_patch_task',
    )
    sql_db_patch_task2 = CloudSQLPatchInstanceDatabaseOperator(
        body=db_patch_body,
        instance=INSTANCE_NAME,
        database=DB_NAME,
        task_id='sql_db_patch_task2')
    # [END howto_operator_cloudsql_db_patch]

    # ############################################## #
    # ### EXPORTING SQL FROM INSTANCE 1 ############ #
    # ############################################## #
    export_url_split = urlsplit(EXPORT_URI)

    # For export to work we need to add the Cloud SQL instance's Service Account
    # write access to the destination GCS bucket.
    # [START howto_operator_cloudsql_export_gcs_permissions]
    sql_gcp_add_bucket_permission_task = GCSBucketCreateAclEntryOperator(
        entity="user-{{ task_instance.xcom_pull("
        "'sql_instance_create_task', key='service_account_email') "
        "}}",
        role="WRITER",
        bucket=export_url_split[1],  # netloc (bucket)
        task_id='sql_gcp_add_bucket_permission_task',
    )
    # [END howto_operator_cloudsql_export_gcs_permissions]

    # [START howto_operator_cloudsql_export]
Example #55
0
from urllib import parse

# url = input("输入完整url")
from utils import get_lower_case_name

url = input("请输入完整地址:")
model = input("请输入模型名称:")
qs = parse.parse_qs(parse.urlsplit(url).query).keys()
remove_list = ["pageIndex", "pageSize"]

remain_qs = [one for one in qs if one not in remove_list]

row_list = []
for one in remain_qs:
    # if get_lower_case_name(one) != one:
    one = f"""    {one} = filters.CharFilter(field_name="{get_lower_case_name(one)}", lookup_expr='icontains')\n"""
    row_list.append(one)
txt = f"""
class {model}Filter(filters.FilterSet):
{"".join(row_list)}
    class Meta:
        model = {model}
        fields = {remain_qs}

filter_class = {model}Filter
"""

print(txt)
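
# Hypothetical run: for a url like '.../api/books?pageIndex=1&pageSize=10&authorName=x'
# and model 'Book', remain_qs is ['authorName'] and the printed code is a
# django-filter FilterSet with a single icontains CharFilter (assuming
# get_lower_case_name converts camelCase to snake_case):
#
#     class BookFilter(filters.FilterSet):
#         authorName = filters.CharFilter(field_name="author_name", lookup_expr='icontains')
#         class Meta:
#             model = Book
#             fields = ['authorName']
#
#     filter_class = BookFilter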
Example #56
0
 def domain(self) -> str:
     url = urlsplit(self.url)
     return '{}://{}'.format(url.scheme, url.netloc)
Example #57
0
def main(apiurl, opts, argv):

    repo = argv[0]
    arch = argv[1]
    build_descr = argv[2]
    xp = []
    build_root = None
    cache_dir  = None
    build_uid = ''
    vm_memory = config['build-memory']
    vm_type = config['build-type']
    vm_telnet = None

    build_descr = os.path.abspath(build_descr)
    build_type = os.path.splitext(build_descr)[1][1:]
    if os.path.basename(build_descr) == 'PKGBUILD':
        build_type = 'arch'
    if os.path.basename(build_descr) == 'build.collax':
        build_type = 'collax'
    if os.path.basename(build_descr) == 'appimage.yml':
        build_type = 'appimage'
    if os.path.basename(build_descr) == 'snapcraft.yaml':
        build_type = 'snapcraft'
    if build_type not in ['spec', 'dsc', 'kiwi', 'arch', 'collax', 'livebuild', 'snapcraft', 'appimage']:
        raise oscerr.WrongArgs(
                'Unknown build type: \'%s\'. Build description should end in .spec, .dsc, .kiwi, or .livebuild. Or being named PKGBUILD, build.collax, appimage.yml or snapcraft.yaml' \
                        % build_type)
    if not os.path.isfile(build_descr):
        raise oscerr.WrongArgs('Error: build description file named \'%s\' does not exist.' % build_descr)

    buildargs = []
    if not opts.userootforbuild:
        buildargs.append('--norootforbuild')
    if opts.clean:
        buildargs.append('--clean')
    if opts.noinit:
        buildargs.append('--noinit')
    if opts.nochecks:
        buildargs.append('--no-checks')
    if not opts.no_changelog:
        buildargs.append('--changelog')
    if opts.root:
        build_root = opts.root
    if opts.target:
        buildargs.append('--target=%s' % opts.target)
    if opts.threads:
        buildargs.append('--threads=%s' % opts.threads)
    if opts.jobs:
        buildargs.append('--jobs=%s' % opts.jobs)
    elif config['build-jobs'] > 1:
        buildargs.append('--jobs=%s' % config['build-jobs'])
    if opts.icecream or config['icecream'] != '0':
        if opts.icecream:
            num = opts.icecream
        else:
            num = config['icecream']

        if int(num) > 0:
            buildargs.append('--icecream=%s' % num)
            xp.append('icecream')
            xp.append('gcc-c++')
    if opts.ccache:
        buildargs.append('--ccache')
        xp.append('ccache')
    if opts.linksources:
        buildargs.append('--linksources')
    if opts.baselibs:
        buildargs.append('--baselibs')
    if opts.debuginfo:
        buildargs.append('--debug')
    if opts._with:
        for o in opts._with:
            buildargs.append('--with=%s' % o)
    if opts.without:
        for o in opts.without:
            buildargs.append('--without=%s' % o)
    if opts.define:
        for o in opts.define:
            buildargs.append('--define=%s' % o)
    if config['build-uid']:
        build_uid = config['build-uid']
    if opts.build_uid:
        build_uid = opts.build_uid
    if build_uid:
        buildidre = re.compile('^[0-9]{1,5}:[0-9]{1,5}$')
        if build_uid == 'caller':
            buildargs.append('--uid=%s:%s' % (os.getuid(), os.getgid()))
        elif buildidre.match(build_uid):
            buildargs.append('--uid=%s' % build_uid)
        else:
            print('Error: build-uid arg must be 2 colon separated numerics: "uid:gid" or "caller"', file=sys.stderr)
            return 1
    if opts.vm_memory:
        vm_memory = opts.vm_memory
    if opts.vm_type:
        vm_type = opts.vm_type
    if opts.vm_telnet:
        vm_telnet = opts.vm_telnet
    if opts.alternative_project:
        prj = opts.alternative_project
        pac = '_repository'
    else:
        prj = store_read_project(os.curdir)
        if opts.local_package:
            pac = '_repository'
        else:
            pac = store_read_package(os.curdir)
    if opts.shell:
        buildargs.append("--shell")

    orig_build_root = config['build-root']
    # make it possible to override configuration of the rc file
    for var in ['OSC_PACKAGECACHEDIR', 'OSC_SU_WRAPPER', 'OSC_BUILD_ROOT']:
        val = os.getenv(var)
        if val:
            if var.startswith('OSC_'): var = var[4:]
            var = var.lower().replace('_', '-')
            if var in config:
                print('Overriding config value for %s=\'%s\' with \'%s\'' % (var, config[var], val))
            config[var] = val

    pacname = pac
    if pacname == '_repository':
        if not opts.local_package:
            try:
                pacname = store_read_package(os.curdir)
            except oscerr.NoWorkingCopy:
                opts.local_package = True
        if opts.local_package:
            pacname = os.path.splitext(os.path.basename(build_descr))[0]
    apihost = urlsplit(apiurl)[1]
    if not build_root:
        build_root = config['build-root']
        if build_root == orig_build_root:
            # ENV var was not set
            build_root = config['api_host_options'][apiurl].get('build-root', build_root)
        try:
            build_root = build_root % {'repo': repo, 'arch': arch,
                         'project': prj, 'package': pacname, 'apihost': apihost}
        except:
            pass

    cache_dir = config['packagecachedir'] % {'apihost': apihost}

    extra_pkgs = []
    if not opts.extra_pkgs:
        extra_pkgs = config['extra-pkgs']
    elif opts.extra_pkgs != ['']:
        extra_pkgs = opts.extra_pkgs

    if xp:
        extra_pkgs += xp

    prefer_pkgs = {}
    build_descr_data = open(build_descr).read()

    # XXX: dirty hack but there's no api to provide custom defines
    if opts.without:
        s = ''
        for i in opts.without:
            s += "%%define _without_%s 1\n" % i
        build_descr_data = s + build_descr_data
    if opts._with:
        s = ''
        for i in opts._with:
            s += "%%define _with_%s 1\n" % i
        build_descr_data = s + build_descr_data
    if opts.define:
        s = ''
        for i in opts.define:
            s += "%%define %s\n" % i
        build_descr_data = s + build_descr_data

    cpiodata = None
    servicefile = os.path.join(os.path.dirname(build_descr), "_service")
    if not os.path.isfile(servicefile):
        servicefile = None
    else:
        print('Using local _service file')
    buildenvfile = os.path.join(os.path.dirname(build_descr), "_buildenv." + repo + "." + arch)
    if not os.path.isfile(buildenvfile):
        buildenvfile = os.path.join(os.path.dirname(build_descr), "_buildenv")
        if not os.path.isfile(buildenvfile):
            buildenvfile = None
        else:
            print('Using local buildenv file: %s' % os.path.basename(buildenvfile))
    if buildenvfile or servicefile:
        from .util import cpio
        if not cpiodata:
            cpiodata = cpio.CpioWrite()

    if opts.prefer_pkgs:
        print('Scanning the following dirs for local packages: %s' % ', '.join(opts.prefer_pkgs))
        from .util import cpio
        if not cpiodata:
            cpiodata = cpio.CpioWrite()
        prefer_pkgs = get_prefer_pkgs(opts.prefer_pkgs, arch, build_type, cpiodata)

    if cpiodata:
        cpiodata.add(os.path.basename(build_descr), build_descr_data)
        # buildenv must come last for compatibility reasons...
        if buildenvfile:
            cpiodata.add("buildenv", open(buildenvfile).read())
        if servicefile:
            cpiodata.add("_service", open(servicefile).read())
        build_descr_data = cpiodata.get()

    # special handling for overlay and rsync-src/dest
    specialcmdopts = []
    if opts.rsyncsrc or opts.rsyncdest :
        if not opts.rsyncsrc or not opts.rsyncdest:
            raise oscerr.WrongOptions('When using --rsync-{src,dest} both parameters have to be specified.')
        myrsyncsrc = os.path.abspath(os.path.expanduser(os.path.expandvars(opts.rsyncsrc)))
        if not os.path.isdir(myrsyncsrc):
            raise oscerr.WrongOptions('--rsync-src %s is no valid directory!' % opts.rsyncsrc)
        # can't check destination - its in the target chroot ;) - but we can check for sanity
        myrsyncdest = os.path.expandvars(opts.rsyncdest)
        if not os.path.isabs(myrsyncdest):
            raise oscerr.WrongOptions('--rsync-dest %s is no absolute path (starting with \'/\')!' % opts.rsyncdest)
        specialcmdopts = ['--rsync-src='+myrsyncsrc, '--rsync-dest='+myrsyncdest]
    if opts.overlay:
        myoverlay = os.path.abspath(os.path.expanduser(os.path.expandvars(opts.overlay)))
        if not os.path.isdir(myoverlay):
            raise oscerr.WrongOptions('--overlay %s is no valid directory!' % opts.overlay)
        specialcmdopts += ['--overlay='+myoverlay]

    bi_file = None
    bc_file = None
    bi_filename = '_buildinfo-%s-%s.xml' % (repo, arch)
    bc_filename = '_buildconfig-%s-%s' % (repo, arch)
    if is_package_dir('.') and os.access(osc.core.store, os.W_OK):
        bi_filename = os.path.join(os.getcwd(), osc.core.store, bi_filename)
        bc_filename = os.path.join(os.getcwd(), osc.core.store, bc_filename)
    elif not os.access('.', os.W_OK):
        bi_file = NamedTemporaryFile(prefix=bi_filename)
        bi_filename = bi_file.name
        bc_file = NamedTemporaryFile(prefix=bc_filename)
        bc_filename = bc_file.name
    else:
        bi_filename = os.path.abspath(bi_filename)
        bc_filename = os.path.abspath(bc_filename)

    try:
        if opts.noinit:
            if not os.path.isfile(bi_filename):
                raise oscerr.WrongOptions('--noinit is not possible, no local buildinfo file')
            print('Use local \'%s\' file as buildinfo' % bi_filename)
            if not os.path.isfile(bc_filename):
                raise oscerr.WrongOptions('--noinit is not possible, no local buildconfig file')
            print('Use local \'%s\' file as buildconfig' % bc_filename)
        elif opts.offline:
            if not os.path.isfile(bi_filename):
                raise oscerr.WrongOptions('--offline is not possible, no local buildinfo file')
            print('Use local \'%s\' file as buildinfo' % bi_filename)
            if not os.path.isfile(bc_filename):
                raise oscerr.WrongOptions('--offline is not possible, no local buildconfig file')
        else:
            print('Getting buildinfo from server and store to %s' % bi_filename)
            bi_text = ''.join(get_buildinfo(apiurl,
                                            prj,
                                            pac,
                                            repo,
                                            arch,
                                            specfile=build_descr_data,
                                            addlist=extra_pkgs))
            if not bi_file:
                bi_file = open(bi_filename, 'w')
            # maybe we should check for errors before saving the file
            bi_file.write(bi_text)
            bi_file.flush()
            print('Getting buildconfig from server and store to %s' % bc_filename)
            bc = get_buildconfig(apiurl, prj, repo)
            if not bc_file:
                bc_file = open(bc_filename, 'w')
            bc_file.write(bc)
            bc_file.flush()
    except HTTPError as e:
        if e.code == 404:
            # check what caused the 404
            if meta_exists(metatype='prj', path_args=(quote_plus(prj), ),
                           template_args=None, create_new=False, apiurl=apiurl):
                pkg_meta_e = None
                try:
                    # take care, not to run into double trouble.
                    pkg_meta_e = meta_exists(metatype='pkg', path_args=(quote_plus(prj),
                                        quote_plus(pac)), template_args=None, create_new=False,
                                        apiurl=apiurl)
                except:
                    pass

                if pkg_meta_e:
                    print('ERROR: Either wrong repo/arch as parameter or a parse error of .spec/.dsc/.kiwi file due to syntax error', file=sys.stderr)
                else:
                    print('The package \'%s\' does not exist - please ' \
                                        'rerun with \'--local-package\'' % pac, file=sys.stderr)
            else:
                print('The project \'%s\' does not exist - please ' \
                                    'rerun with \'--alternative-project <alternative_project>\'' % prj, file=sys.stderr)
            sys.exit(1)
        else:
            raise

    bi = Buildinfo(bi_filename, apiurl, build_type, list(prefer_pkgs.keys()))

    if bi.debuginfo and not (opts.disable_debuginfo or '--debug' in buildargs):
        buildargs.append('--debug')

    if opts.release:
        bi.release = opts.release

    if bi.release:
        buildargs.append('--release=%s' % bi.release)

    if opts.build_opt:
        buildargs += opts.build_opt

    # real arch of this machine
    # vs.
    # arch we are supposed to build for
    if bi.hostarch != None:
        if hostarch != bi.hostarch and not bi.hostarch in can_also_build.get(hostarch, []):
            print('Error: hostarch \'%s\' is required.' % (bi.hostarch), file=sys.stderr)
            return 1
    elif hostarch != bi.buildarch:
        if not bi.buildarch in can_also_build.get(hostarch, []):
            # OBSOLETE: qemu_can_build should not be needed anymore since OBS 2.3
            if vm_type != "emulator" and not bi.buildarch in qemu_can_build:
                print('Error: hostarch \'%s\' cannot build \'%s\'.' % (hostarch, bi.buildarch), file=sys.stderr)
                return 1
            print('WARNING: It is guessed to build on hostarch \'%s\' for \'%s\' via QEMU.' % (hostarch, bi.buildarch), file=sys.stderr)

    rpmlist_prefers = []
    if prefer_pkgs:
        print('Evaluating preferred packages')
        for name, path in prefer_pkgs.items():
            if bi.has_dep(name):
                # We remove a preferred package from the buildinfo, so that the
                # fetcher doesn't take care about them.
                # Instead, we put it in a list which is appended to the rpmlist later.
                # At the same time, this will make sure that these packages are
                # not verified.
                bi.remove_dep(name)
                rpmlist_prefers.append((name, path))
                print(' - %s (%s)' % (name, path))

    print('Updating cache of required packages')

    urllist = []
    if not opts.download_api_only:
        # transform 'url1, url2, url3' form into a list
        if 'urllist' in config:
            if isinstance(config['urllist'], str):
                re_clist = re.compile('[, ]+')
                urllist = [ i.strip() for i in re_clist.split(config['urllist'].strip()) ]
            else:
                urllist = config['urllist']

        # OBS 1.5 and before has no downloadurl defined in buildinfo
        if bi.downloadurl:
            urllist.append(bi.downloadurl + '/%(extproject)s/%(extrepository)s/%(arch)s/%(filename)s')
    if opts.disable_cpio_bulk_download:
        urllist.append( '%(apiurl)s/build/%(project)s/%(repository)s/%(repoarch)s/%(repopackage)s/%(repofilename)s' )

    fetcher = Fetcher(cache_dir,
                      urllist = urllist,
                      api_host_options = config['api_host_options'],
                      offline = opts.noinit or opts.offline,
                      http_debug = config['http_debug'],
                      enable_cpio = not opts.disable_cpio_bulk_download,
                      cookiejar=cookiejar)

    if not opts.trust_all_projects:
        # implicitly trust the project we are building for
        check_trusted_projects(apiurl, [ i for i in bi.projects.keys() if not i == prj ])

    imagefile = ''
    imagesource = ''
    imagebins = []
    if (not config['no_preinstallimage'] and not opts.nopreinstallimage and
        bi.preinstallimage and
        not opts.noinit and not opts.offline and
        (opts.clean or (not os.path.exists(build_root + "/installed-pkg") and
                        not os.path.exists(build_root + "/.build/init_buildsystem.data")))):
        (imagefile, imagesource, imagebins) = get_preinstall_image(apiurl, arch, cache_dir, bi.preinstallimage)
        if imagefile:
            # remove binaries from build deps which are included in preinstall image
            for i in bi.deps:
                if i.name in imagebins:
                    bi.remove_dep(i.name)

    # now update the package cache
    fetcher.run(bi)

    old_pkg_dir = None
    if opts.oldpackages:
        old_pkg_dir = opts.oldpackages
        if not old_pkg_dir.startswith('/') and not opts.offline:
            data = [ prj, pacname, repo, arch]
            if old_pkg_dir == '_link':
                p = osc.core.findpacs(os.curdir)[0]
                if not p.islink():
                    raise oscerr.WrongOptions('package is not a link')
                data[0] = p.linkinfo.project
                data[1] = p.linkinfo.package
                repos = osc.core.get_repositories_of_project(apiurl, data[0])
                # hack for links to e.g. Factory
                if not data[2] in repos and 'standard' in repos:
                    data[2] = 'standard'
            elif old_pkg_dir != '' and old_pkg_dir != '_self':
                a = old_pkg_dir.split('/')
                for i in range(0, len(a)):
                    data[i] = a[i]

            destdir = os.path.join(cache_dir, data[0], data[2], data[3])
            old_pkg_dir = None
            try:
                print("Downloading previous build from %s ..." % '/'.join(data))
                binaries = get_binarylist(apiurl, data[0], data[2], data[3], package=data[1], verbose=True)
            except Exception as e:
                print("Error: failed to get binaries: %s" % str(e))
                binaries = []

            if binaries:
                class mytmpdir:
                    """ temporary directory that removes itself"""
                    def __init__(self, *args, **kwargs):
                        self.name = mkdtemp(*args, **kwargs)
                    _rmtree = staticmethod(shutil.rmtree)
                    def cleanup(self):
                        self._rmtree(self.name)
                    def __del__(self):
                        self.cleanup()
                    def __exit__(self):
                        self.cleanup()
                    def __str__(self):
                        return self.name

                old_pkg_dir = mytmpdir(prefix='.build.oldpackages', dir=os.path.abspath(os.curdir))
                if not os.path.exists(destdir):
                    os.makedirs(destdir)
            for i in binaries:
                fname = os.path.join(destdir, i.name)
                os.symlink(fname, os.path.join(str(old_pkg_dir), i.name))
                if os.path.exists(fname):
                    st = os.stat(fname)
                    if st.st_mtime == i.mtime and st.st_size == i.size:
                        continue
                get_binary_file(apiurl,
                                data[0],
                                data[2], data[3],
                                i.name,
                                package = data[1],
                                target_filename = fname,
                                target_mtime = i.mtime,
                                progress_meter = True)

        if old_pkg_dir != None:
            buildargs.append('--oldpackages=%s' % old_pkg_dir)

    # Make packages from buildinfo available as repos for kiwi
    if build_type == 'kiwi':
        if os.path.exists('repos'):
            shutil.rmtree('repos')
        os.mkdir('repos')
        for i in bi.deps:
            if not i.extproject:
                # remove
                bi.deps.remove(i)
                continue
            # project
            pdir = str(i.extproject).replace(':/', ':')
            # repo
            rdir = str(i.extrepository).replace(':/', ':')
            # arch
            adir = i.repoarch
            # project/repo
            prdir = "repos/"+pdir+"/"+rdir
            # project/repo/arch
            pradir = prdir+"/"+adir
            # source fullfilename
            sffn = i.fullfilename
            filename = sffn.split("/")[-1]
            # target fullfilename
            tffn = pradir+"/"+filename
            if not os.path.exists(os.path.join(pradir)):
                os.makedirs(os.path.join(pradir))
            if not os.path.exists(tffn):
                print("Using package: "+sffn)
                if opts.linksources:
                    os.link(sffn, tffn)
                else:
                    os.symlink(sffn, tffn)
            if prefer_pkgs:
                for name, path in prefer_pkgs.items():
                    if name == filename:
                        print("Using prefered package: " + path + "/" + filename)
                        os.unlink(tffn)
                        if opts.linksources:
                            os.link(path + "/" + filename, tffn)
                        else:
                            os.symlink(path + "/" + filename, tffn)
        # Is a obsrepositories tag used?
        try:
            tree = ET.parse(build_descr)
        except:
            print('could not parse the kiwi file:', file=sys.stderr)
            print(open(build_descr).read(), file=sys.stderr)
            sys.exit(1)
        root = tree.getroot()
        # product
        for xml in root.findall('instsource'):
            if xml.find('instrepo').find('source').get('path') == 'obsrepositories:/':
                print("obsrepositories:/ for product builds is not yet supported in osc!")
                sys.exit(1)
        # appliance
        expand_obsrepos=None
        for xml in root.findall('repository'):
            if xml.find('source').get('path') == 'obsrepositories:/':
                expand_obsrepos=True
        if expand_obsrepos:
          buildargs.append('--kiwi-parameter')
          buildargs.append('--ignore-repos')
          for xml in root.findall('repository'):
              if xml.find('source').get('path') == 'obsrepositories:/':
                  for path in bi.pathes:
                      if not os.path.isdir("repos/"+path):
                          continue
                      buildargs.append('--kiwi-parameter')
                      buildargs.append('--add-repo')
                      buildargs.append('--kiwi-parameter')
                      buildargs.append("dir://./repos/"+path)
                      buildargs.append('--kiwi-parameter')
                      buildargs.append('--add-repotype')
                      buildargs.append('--kiwi-parameter')
                      buildargs.append('rpm-md')
                      if xml.get('priority'):
                          buildargs.append('--kiwi-parameter')
                          buildargs.append('--add-repoprio='+xml.get('priority'))
              else:
                   m = re.match(r"obs://[^/]+/([^/]+)/(\S+)", xml.find('source').get('path'))
                   if not m:
                       # short path without obs instance name
                       m = re.match(r"obs://([^/]+)/(.+)", xml.find('source').get('path'))
                   project=m.group(1).replace(":", ":/")
                   repo=m.group(2)
                   buildargs.append('--kiwi-parameter')
                   buildargs.append('--add-repo')
                   buildargs.append('--kiwi-parameter')
                   buildargs.append("dir://./repos/"+project+"/"+repo)
                   buildargs.append('--kiwi-parameter')
                   buildargs.append('--add-repotype')
                   buildargs.append('--kiwi-parameter')
                   buildargs.append('rpm-md')
                   if xml.get('priority'):
                       buildargs.append('--kiwi-parameter')
                       buildargs.append('--add-repopriority='+xml.get('priority'))

    if vm_type == "xen" or vm_type == "kvm" or vm_type == "lxc":
        print('Skipping verification of package signatures due to secure VM build')
    elif bi.pacsuffix == 'rpm':
        if opts.no_verify:
            print('Skipping verification of package signatures')
        else:
            print('Verifying integrity of cached packages')
            verify_pacs(bi)
    elif bi.pacsuffix == 'deb':
        if opts.no_verify or opts.noinit:
            print('Skipping verification of package signatures')
        else:
            print('WARNING: deb packages are not verified, they can compromise your system!')
    else:
        print('WARNING: unknown packages are not verified, they can compromise your system!')

    for i in bi.deps:
        if i.hdrmd5:
            from .util import packagequery
            hdrmd5 = packagequery.PackageQuery.queryhdrmd5(i.fullfilename)
            if not hdrmd5:
                print("Error: cannot get hdrmd5 for %s" % i.fullfilename)
                sys.exit(1)
            if hdrmd5 != i.hdrmd5:
                print("Error: hdrmd5 mismatch for %s: %s != %s" % (i.fullfilename, hdrmd5, i.hdrmd5))
                sys.exit(1)

    print('Writing build configuration')

    if build_type == 'kiwi':
        rpmlist = [ '%s %s\n' % (i.name, i.fullfilename) for i in bi.deps if not i.noinstall ]
    else:
        rpmlist = [ '%s %s\n' % (i.name, i.fullfilename) for i in bi.deps ]
    for i in imagebins:
        rpmlist.append('%s preinstallimage\n' % i)
    rpmlist += [ '%s %s\n' % (i[0], i[1]) for i in rpmlist_prefers ]

    if imagefile:
        rpmlist.append('preinstallimage: %s\n' % imagefile)
    if imagesource:
        rpmlist.append('preinstallimagesource: %s\n' % imagesource)

    rpmlist.append('preinstall: ' + ' '.join(bi.preinstall_list) + '\n')
    rpmlist.append('vminstall: ' + ' '.join(bi.vminstall_list) + '\n')
    rpmlist.append('runscripts: ' + ' '.join(bi.runscripts_list) + '\n')
    if build_type != 'kiwi' and bi.noinstall_list:
        rpmlist.append('noinstall: ' + ' '.join(bi.noinstall_list) + '\n')
    if build_type != 'kiwi' and bi.installonly_list:
        rpmlist.append('installonly: ' + ' '.join(bi.installonly_list) + '\n')

    rpmlist_file = NamedTemporaryFile(prefix='rpmlist.')
    rpmlist_filename = rpmlist_file.name
    rpmlist_file.writelines(rpmlist)
    rpmlist_file.flush()

    subst = { 'repo': repo, 'arch': arch, 'project' : prj, 'package' : pacname }
    vm_options = []
    # XXX check if build-device present
    my_build_device = ''
    if config['build-device']:
        my_build_device = config['build-device'] % subst
    else:
        # obs worker uses /root here but that collides with the
        # /root directory if the build root was used without vm
        # before
        my_build_device = build_root + '/img'

    need_root = True
    if vm_type:
        if config['build-swap']:
            my_build_swap = config['build-swap'] % subst
        else:
            my_build_swap = build_root + '/swap'

        vm_options = [ '--vm-type=%s' % vm_type ]
        if vm_telnet:
            vm_options += [ '--vm-telnet=' + vm_telnet ]
        if vm_memory:
            vm_options += [ '--memory=' + vm_memory ]
        if vm_type != 'lxc':
            vm_options += [ '--vm-disk=' + my_build_device ]
            vm_options += [ '--vm-swap=' + my_build_swap ]
            vm_options += [ '--logfile=%s/.build.log' % build_root ]
            if vm_type == 'kvm':
                if os.access(build_root, os.W_OK) and os.access('/dev/kvm', os.W_OK):
                    # so let's hope there's also an fstab entry
                    need_root = False
                if config['build-kernel']:
                    vm_options += [ '--vm-kernel=' + config['build-kernel'] ]
                if config['build-initrd']:
                    vm_options += [ '--vm-initrd=' + config['build-initrd'] ]

            build_root += '/.mount'

        if config['build-vmdisk-rootsize']:
            vm_options += [ '--vmdisk-rootsize=' + config['build-vmdisk-rootsize'] ]
        if config['build-vmdisk-swapsize']:
            vm_options += [ '--vmdisk-swapsize=' + config['build-vmdisk-swapsize'] ]
        if config['build-vmdisk-filesystem']:
            vm_options += [ '--vmdisk-filesystem=' + config['build-vmdisk-filesystem'] ]
        if config['build-vm-user']:
            vm_options += [ '--vm-user=' + config['build-vm-user'] ]


    if opts.preload:
        print("Preload done for selected repo/arch.")
        sys.exit(0)

    print('Running build')
    cmd = [ config['build-cmd'], '--root='+build_root,
                    '--rpmlist='+rpmlist_filename,
                    '--dist='+bc_filename,
                    '--arch='+bi.buildarch ]
    cmd += specialcmdopts + vm_options + buildargs
    cmd += [ build_descr ]

    if need_root:
        sucmd = config['su-wrapper'].split()
        if sucmd[0] == 'su':
            if sucmd[-1] == '-c':
                sucmd.pop()
            cmd = sucmd + ['-s', cmd[0], 'root', '--' ] + cmd[1:]
        else:
            cmd = sucmd + cmd

    # change personality, if needed
    if hostarch != bi.buildarch and bi.buildarch in change_personality:
        cmd = [ change_personality[bi.buildarch] ] + cmd

    try:
        rc = run_external(cmd[0], *cmd[1:])
        if rc:
            print()
            print('The buildroot was:', build_root)
            sys.exit(rc)
    except KeyboardInterrupt as i:
        print("keyboard interrupt, killing build ...")
        cmd.append('--kill')
        run_external(cmd[0], *cmd[1:])
        raise i

    pacdir = os.path.join(build_root, '.build.packages')
    if os.path.islink(pacdir):
        pacdir = os.readlink(pacdir)
        pacdir = os.path.join(build_root, pacdir)

    if os.path.exists(pacdir):
        (s_built, b_built) = get_built_files(pacdir, bi.buildtype)

        print()
        if s_built: print(s_built)
        print()
        print(b_built)

        if opts.keep_pkgs:
            for i in b_built.splitlines() + s_built.splitlines():
                shutil.copy2(i, os.path.join(opts.keep_pkgs, os.path.basename(i)))

    if bi_file:
        bi_file.close()
    if bc_file:
        bc_file.close()
    rpmlist_file.close()
Example #58
0
def _set_network_proxy():
    if conf.proxy:
        debug_msg = "setting the HTTP/SOCKS proxy for all network requests"
        logger.debug(debug_msg)

        try:
            _ = urlsplit(conf.proxy)
        except Exception as ex:
            err_msg = "invalid proxy address '{0}' ('{1}')".format(
                conf.proxy, str(ex))
            raise PocsuiteSyntaxException(err_msg)

        hostname_port = _.netloc.split(":")
        scheme = _.scheme.upper()
        hostname = hostname_port[0]
        port = None
        username = None
        password = None

        if len(hostname_port) == 2:
            try:
                port = int(hostname_port[1])
            except Exception:
                pass

        if not all((scheme, hasattr(PROXY_TYPE, scheme), hostname, port)):
            err_msg = "proxy value must be in format '({0})://address:port'".format(
                "|".join(_[0].lower()
                         for _ in get_public_type_members(PROXY_TYPE)))
            raise PocsuiteSyntaxException(err_msg)

        if conf.proxy_cred:
            _ = re.search(r"\A(.*?):(.*?)\Z", conf.proxy_cred)
            if not _:
                err_msg = "proxy authentication credentials "
                err_msg += "value must be in format username:password"
                raise PocsuiteSyntaxException(err_msg)
            else:
                username = _.group(1)
                password = _.group(2)

        if scheme in (PROXY_TYPE.SOCKS4, PROXY_TYPE.SOCKS5,
                      PROXY_TYPE.SOCKS5H):
            socks.set_default_proxy(
                socks.PROXY_TYPE_SOCKS4
                if scheme == PROXY_TYPE.SOCKS4 else socks.PROXY_TYPE_SOCKS5,
                hostname,
                port,
                username=username,
                password=password,
                rdns=True if scheme == PROXY_TYPE.SOCKS5H else False,
            )
            socket.socket = socks.socksocket
            conf.proxies = {
                "http": conf.proxy,
                "https": conf.proxy,
            }
        else:
            if conf.proxy_cred:
                proxy_string = "{0}@".format(conf.proxy_cred)
            else:
                proxy_string = ""

            proxy_string = "{0}{1}:{2}".format(proxy_string, hostname, port)
            conf.proxies = {"http": proxy_string, "https": proxy_string}
Example #59
0
def make_abs_url(url):
    pr = parse.urlsplit(url)
    return parse.urlunsplit(parse.SplitResult(pr.scheme, pr.netloc, path.abspath(pr.path), '',''))
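# Hypothetical usage (POSIX path module assumed):
#   make_abs_url('https://example.com/a/../b?x=1#frag') -> 'https://example.com/b'
# path.abspath normalises the path while the query and fragment are dropped.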
Example #60
0
from urllib import parse

url = 'https://www.baidu.com/s?wd=python'
result = parse.urlparse(url)
print(result)
result = parse.urlsplit(url)
print(result)