Example #1
    def download(self, url):
        scheme = urlparse(url)[0]
        ext = url[url.rfind("."):]
        urlpath = urlparse(url)[2]
        filename = unquote(urlpath.split("/")[-1])

        self.using_temp_file = True

        if scheme == 's3':
            client = boto3.client('s3')
            bucket_name, key = re.compile(r's3://([\w\d\-\.]+)/(.*)').search(url).groups()
            url = client.generate_presigned_url(
                'get_object',
                Params={'Bucket': bucket_name, 'Key': key.replace("+", " ")}
            )

        src = urlopen(url)
        dest_fd, self.path = tempfile.mkstemp(suffix=ext)
        try:
            with os.fdopen(dest_fd, 'wb') as dest:
                shutil.copyfileobj(src, dest)
        except Exception:
            os.remove(self.path)
            raise
        finally:
            src.close()

        return filename
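A quick standalone check of the s3:// pattern used above; the bucket and key here are made up for illustration:

import re

bucket_name, key = re.search(r's3://([\w\d\-\.]+)/(.*)',
                             's3://my-bucket/images/plate+1.tif').groups()
# bucket_name == 'my-bucket', key == 'images/plate+1.tif'
# (the method above then swaps '+' for ' ' before requesting the presigned URL)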
Example #2
def validate_url(url, parent_url='http:'):

    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string URL
        parent_url: optional string URL from which to inherit an implicit
                    scheme.

    Returns: dict having:
        valid: boolean truth value.
        url: string modified URL.
    """

    if bytes == type(url):
        url = url.decode()

    parsed_url = urlparse(url)

    if 0 < len(parsed_url.path) and '/' == parsed_url.path[0]:
        url = urldefrag(urljoin(parent_url, url))[0]

    elif not parsed_url.scheme:
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url

    parsed_url = urlparse(url)

    valid = parsed_url.scheme in ('http', 'https', '') and \
            bool(parsed_url.netloc)

    return {'valid': valid, 'url': url}
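A few illustrative calls against the function above (the URLs are made up); the expected results follow directly from the logic shown:

print(validate_url('//cdn.example.com/lib.js', 'https://example.com'))
# {'valid': True, 'url': 'https://cdn.example.com/lib.js'}
print(validate_url('ftp://example.com/file'))
# {'valid': False, 'url': 'ftp://example.com/file'}
print(validate_url('/docs/page#top', 'https://example.com/index.html'))
# {'valid': True, 'url': 'https://example.com/docs/page'}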
Example #3
 def test_path_ended_with_pound(self):
     url = '//example.com:8042/over/there#name=ferret'
     a = fetch.urlparse(url).path
     b = request.urlparse(url).path
     self.assertEqual(a, b)
     a = fetch.urlparse(url).query
     b = request.urlparse(url).query
     self.assertEqual(a, b)
Example #4
 def test_no_netloc(self):
     url = 'www.example.com'
     a = fetch.urlparse(url).netloc
     b = request.urlparse(url).netloc
     self.assertEqual(a, b)
     a = fetch.urlparse(url).path
     b = request.urlparse(url).path
     self.assertEqual(a, b)
Example #5
 def test_path_with_query_and_fragment(self):
     url = '//example.com:8042/over/there?name=ferret#nose'
     a = fetch.urlparse(url).path
     b = request.urlparse(url).path
     self.assertEqual(a, b)
     a = fetch.urlparse(url).query
     b = request.urlparse(url).query
     self.assertEqual(a, b)
     a = fetch.urlparse(url).fragment
     b = request.urlparse(url).fragment
     self.assertEqual(a, b)
Example #6
 def gather_hyperlinks(self, url):
     """ Opens a url and searches its content for hyperlinks. 
         url = [ParseResult], the url to open and read
         returns [list(ParseResult)] list of found hyperlinks """
     links = []
     with request.urlopen(url.geturl()) as page:
         dom = lxml.html.fromstring(page.read().decode("utf-8"))
         self.host_requests[url.netloc] = time.time()
         for link in dom.xpath("//a/@href"):
             new_link = request.urlparse(link)
             if not new_link.scheme:
                 new_link = request.urlparse(url.scheme + "://" + url.netloc + new_link.path)
             if new_link.scheme == "http":
                 links.append(new_link)
     return links
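The method above rebuilds relative links by string concatenation, which drops any query string on the href. A common alternative is urllib.parse.urljoin; a minimal standalone sketch (not the author's code) of that approach:

from urllib.parse import urljoin, urlparse

def resolve_links(base_url, hrefs):
    # Resolve relative hrefs against the page URL and keep only http links.
    resolved = []
    for href in hrefs:
        absolute = urljoin(base_url, href)
        if urlparse(absolute).scheme == "http":
            resolved.append(urlparse(absolute))
    return resolved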
Example #7
def upload(dist_filename, dist_type, package, config, sign=False):
    schema, netloc, url, params, query, fragments = urlparse(config.repository)
    if params or query or fragments:
        raise InvalidRepository("Incompatible url %s" % config.repository)

    if schema not in ('http', 'https'):
        raise InvalidRepository("unsupported schema " + schema)

    if sign:
        raise NotImplementedError()

    data = build_upload_post_data(dist_filename, dist_type, package)
    userpass = (config.username + ":" + config.password).encode("ascii")
    auth = six.b("Basic ") + base64.standard_b64encode(userpass)
    request = build_request(config.repository, data, auth)

    try:
        result = urlopen(request)
        status = result.getcode()
        reason = result.msg
    except HTTPError:
        e = extract_exception()
        status = e.code
        reason = e.msg

    if status != 200:
        raise PyPIError(
                "Could not upload to repository %r - error %s (server answered '%s')" \
                % (config.repository, status, reason))
Example #8
 def __init__(self, endpoint, server=None, port=None, use_srv=True, wait=80,
         hold=4, requests=5, headers=None, PIPELINE=True, GZIP=True):
     PlugIn.__init__(self)
     self.DBG_LINE = 'bosh'
     self._exported_methods = [
         self.send, self.receive, self.disconnect,
     ]
     url = urlparse(endpoint)
     self._http_host = url.hostname
     self._http_path = url.path
     if url.port:
         self._http_port = url.port
     elif url.scheme == 'https':
         self._http_port = 443
     else:
         self._http_port = 80
     self._http_proto = url.scheme
     self._server = server
     self._port = port
     self.use_srv = use_srv
     self.Sid = None
     self._rid = 0
     self.wait = wait
     self.hold = hold
     self.requests = requests
     self._pipeline = None
     self.PIPELINE = PIPELINE
     if self.PIPELINE:
         self._respobjs = []
     else:
         self._respobjs = {}
     self.headers = headers or self.default_headers
     self.GZIP = GZIP
Example #9
def query_splitter(url):
	from attrdict import AttrDict
	from collections import OrderedDict
	query = attrgetter('query')(urlsplit(url))
	dic = urlparse(url)._asdict()
	_query = itemgetter('query')(dic)
	print(OrderedDict([q.split('=') for q in _query.split('&')])['text'])
	return AttrDict(OrderedDict([q.split('=') for q in query.split('&')]))
Example #10
def get_nhlid_from_tablerow(tr):
    """Get player ID from href inside the row"""
    anchor_tag = tr.find(".//a[@href]")

    if anchor_tag is not None:
        href = anchor_tag.attrib['href']
        if re.match(r"^/ice/player.htm", href):
            qs = urlparse(href).query
            ids = parse_qs(qs).get("id")
            return ids[0] if ids else None
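For reference, how urlparse plus parse_qs behave on a href of the expected shape (the path and id value are made up):

from urllib.parse import urlparse, parse_qs

href = "/ice/player.htm?id=8471675"   # hypothetical example href
qs = urlparse(href).query             # 'id=8471675'
print(parse_qs(qs))                   # {'id': ['8471675']}
print(parse_qs(qs).get("id"))         # ['8471675'] (None when 'id' is absent)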
Example #11
def httpServer(url):
    u = urlparse(url)
    host = u[1]
    page = u[2] or '/'
    s = socket.socket()
    port = 80
    s.connect((host, port))
    # Minimal HTTP/1.0 request; the socket expects bytes, not str.
    httpcmd = 'GET ' + page + ' HTTP/1.0\r\nHost: ' + host + '\r\n\r\n'
    s.send(httpcmd.encode())
    response = s.recv(4096)
    s.close()
    return response
Example #12
 def update_img(ev):
     url = ev.get('img_url', None)
     if url:
         o = urlparse(url)
         fname = os.path.join(res_dir, os.path.basename(o.path))
         if not os.path.exists(res_dir):
             os.mkdir(res_dir)
         if not os.path.exists(fname):
             urlretrieve(url, fname)
         ev['img_cache'] = os.path.join(os.path.basename(res_dir),
                                        os.path.basename(o.path))
     return ev
Example #13
 def _unshorten_hrefli(self, uri):
     try:
         # Extract url from query
         parsed_uri = urlparse(uri)
         extracted_uri = parsed_uri.query
         if not extracted_uri:
             return uri, INVALID_URL_ERROR_CODE
         # Get url status code
         r = requests.head(extracted_uri, headers=self._headers, timeout=self._timeout)
         return r.url, r.status_code
     except Exception as e:
         return uri, str(e)
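The trick above relies on href.li-style links placing the target URL after the '?', so it surfaces as the query component of the parse result; a small check with a made-up target:

from urllib.parse import urlparse

uri = "https://href.li/?https://example.com/some/page"
print(urlparse(uri).query)   # 'https://example.com/some/page'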
Example #14
    def set_scheme(self, url):
        """
        Checks if we haven't got a scheme. Sets scheme if needed.

        :param str url: The url address with scheme or without.
        """
        if not request.urlparse(url).scheme:
            if url.startswith('ftp.'):
                url = 'ftp://{0!s}'.format(url)
                self._options['href'] = url
            else:
                url = 'http://{0!s}'.format(url)
                self._options['href'] = url
Example #15
    def test_register_server(self):
        package = PackageDescription(name="foo")
        repository = "http://testpypi.python.org/pypi"
        realm = DEFAULT_REALM
        config = PyPIConfig(username="******", password="******", repository=repository, realm=realm)

        auth = HTTPPasswordMgr()
        host = urlparse(config.repository)[1]
        auth.add_password(config.realm, host, config.username, config.password)

        post_data = build_post_data(package, "submit")
        code, msg = post_to_server(post_data, config, auth)
        self.assertEqual(code, 200)
        self.assertEqual(msg, "OK")
Example #16
 def uri_to_db(self, uri):
     parse_result = urlparse(uri)
     db = {}
     if 'postgres' in parse_result.scheme:
         db['ENGINE'] = 'django.db.backends.postgresql_psycopg2'
         db['NAME'] = os.path.split(parse_result.path)[-1]
         db['HOST'] = parse_result.hostname
         db['USER'] = parse_result.username
         db['PASSWORD'] = parse_result.password
         db['PORT'] = parse_result.port or ''
     else:
         db['ENGINE'] = 'django.db.backends.sqlite3'
         db['NAME'] = os.path.abspath(parse_result.path)
     return db
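For reference, the pieces urlparse exposes for a typical Postgres URI, which is what the branches above read from (credentials are made up):

from urllib.parse import urlparse

r = urlparse("postgres://user:secret@localhost:5432/mydb")
print(r.scheme, r.hostname, r.port)     # postgres localhost 5432
print(r.username, r.password, r.path)   # user secret /mydb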
Example #17
def download_ftp_file(url, outfile):
    url_parsed = urlparse(url)
    assert url_parsed.scheme == 'ftp'
    ftp = FTP(url_parsed.hostname)
    ftp.login()
    with open(outfile, 'wb') as out_f:
        ftp.retrbinary('RETR %s' % url_parsed.path, out_f.write)

    # set the mtime to match remote ftp server
    response = ftp.sendcmd('MDTM ' + url_parsed.path)
    code, lastmodified = response.split()
    # an example: 'last-modified': '20121128150000'
    lastmodified = time.mktime(datetime.strptime(lastmodified, '%Y%m%d%H%M%S').timetuple())
    os.utime(outfile, (lastmodified, lastmodified))
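The MDTM reply is typically of the form '213 20121128150000'; using the sample timestamp from the comment above, the conversion to a local mtime works like this:

import time
from datetime import datetime

code, lastmodified = '213 20121128150000'.split()
mtime = time.mktime(datetime.strptime(lastmodified, '%Y%m%d%H%M%S').timetuple())
# mtime is the POSIX timestamp for 2012-11-28 15:00:00 local time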
Example #18
def try_safely(remote_url):
    try:
        if urlparse(remote_url):
            try:
                log.info("Attempting to archive url.")
                archive = get(remote_url)
                log.info("Archive Created.")
                return archive
            except HTTPError:
                log.info("Cannot archive object, returning url.")
                return remote_url
    except ValueError as _e:
        log.info("No URL given")
        log.debug(_e)
    log.info("Not a valid URL")
    return remote_url
Example #19
    def run(self, context):
        o, a = context.get_parsed_arguments()
        if o.repository and (o.username or o.password or o.repository_url):
            raise bento.errors.UsageException("Cannot specify repository and username/password/url at the same time")
        if not (o.repository or (o.username or o.password or o.repository_url)):
            # FIXME: why does distutils use DEFAULT_REPOSITORY (i.e. an url)
            # here ?
            config = _read_pypirc(DEFAULT_REPOSITORY)
        elif o.repository:
            config = _read_pypirc(o.repository)
        else:
            config = PyPIConfig(o.username, o.password, o.repository_url)

        auth = HTTPPasswordMgr()
        host = urlparse(config.repository)[1]
        auth.add_password(config.realm, host, config.username, config.password)

        post_data = build_post_data(context.pkg, "submit")
        code, msg = post_to_server(post_data, config, auth)
        if code != 200:
            raise bento.errors.BentoError("Error while submitting package metadata to server: %r" % msg)
Example #20
def downloadImg(img):
    try:
        src = img.attrs['src']
        if not src.startswith("http"):
            print("Ignore img:", src)
            return
        print("Downloading image...:", src)
        resp = requests.request('get', src)
        o = urlparse(src)
        query = parse_qs(o.query)
        save_as = query.get("id")
        if save_as:
            save_as = save_as[0]
        else:
            save_as = os.path.basename(src)
        save_as = "/tmp/" + save_as
        with open(save_as, 'wb') as f:
            f.write(resp.content)
        return save_as
    except Exception as e:
        print(e)
Example #21
def facebook(request):
    if request.method == 'POST':
        json_acceptable_string = request.body.decode('utf-8').replace("'", "\"")
        json_data = json.loads(json_acceptable_string)
        try:
            fd = urlopen(json_data['photo_url'])
            image_name = urlparse(json_data['photo_url']).path.split('/')[-1]
            image_file = BytesIO(fd.read())

            try:
                user = ExtUser.objects.get(username=json_data['username'])
                Token.objects.get(user=user)
                return JsonResponse({
                        'message': 'User and token already exist.'
                    })
            except ExtUser.DoesNotExist:
                json_data = check_json_data(json_data)
                new_user = ExtUser.objects.create_user(
                    username=json_data['username'],
                    email=json_data['email'],
                    location=json_data['location'],
                    orientation='S',
                    gender=json_data['gender'][0].upper(),
                    birthday=json_data['birthday'],
                    password=json_data['password'],
                )
                new_user.photo.save(image_name, File(image_file))
                new_user.save()
                token = Token.objects.create(user=new_user)

                if new_user:
                    return JsonResponse({
                            'message': 'User is created. Sign in please.',
                            'token': token.key
                        })

        except Exception as e:
            return HttpResponseBadRequest('Something went wrong.')
    return HttpResponseBadRequest('Only POST request.')
Example #22
def downloadFile(url, download_dir, target_dir_name, sha1_hash = None, force_download = False, user_agent = None):
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    p = urlparse(url)
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]]) # replace special characters in the URL path

    filename_rel = os.path.split(p.path)[1] # get original filename
    target_filename = os.path.join(download_dir, filename_rel)

    # check SHA1 hash, if file already exists
    if os.path.exists(target_filename) and sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            log("Hash of " + target_filename + " (" + hash_file + ") does not match expected hash (" + sha1_hash + "); forcing download")
            force_download = True

    # download file
    if (not os.path.exists(target_filename)) or force_download:
        log("Downloading " + url + " to " + target_filename)
        if p.scheme == "ssh":
            downloadSCP(p.hostname, p.username, p.path, download_dir)
        else:
            if user_agent is not None:
                MyURLOpener.version = user_agent
                MyURLOpener().retrieve(url, target_filename)
            else:
                urlretrieve(url, target_filename)
    else:
        log("Skipping download of " + url + "; already downloaded")

    # check SHA1 hash
    if sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            raise RuntimeError("Hash of " + target_filename + " (" + hash_file + ") differs from expected hash (" + sha1_hash + ")")

    return target_filename
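The effect of quoting only the path component, as done above, on a made-up URL containing a space:

from urllib.parse import urlparse, urlunparse, quote

p = urlparse("https://example.com/my file.zip")
print(urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]]))
# https://example.com/my%20file.zip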
Example #23
    import argparse
    parser = argparse.ArgumentParser(
        description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m",
                        "--max-urls",
                        help="Number of max URLs to crawl, default is 30.",
                        default=30,
                        type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
Example #24
def get_domain(url):
    parsed_url = urlparse(url)
    return "{url.netloc}".format(url=parsed_url)
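A quick check of the helper above on a made-up URL:

print(get_domain("https://sub.example.com:8080/path?q=1"))   # sub.example.com:8080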
Example #25
 def test_no_scheme_but_port(self):
     url = '//example.com:8042'
     a = fetch.urlparse(url).scheme
     b = request.urlparse(url).scheme
     self.assertEqual(a, b)
Example #26
def read_chunk(url, x0, x1, y0, y1, z0, z1, level=1, format="tiff"):
    """Read an arbitrary chunk of data

    :param url: Base URL of the precomputed data source
    :param x0: starting X coordinate, in the level's coordinate space
    :param x1: ending X coordinate (non-inclusive)
    :param y0: starting Y coordinate
    :param y1: ending Y coordinate (non-inclusive)
    :param z0: starting Z coordinate
    :param z1: ending Z coordinate
    :param level: mipmap level
    :param format: the read format if it's a file URL. Defaults to tiff, but
    "blockfs", "ngff" and "zarr" are also handled below
    :return: a Numpy array containing the data
    """
    is_file = urlparse(url).scheme.lower() == "file"
    info = get_info(url)
    scale = info.get_scale(level)
    result = np.zeros((z1 - z0, y1 - y0, x1 - x0), info.data_type)
    shape = np.array(scale.shape)
    offset = np.array(scale.offset)
    stride = np.array(scale.chunk_sizes)
    end = offset + shape

    x0d = _chunk_start(x0, offset[0], stride[0])
    x1d = _chunk_end(x1, offset[0], stride[0], end[0])
    y0d = _chunk_start(y0, offset[1], stride[1])
    y1d = _chunk_end(y1, offset[1], stride[1], end[1])
    z0d = _chunk_start(z0, offset[2], stride[2])
    z1d = _chunk_end(z1, offset[2], stride[2], end[2])
    for x0c, y0c, z0c in itertools.product(range(x0d, x1d, stride[0]),
                                           range(y0d, y1d, stride[1]),
                                           range(z0d, z1d, stride[2])):
        x1c = min(x1d, x0c + stride[0])
        y1c = min(y1d, y0c + stride[1])
        z1c = min(z1d, z0c + stride[2])
        chunk_url = url + "/" + scale.key + "/%d-%d_%d-%d_%d-%d" % (
            x0c, x1c, y0c, y1c, z0c, z1c)
        if is_file:
            if format == "tiff":
                chunk_url += ".tiff"
                with urlopen(chunk_url) as fd:
                    chunk = tifffile.imread(fd)
            elif format == "blockfs":
                from blockfs import Directory
                from .blockfs_stack import BlockfsStack
                directory_url = url + "/" + scale.key + "/" +\
                                BlockfsStack.DIRECTORY_FILENAME
                directory_parse = urlparse(directory_url)
                directory_path = os.path.join(directory_parse.netloc,
                                              unquote(directory_parse.path))
                directory = Directory.open(directory_path)
                chunk = directory.read_block(x0c, y0c, z0c)
            elif format == 'ngff':
                group = get_ngff_group_from_url(url)
                key = str(int(np.log2(level)))
                dataset = group[key]
                dataset.read_only = True
                chunk = dataset[0, 0, z0c:z1c, y0c:y1c, x0c:x1c]
            elif format == 'zarr':
                zarr_url = url + "/" + scale.key
                zarr_parse = urlparse(zarr_url)
                zarr_path = os.path.join(zarr_parse.netloc,
                                         unquote(zarr_parse.path))
                storage = zarr.NestedDirectoryStore(zarr_path)
                dataset = zarr.Array(storage)
                chunk = dataset[z0c:z1c, y0c:y1c, x0c:x1c]
            else:
                raise NotImplementedError("Can't read %s yet" % format)
        else:
            response = urlopen(chunk_url)
            data = response.read()
            chunk = np.frombuffer(data, info.data_type).reshape(
                (z1c - z0c, y1c - y0c, x1c - x0c))
        if z0c < z0:
            chunk = chunk[z0 - z0c:]
            z0c = z0
        if z1c > z1:
            chunk = chunk[:z1 - z0c]
            z1c = z1
        if y0c < y0:
            chunk = chunk[:, y0 - y0c:]
            y0c = y0
        if y1c > y1:
            chunk = chunk[:, :y1 - y0c]
            y1c = y1
        if x0c < x0:
            chunk = chunk[:, :, x0 - x0c:]
            x0c = x0
        if x1c > x1:
            chunk = chunk[:, :, :x1 - x0c]
            x1c = x1
        result[z0c - z0:z0c - z0 + chunk.shape[0],
               y0c - y0:y0c - y0 + chunk.shape[1],
               x0c - x0:x0c - x0 + chunk.shape[2]] = chunk
    return result
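The helpers _chunk_start and _chunk_end are used above but not shown in this excerpt; a plausible sketch, assuming they simply snap the requested range outward to chunk boundaries and clip to the volume:

def _chunk_start(coord, offset, stride):
    # Snap a starting coordinate down to the boundary of the chunk containing it.
    return (coord - offset) // stride * stride + offset

def _chunk_end(coord, offset, stride, end):
    # Snap an (exclusive) ending coordinate up to the next chunk boundary,
    # clipped to the end of the volume.
    return min(_chunk_start(coord - 1, offset, stride) + stride, end)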
Example #27
# -*- coding: utf-8 -*-
import scrapy
import re
from mySpider.items import sudaMainItem
import pymysql
import copy
from urllib.request import urlparse
from urllib.parse import urljoin
url = "http://eng.suda.edu.cn/suda_news/sdyw/202002/0c620fb0-aad7-4168-a3a4-a7c07442df98.html"
# domain
domain = urlparse(url).netloc
# scheme
scheme = urlparse(url).scheme + '://'
print(scheme + domain)


class SudaurlsSpider(scrapy.Spider):
    name = 'sudaurls'
    # allowed_domains = ['www.suda.edu.cn', 'aff.suda.edu.cn', 'eng.suda.edu.cn', 'file.suda.edu.cn',
    #                    'library.suda.edu.cn', 'mail.suda.edu.cn', 'csteaching.suda.edu.cn']
    start_urls = ['http://www.suda.edu.cn']
    basic_url = 'http://www.suda.edu.cn'
    table_count = 0
    url_pool = set()

    def parse(self, response):
        # self.count = self.count+1
        # print('This is page number', self.count)
        print('Currently crawling: ' + response.request.url.strip('*/'))
        print('Current URL pool size:', len(self.url_pool))
        titles = response.xpath('//a/@href').extract()
Example #28
    def build_url(self, local_path, **kwargs):

        # Make the path relative.
        local_path = local_path.strip('/')

        # We complain when we see non-normalized paths, as it is a good
        # indicator that unsanitized data may be getting through.
        # Mutating the scheme syntax to match is a little gross, but it works
        # for today.
        norm_path = os.path.normpath(local_path)
        if local_path.replace(
                '://', ':/') != norm_path or norm_path.startswith('../'):
            raise ValueError('path is not normalized')

        external = kwargs.pop('external', None) or kwargs.pop(
            '_external', None)
        scheme = kwargs.pop('scheme', None)
        if scheme and not external:
            raise ValueError('cannot specify scheme without external=True')
        if kwargs.get('_anchor'):
            raise ValueError('images have no _anchor')
        if kwargs.get('_method'):
            raise ValueError('images have no _method')

        # Remote URLs are encoded into the query.
        parsed = urlparse(local_path)
        if parsed.scheme or parsed.netloc:
            if parsed.scheme not in ALLOWED_SCHEMES:
                raise ValueError('scheme %r is not allowed' % parsed.scheme)
            kwargs['url'] = local_path
            local_path = '_'  # Must be something.

        # Local ones are not.
        else:
            abs_path = self.find_img(local_path)
            if abs_path:
                kwargs['version'] = encode_int(int(os.path.getmtime(abs_path)))

        # Prep the cache flag, which defaults to True.
        cache = kwargs.pop('cache', True)
        if not cache:
            kwargs['cache'] = ''

        # Prep the enlarge flag, which defaults to False.
        enlarge = kwargs.pop('enlarge', False)
        if enlarge:
            kwargs['enlarge'] = '1'

        # Prep the transform, which is a set of delimited strings.
        transform = kwargs.get('transform')
        if transform:
            if isinstance(transform, basestring):
                transform = re.split(r'[,;:_ ]', transform)
            # We replace delimiters with underscores, and percent with p, since
            # these won't need escaping.
            kwargs['transform'] = '_'.join(
                str(x).replace('%', 'p') for x in transform)

        # Sign the query.
        public_kwargs = ((LONG_TO_SHORT.get(k, k), v)
                         for k, v in kwargs.items()
                         if v is not None and not k.startswith('_'))
        query = urlencode(sorted(public_kwargs), True)
        signer = Signer(current_app.secret_key)
        sig = signer.get_signature('%s?%s' % (local_path, query)).decode()

        url = '%s/%s?%s&s=%s' % (
            current_app.config['IMAGES_URL'],
            urlquote(local_path),
            query,
            sig,
        )

        if external:
            url = '%s://%s%s/%s' % (scheme or request.scheme, request.host,
                                    request.script_root, url.lstrip('/'))

        return url
Example #29
 def _filter(elem):
     parsed = urlparse(elem['href'])
     return bool(parsed.netloc) and bool(
         parsed.scheme) and "rust-lang.org" not in parsed.netloc
Example #30
 def test_content(self):
     self.assertGreater(len(self.json_response), 0)
     parsed_image_url = url_request.urlparse(self.json_response[0])
     self.assertEqual(parsed_image_url.scheme, 'http')
     self.assertEqual(parsed_image_url.netloc, '127.0.0.1:8000')
     self.assertRegex(parsed_image_url.path, r'^/\d+$')
Example #31
 def parse(self) -> urlparse:
     """Returns `urllib.request.urlparse` result for given URL"""
     return urlparse(self.url)
Example #32
    def __init__(self, path=None, url=None, perform_init=True):
        self.stream = None
        file_scheme = "file:"
        self.using_temp_file = False

        if url is not None:
            url = str(url)
            if url.lower().startswith(file_scheme):
                url = url2pathname(url[len(file_scheme):])
                path = url

        self.path = path
        if path is None:
            if url.lower().startswith("omero:"):
                while True:
                    #
                    # We keep trying to contact the OMERO server via the
                    # login dialog until the user gives up or we connect.
                    #
                    try:
                        self.rdr = get_omero_reader()
                        self.path = url
                        if perform_init:
                            self.init_reader()
                        return
                    except jutil.JavaException as e:
                        je = e.throwable
                        if jutil.is_instance_of(
                                je, "loci/formats/FormatException"):
                            je = jutil.call(je, "getCause",
                                            "()Ljava/lang/Throwable;")
                        if jutil.is_instance_of(
                                je, "Glacier2/PermissionDeniedException"):
                            omero_logout()
                            omero_login()
                        else:
                            logger.warn(e.message)
                            for line in traceback.format_exc().split("\n"):
                                logger.warn(line)
                            if jutil.is_instance_of(
                                    je, "java/io/FileNotFoundException"):
                                raise IOError(
                                    errno.ENOENT,
                                    "The file, \"%s\", does not exist." % path,
                                    path)
                            e2 = IOError(
                                errno.EINVAL,
                                "Could not load the file as an image (see log for details)",
                                path.encode('utf-8'))
                            raise e2
            else:
                #
                # Other URLS, copy them to a tempfile location
                #
                ext = url[url.rfind("."):]
                src = urlopen(url)
                dest_fd, self.path = tempfile.mkstemp(suffix=ext)
                try:
                    with os.fdopen(dest_fd, 'wb') as dest:
                        shutil.copyfileobj(src, dest)
                except Exception:
                    os.remove(self.path)
                    raise
                finally:
                    src.close()
                self.using_temp_file = True
                urlpath = urlparse(url)[2]
                filename = unquote(urlpath.split("/")[-1])
        else:
            if sys.platform.startswith("win"):
                self.path = self.path.replace("/", os.path.sep)
            filename = os.path.split(path)[1]

        if not os.path.isfile(self.path):
            raise IOError(errno.ENOENT,
                          "The file, \"%s\", does not exist." % path, path)

        self.stream = jutil.make_instance(
            'loci/common/RandomAccessInputStream', '(Ljava/lang/String;)V',
            self.path)

        self.rdr = None
        class_list = get_class_list()
        find_rdr_script = """
        var classes = class_list.getClasses();
        var rdr = null;
        var lc_filename = java.lang.String(filename.toLowerCase());
        for (pass=0; pass < 3; pass++) {
            for (class_idx in classes) {
                var maybe_rdr = classes[class_idx].newInstance();
                if (pass == 0) {
                    if (maybe_rdr.isThisType(filename, false)) {
                        rdr = maybe_rdr;
                        break;
                    }
                    continue;
                } else if (pass == 1) {
                    var suffixes = maybe_rdr.getSuffixes();
                    var suffix_found = false;
                    for (suffix_idx in suffixes) {
                        var suffix = java.lang.String(suffixes[suffix_idx]);
                        suffix = suffix.toLowerCase();
                        if (lc_filename.endsWith(suffix)) {
                            suffix_found = true;
                            break;
                        }
                    }
                    if (! suffix_found) continue;
                }
                if (maybe_rdr.isThisType(stream)) {
                    rdr = maybe_rdr;
                    break;
                }
            }
            if (rdr) break;
        }
        rdr;
        """
        IFormatReader = make_iformat_reader_class()
        jrdr = jutil.run_script(
            find_rdr_script,
            dict(class_list=class_list, filename=filename, stream=self.stream))
        if jrdr is None:
            raise ValueError("Could not find a Bio-Formats reader for %s" %
                             self.path)
        self.rdr = IFormatReader()
        self.rdr.o = jrdr
        if perform_init:
            self.init_reader()
Example #33
def download_file(url, data_dir, resume=True, overwrite=False, verbose=0):
    """ Load requested file if needed or requested.

    Parameters
    ----------
    url: str
        the url of the file to be downloaded.
    data_dir: str
        path of the data directory.
    resume: bool (optional, default True)
        if True, try to resume partially downloaded files
    overwrite: bool (optional, default False)
        if True and file already exists, delete it.
    verbose: int (optional, default 0)
        control the verbosity level.

    Returns
    -------
    download_fname: str
        absolute path to the downloaded file.

    Note: If, for any reason, the download procedure fails, all downloaded
    files are removed.
    """
    # Create the download directory if necessary
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Determine filename using URL
    parse = urlparse(url)
    fname = os.path.basename(parse.path)

    # Generate the download file name
    download_fname = os.path.join(data_dir, fname)

    # Generate a temporary file for the download
    temp_fname = os.path.join(data_dir, fname + ".part")

    # If the file is already created remove it if the overwrite option is set
    # or return the file
    if os.path.exists(download_fname):
        if overwrite:
            os.remove(download_fname)
        else:
            return download_fname

    # If the temporary file is already created remove it if the overwrite
    # option is set
    if os.path.exists(temp_fname):
        if overwrite:
            os.remove(temp_fname)

    # Start a timer to evaluate the download time
    t0 = time.time()

    # Test if the dataset has been released
    try:
        urlopen(url)
    except Exception:
        raise ValueError(
            "The '{0}' dataset has not been released yet.".format(url))

    # Start downloading dataset
    local_file = None
    bytes_so_far = 0
    try:
        # Prepare the download
        if verbose > 0:
            print("Downloading data from {0}...".format(url))
        # Case 1: continue the downloading from an existing temporary file
        if resume and os.path.exists(temp_fname):
            url_opener = ResumeURLOpener()
            # Download has been interrupted, we try to resume it.
            local_file_size = os.path.getsize(temp_fname)
            # If the file exists, then only download the remainder
            url_opener.addheader("Range", "bytes={0}-".format(local_file_size))
            try:
                data = url_opener.open(url)
            except HTTPError:
                # There is a problem that may be due to resuming
                # Restart the downloading from scratch
                return download_file(url, data_dir, resume=False,
                                     overwrite=False)
            local_file = open(temp_fname, "ab")
            bytes_so_far = local_file_size
        # Case 2: just download the file
        else:
            data = urlopen(url)
            local_file = open(temp_fname, "wb")
        # Get the total file size
        try:
            total_size = data.info().get_all("Content-Length")[0].strip()
            total_size = int(total_size) + bytes_so_far
        except Exception as e:
            if verbose > 0:
                print("Total size could not be determined.")
            total_size = "?"

        # Download data
        chunk_size = 8192
        while True:
            # Read chunk
            chunk = data.read(chunk_size)
            # Stopping criterion
            if not chunk:
                break
            # Write to local file
            bytes_so_far += len(chunk)
            local_file.write(chunk)
            # Write report status and print a progress bar
            if isinstance(total_size, int):
                ratio = float(bytes_so_far) / float(total_size)
            else:
                ratio = 0
            progress_bar(ratio, title=os.path.basename(url))
        print()

        # Temporary file must be closed prior to the move
        if not local_file.closed:
            local_file.close()
        shutil.move(temp_fname, download_fname)

        # Get process duration and print it
        dt = time.time() - t0
        exit_message = ("Download was done in {0} minutes, {1: .2f} "
                        "seconds").format(int(numpy.floor(dt / 60)), dt % 60)
        if verbose > 0:
            print(exit_message)
    except HTTPError as e:
        raise Exception("{0}\nError while downloading file '{1}'. "
                        "Dataset download aborted.".format(e, fname))
    finally:
        # Temporary file must be closed
        if local_file is not None:
            if not local_file.closed:
                local_file.close()

    return download_fname
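ResumeURLOpener is referenced above but not defined in this excerpt. The usual pattern (a sketch, not necessarily this project's exact class) subclasses FancyURLopener so that the 206 Partial Content reply triggered by the Range header is not treated as an error:

from urllib.request import FancyURLopener

class ResumeURLOpener(FancyURLopener):
    # HTTP 206 means the server is sending the requested byte range,
    # which is exactly what we want when resuming a download.
    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        pass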
Example #34
    def __init__(self, path=None, url=None, perform_init=True):
        self.stream = None
        file_scheme = "file:"
        self.using_temp_file = False

        if url is not None:
            url = str(url)
            if url.lower().startswith(file_scheme):
                url = url2pathname(url[len(file_scheme):])
                path = url

        self.path = path
        if path is None:
            if url.lower().startswith("omero:"):
                while True:
                    #
                    # We keep trying to contact the OMERO server via the
                    # login dialog until the user gives up or we connect.
                    #
                    try:
                        self.rdr = get_omero_reader()
                        self.path = url
                        if perform_init:
                            self.init_reader()
                        return
                    except jutil.JavaException as e:
                        je = e.throwable
                        if jutil.is_instance_of(
                            je, "loci/formats/FormatException"):
                            je = jutil.call(je, "getCause",
                                            "()Ljava/lang/Throwable;")
                        if jutil.is_instance_of(
                            je, "Glacier2/PermissionDeniedException"):
                            omero_logout()
                            omero_login()
                        else:
                            logger.warn(e.message)
                            for line in traceback.format_exc().split("\n"):
                                logger.warn(line)
                            if jutil.is_instance_of(
                                je, "java/io/FileNotFoundException"):
                                raise IOError(
                                    errno.ENOENT,
                                    "The file, \"%s\", does not exist." % path,
                                    path)
                            e2 = IOError(
                                errno.EINVAL, "Could not load the file as an image (see log for details)", path.encode('utf-8'))
                            raise e2
            else:
                #
                # Other URLS, copy them to a tempfile location
                #
                ext = url[url.rfind("."):]
                src = urlopen(url)
                dest_fd, self.path = tempfile.mkstemp(suffix=ext)
                try:
                    with os.fdopen(dest_fd, 'wb') as dest:
                        shutil.copyfileobj(src, dest)
                except Exception:
                    os.remove(self.path)
                    raise
                finally:
                    src.close()
                self.using_temp_file = True
                urlpath = urlparse(url)[2]
                filename = unquote(urlpath.split("/")[-1])
        else:
            if sys.platform.startswith("win"):
                self.path = self.path.replace("/", os.path.sep)
            filename = os.path.split(path)[1]

        if not os.path.isfile(self.path):
            raise IOError(
                errno.ENOENT,
                "The file, \"%s\", does not exist." % path,
                path)

        self.stream = jutil.make_instance('loci/common/RandomAccessInputStream',
                                          '(Ljava/lang/String;)V',
                                          self.path)

        self.rdr = None
        class_list = get_class_list()
        find_rdr_script = """
        var classes = class_list.getClasses();
        var rdr = null;
        var lc_filename = java.lang.String(filename.toLowerCase());
        for (pass=0; pass < 3; pass++) {
            for (class_idx in classes) {
                var maybe_rdr = classes[class_idx].newInstance();
                if (pass == 0) {
                    if (maybe_rdr.isThisType(filename, false)) {
                        rdr = maybe_rdr;
                        break;
                    }
                    continue;
                } else if (pass == 1) {
                    var suffixes = maybe_rdr.getSuffixes();
                    var suffix_found = false;
                    for (suffix_idx in suffixes) {
                        var suffix = java.lang.String(suffixes[suffix_idx]);
                        suffix = suffix.toLowerCase();
                        if (lc_filename.endsWith(suffix)) {
                            suffix_found = true;
                            break;
                        }
                    }
                    if (! suffix_found) continue;
                }
                if (maybe_rdr.isThisType(stream)) {
                    rdr = maybe_rdr;
                    break;
                }
            }
            if (rdr) break;
        }
        rdr;
        """
        IFormatReader = make_iformat_reader_class()
        jrdr = jutil.run_script(find_rdr_script, dict(class_list = class_list,
                                                      filename = filename,
                                                      stream = self.stream))
        if jrdr is None:
            raise ValueError("Could not find a Bio-Formats reader for %s" % self.path)
        self.rdr = IFormatReader()
        self.rdr.o = jrdr
        if perform_init:
            self.init_reader()
Example #35
 def test_path(self):
     url = '//example.com:8042/over/there'
     a = fetch.urlparse(url).path
     b = request.urlparse(url).path
     self.assertEqual(a, b)
Example #36
def main(argv):
    ssl._create_default_https_context = ssl._create_unverified_context
    
    help_message = 'A link scraper in Python \n\n Usage: python scrapper.py [option] [argument] \n\n -u, --url = URL to crawl \n -c, --crawl [on/off] = turn crawling on or off, default=on \n -f, --file [filepath] = a file path to parse; crawling is disabled with this option \n -l, --lfiles = list of files to parse (one file per line) \n -w, --lwebsite = list of websites to check (one website per line); crawling is disabled with this option and for localhost URLs \n -S, --stdin [option] = accept stdin input. Available options are: "f" to pipe the content of an HTML file, "p" for a list of files, "w" for a list of websites, example: "cat listofwebsites.txt | python scrapper.py -S w"'
    badargument_message_url = "The only option that can be used with -u, --url is -c, --crawl"
    badargument_message_lwebsite = "The -w, --lwebsite option only takes a list of websites and shouldn't be combined with other parameters"
    badargument_message_stdin = "Stdin cannot be used with these options"
    try:
        opts, args = getopt.getopt(argv, "h:u:c:f:w:S:l:", ['help', 'url=', 'crawl=', 'file=', 'lfiles=', 'stdin=', 'lwebsite='])
    except getopt.GetoptError:
        printandexit(message=help_message)

    port: int = 3000
    crawl: bool = 0
    lwebsite: bool = 0
    lfiles: bool = 0
    urlselected: bool = 0
    fselect: bool = 0
    stdin: bool = 0
    given_url: str = "http://localhost"
    for opt, arg in opts:
        if opt == '-h':                         # help message
            printandexit(message=help_message)
        elif opt in ("-c", "--crawl"):         # activate/deactivate crawling
            if arg == "on":
                crawl = 1
            if arg == "off":
                crawl = 0
        elif opt in ("-f", "--file"):             # File path to parse
            fselect = 1
            crawl = 0                             # There is no crawling here, since there is no domain
            file_path = arg
            fname = ntpath.basename(arg)
            try:
                geturls(url=file_path, domain_name="", crawl=crawl, is_file=1)
            except IOError:
                print("Please choose a valid file path")
                sys.exit()
        elif opt in ("-u", "--url"):                 # url to crawl, decide whether it's a normal website or localhost
            urlselected = 1
            given_url = arg
            domain_name = urlparse(given_url).netloc
        elif opt in ("-S", "--stdin"):
            stdin = 1
            option = "stdin_file"
            if arg == "f":
                option = "stdin_file"
            if arg == "w":
                option = "lsites"
            if arg == "p":
                option = "path"
        elif opt in ("-w", "--lwebsite"):
            input_file = arg
            lwebsite = 1
        elif opt in ("-l", "--lfiles"):
            input_file = arg
            lfiles = 1
        else:
            print("Parameter not recognized: %s !\n" % opt)
            print(help_message)
    # Options validation: each of these combinations is invalid on its own,
    # so the checks are joined with `or` rather than `and`.
    if (urlselected == 1 and fselect == 1) or (urlselected == 1 and lwebsite == 1) or (urlselected == 1 and stdin == 1):
        printandexit(message=badargument_message_url)
    if (lwebsite == 1 and stdin == 1) or (lwebsite == 1 and fselect == 1):
        printandexit(message=badargument_message_lwebsite)
    if (lfiles == 1 and stdin == 1) or (stdin == 1 and fselect == 1):
        printandexit(message=badargument_message_stdin)
    # Process the selected options
    if lwebsite == 1:
        process_lwebsites(input_file=input_file, crawl=crawl)
    if lfiles == 1:
        crawl = 0
        process_lfiles(input_file=input_file, crawl=crawl, is_file=1)
    if fselect == 1:
        crawl = 0
    if stdin == 1:
        process_stdin(stdin=sys.stdin, option=option, crawl=crawl)
    if urlselected == 1:
        if "localhost" not in given_url:
            geturls(url=given_url, domain_name=domain_name, crawl=crawl, is_file=0)
        else:   # For localhost, it needs to pass the port. Eg: time python3 scrapper.py -u http://localhost:3000
            crawl = 0
            geturls(url=given_url, domain_name=given_url, crawl=crawl, is_file=0)
Example #37
def retrieve_url(url, filename, *, logger=None, uncompress=False, transmit_compressed=True,
                 update=False, check_certificates=True, name=None, timeout=60):
    """Return requested URL in filename

    :param url: the URL to retrieve
    :param filename: where to save the contents of the URL
    :param name: string to use to identify the data in status messages
    :param logger: logger instance to use for status and warning messages
    :param uncompress: if true, then uncompress the content
    :param update: if true, then existing file is okay if newer than web version
    :param check_certificates: if true, verify the server's TLS certificate
    :returns: None if an existing file, otherwise the content type
    :raises urllib.request.URLError or EOFError: if unsuccessful

    If 'update' and the filename already exists, fetch the HTTP headers for
    the URL and check the last modified date to see if there is a newer
    version or not.  If there isn't a newer version, return the filename.
    If there is a newer version, or if the filename does not exist,
    save the URL in the filename, and set the file's modified date to
    the HTTP last modified date, and return the filename.
    """
    import os
    import time
    from urllib.request import Request, urlopen, urlparse, URLError
    from chimerax import app_dirs
    from .errors import UserError
    if name is None:
        name = os.path.basename(filename)
    hostname = urlparse(url).hostname
    if _timeout_cache:
        if hostname in _timeout_cache:
            cur_time = time.time()
            prev_time = _timeout_cache[hostname]
            if prev_time + TIMEOUT_CACHE_VALID < cur_time:
                del _timeout_cache[hostname]
            else:
                raise UserError(f'{hostname} failed to respond')
    headers = {"User-Agent": html_user_agent(app_dirs)}
    request = Request(url, unverifiable=True, headers=headers)
    last_modified = None
    if update and os.path.exists(filename):
        if logger:
            logger.status('check for newer version of %s' % name, secondary=True)
        info = os.stat(filename)
        request.method = 'HEAD'
        try:
            with urlopen(request, timeout=timeout) as response:
                d = response.headers['Last-modified']
                last_modified = _convert_to_timestamp(d)
            if last_modified is None and logger:
                logger.warning('Invalid date "%s" for %s' % (d, request.full_url))
            if last_modified is None or last_modified <= info.st_mtime:
                return
        except URLError:
            pass
        request.method = 'GET'
    try:
        request.headers['Accept-encoding'] = 'gzip, identity' if transmit_compressed else 'identity'
        if check_certificates:
            ssl_context = None
        else:
            import ssl
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
        with urlopen(request, timeout=timeout, context=ssl_context) as response:
            compressed = uncompress
            ct = response.headers['Content-Type']
            if not compressed:
                ce = response.headers['Content-Encoding']
                if ce:
                    compressed = ce.casefold() in ('gzip', 'x-gzip')
                if ct:
                    compressed = compressed or ct.casefold() in (
                        'application/gzip', 'application/x-gzip')
                    ct = 'application/octet-stream'
            if logger:
                logger.info('Fetching%s %s from %s' % (
                    " compressed" if compressed else "", name,
                    request.get_full_url()))
            d = response.headers['Last-modified']
            last_modified = _convert_to_timestamp(d)
            content_length = response.headers['Content-Length']
            if content_length is not None:
                content_length = int(content_length)
            with open(filename, 'wb') as f:
                if compressed:
                    read_and_uncompress(response, f, name, content_length, logger)
                else:
                    read_and_report_progress(response, f, name, content_length, logger)
        if last_modified is not None:
            os.utime(filename, (last_modified, last_modified))
        if logger:
            logger.status('%s fetched' % name, secondary=True, blank_after=5)
        return ct
    except Exception as err:
        if os.path.exists(filename):
            os.remove(filename)
        if logger:
            logger.status('Error fetching %s' % name, secondary=True, blank_after=15)
        if isinstance(err, URLError) and isinstance(err.reason, TimeoutError):
            _timeout_cache[hostname] = time.time()
            raise UserError(f'{hostname} failed to respond')
        raise
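The _convert_to_timestamp helper is not part of this excerpt; a plausible sketch that turns an HTTP Last-Modified header into a POSIX timestamp (the real helper may differ):

from email.utils import parsedate_to_datetime

def _convert_to_timestamp(date_string):
    # Parse an RFC 1123 date such as 'Wed, 28 Nov 2012 15:00:00 GMT'.
    # Return None for a missing or malformed header.
    if not date_string:
        return None
    try:
        return parsedate_to_datetime(date_string).timestamp()
    except (TypeError, ValueError):
        return None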
Example #38
 def test_netloc_case(self):
     url = 'foo://EXAMPLE.com'
     a = fetch.urlparse(url).netloc
     b = request.urlparse(url).netloc
     self.assertEqual(a, b)
Example #39
    def handle_request(self, path):

        # Verify the signature.
        query = dict(request.args.items())
        old_sig = query.pop('s', None)
        if not old_sig:
            abort(404)
        signer = Signer(current_app.secret_key)
        new_sig = signer.get_signature(
            '%s?%s' % (path, urlencode(sorted(query.items()), True))).decode()
        if not constant_time_compare(old_sig.encode(), new_sig.encode()):
            abort(404)

        # Expand kwargs.
        query = dict((SHORT_TO_LONG.get(k, k), v) for k, v in query.items())

        remote_url = query.get('url')
        if remote_url:

            # This is redundant for newly built URLs, but not for those which
            # have already been generated and cached.
            parsed = urlparse(remote_url)
            if parsed.scheme not in ALLOWED_SCHEMES:
                abort(404)

            # Download the remote file.
            makedirs(current_app.config['IMAGES_CACHE'])
            path = os.path.join(
                current_app.config['IMAGES_CACHE'],
                hashlib.md5(remote_url.encode()).hexdigest() +
                os.path.splitext(parsed.path)[1])

            if not os.path.exists(path):
                log.info('downloading %s' % remote_url)
                tmp_path = path + '.tmp-' + str(os.getpid())
                try:
                    remote_file = urlopen(remote_url).read()
                except HTTPError as e:
                    # abort with remote error code (403 or 404 most times)
                    # log.debug('HTTP Error: %r' % e)
                    abort(e.code)
                else:
                    fh = open(tmp_path, 'wb')
                    fh.write(remote_file)
                    fh.close()
                call(['mv', tmp_path, path])
        else:
            path = self.find_img(path)
            if not path:
                abort(404)  # Not found.

        raw_mtime = os.path.getmtime(path)
        mtime = datetime.datetime.utcfromtimestamp(raw_mtime).replace(
            microsecond=0)
        # log.debug('last_modified: %r' % mtime)
        # log.debug('if_modified_since: %r' % request.if_modified_since)
        if request.if_modified_since and request.if_modified_since >= mtime:
            return '', 304

        mode = query.get('mode')

        transform = query.get('transform')
        transform = re.split(r'[;,_/ ]', transform) if transform else None

        background = query.get('background')
        width = query.get('width')
        width = int(width) if width else None
        height = query.get('height')
        height = int(height) if height else None
        quality = query.get('quality')
        quality = int(quality) if quality else 75
        format = (query.get('format', '') or os.path.splitext(path)[1][1:]
                  or 'jpeg').lower()
        format = {'jpg': 'jpeg'}.get(format, format)
        has_version = 'version' in query
        use_cache = query.get('cache', True)
        enlarge = query.get('enlarge', False)

        sharpen = query.get('sharpen')
        sharpen = re.split(r'[;,_/ ]', sharpen) if sharpen else None

        if use_cache:

            # The parts in this initial list were parameters cached in version 1.
            # In order to avoid regenerating all images when a new feature is
            # added, we append (feature_name, value) tuples to the end.
            cache_key_parts = [
                path, mode, width, height, quality, format, background
            ]
            if transform:
                cache_key_parts.append(('transform', transform))
            if sharpen:
                cache_key_parts.append(('sharpen', sharpen))
            if enlarge:
                cache_key_parts.append(('enlarge', enlarge))

            cache_key = hashlib.md5(repr(
                tuple(cache_key_parts)).encode()).hexdigest()
            cache_dir = os.path.join(current_app.config['IMAGES_CACHE'],
                                     cache_key[:2])
            cache_path = os.path.join(cache_dir, cache_key + '.' + format)
            cache_mtime = os.path.getmtime(cache_path) if os.path.exists(
                cache_path) else None

        mimetype = 'image/%s' % format
        cache_timeout = 31536000 if has_version else current_app.config[
            'IMAGES_MAX_AGE']

        if not use_cache or not cache_mtime or cache_mtime < raw_mtime:

            log.info('resizing %r for %s' % (path, query))
            image = Image.open(path)
            image = self.resize(
                image,
                background=background,
                enlarge=enlarge,
                height=height,
                mode=mode,
                transform=transform,
                width=width,
            )
            image = self.post_process(
                image,
                sharpen=sharpen,
            )

            if not use_cache:
                fh = StringIO()
                image.save(fh, format, quality=quality)
                return fh.getvalue(), 200, [
                    ('Content-Type', mimetype),
                    ('Cache-Control', str(cache_timeout)),
                ]

            makedirs(cache_dir)
            cache_file = open(cache_path, 'wb')
            image.save(cache_file, format, quality=quality)
            cache_file.close()

        return send_file(cache_path,
                         mimetype=mimetype,
                         cache_timeout=cache_timeout)
Example #40
def filterinput(input):
    x = urlparse(input)
    return x.hostname
Example #41
 def netloc(self):
     return '{0.scheme}://{0.netloc}'.format(urlparse(self.orig_url))
Example #42
def getcms(keyword):
    # , language, resPeople
    result = keyWordsCollection.find_one({"originKey": keyword})
    language = result["language"]
    resPeople = result["resPeople"]
    part = result["part"]
    station = result["station"]

    word = words.get(language)
    if not word:
        logging.info("没有适配语言:{}".format(language))
        # 改变关键词获取状态
        updateStatusKeyWord(keyword, part)
        return

    url = "http://api.serpprovider.com/5bfdf4cd7d33d1d77b9875d1/google/en-us/{}/{}".format(
        word, keyword)
    logging.info("请求数据,关键字:{},url:{}".format(keyword, url))
    html = sendRequest(url)  # 请求
    try:
        datas = json.loads(html)
    except Exception as e:
        return
    reslist = jsonpath.jsonpath(datas, "$..res")
    if reslist:
        reslist = reslist[0]
    else:
        logging.error("google搜索后没有数据:{}".format(url))
        # 改变关键词获取状态
        updateStatusKeyWord(keyword, part)
        return
    if not reslist:
        # 改变关键词获取状态
        updateStatusKeyWord(keyword, part)
        logging.error("google搜索后没有数据:{}".format(url))
        return
    for data in reslist:
        # scheme
        scheme = urlparse(data['url']).scheme
        # domain
        domain = urlparse(data['url']).netloc
        if not scheme or not domain:
            continue
        link = scheme + '://' + domain  # assemble the root link
        # check whether the domain is already in the in-memory cache
        if part == "GB":
            domainList = domainListGB
        else:
            domainList = domainListCL
        if domain in domainList:
            logging.warn("该域名已经获取,存在缓存中,domain:{}".format(domain))
            continue

        # check whether the domain is already in the database
        result = googleUrlCollection.find_one({"domain": domain})
        if result:
            logging.warning("Domain already fetched (found in database), domain: {}".format(domain))
            if result["part"] != part:
                webresultList = list(webResourcescollection.find({"url":
                                                                  link}))
                for result in webresultList:
                    if part == "clothes":
                        result["_id"] = result["_id"].replace(
                            "_GB_", "_clothes_")
                    else:
                        result["_id"] = result["_id"].replace(
                            "_clothes_", "_GB_")
                    try:
                        result["resPeople"] = resPeople
                        result["part"] = part
                        result["station"] = station
                        mongoResult = webResourcescollection.find_one(
                            {"_id": result["_id"]})
                        if not mongoResult:
                            webResourcescollection.insert(result)
                            logging.info("加入成功:{},_id:{}".format(
                                part, result["_id"]))
                    except Exception as e:
                        logging.error(e)

            continue

        # check whether it already exists under GB

        title = data['title']  # get the title
        describition = data['desc']  # get the description
        domainList.append(domain)
        sourceUrl = data["url"]
        insertItem(domain, link, sourceUrl, scheme, keyword, language,
                   resPeople, title, describition, word, part, station)

    # update the keyword fetch status
    updateStatusKeyWord(keyword, part)
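The link assembly inside the loop above boils each result URL down to scheme://domain; a standalone sketch of just that step, under the same scheme/netloc checks:

from urllib.parse import urlparse

def root_link(url):
    """Return 'scheme://domain' for a result URL, or None if either part is missing."""
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        return None
    return parsed.scheme + '://' + parsed.netloc

print(root_link('https://shop.example.com/products/item?id=1'))  # https://shop.example.com
print(root_link('www.example.com/products'))                     # None (no scheme or netloc)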
Ejemplo n.º 43
0
def main(argv):
    global BASE_DIR, SRC_DIR, ARCHIVE_DIR, DEBUG_OUTPUT, FALLBACK_URL, USE_TAR, USE_UNZIP
    global TOOL_COMMAND_PYTHON, TOOL_COMMAND_GIT, TOOL_COMMAND_HG, TOOL_COMMAND_SVN, TOOL_COMMAND_PATCH, TOOL_COMMAND_TAR, TOOL_COMMAND_UNZIP

    try:
        opts, args = getopt.getopt(
            argv,
            "ln:N:cCb:h",
            ["list", "name=", "name-file=", "clean", "clean-all", "base-dir", "bootstrap-file=", "local-bootstrap-file=", "use-tar", "use-unzip", "repo-snapshots", "fallback-url=", "force-fallback", "debug-output", "help"])
    except getopt.GetoptError:
        printOptions()
        return 0

    opt_names = []
    name_files = []
    opt_clean = False
    opt_clean_archives = False
    list_libraries = False

    default_bootstrap_filename = "bootstrap.json"
    bootstrap_filename = os.path.abspath(os.path.join(BASE_DIR, default_bootstrap_filename))
    local_bootstrap_filename = ""
    create_repo_snapshots = False
    force_fallback = False

    base_dir_path = ""

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printOptions()
            return 0
        if opt in ("-l", "--list"):
            list_libraries = True
        if opt in ("-n", "--name"):
            opt_names.append(arg)
        if opt in ("-N", "--name-file"):
            name_files.append(os.path.abspath(arg))
        if opt in ("-c", "--clean"):
            opt_clean = True
        if opt in ("-C", "--clean-all"):
            opt_clean = True
            opt_clean_archives = True
        if opt in ("-b", "--base-dir"):
            base_dir_path = os.path.abspath(arg)
            BASE_DIR = base_dir_path
            SRC_DIR = os.path.join(BASE_DIR, SRC_DIR_BASE)
            ARCHIVE_DIR = os.path.join(BASE_DIR, ARCHIVE_DIR_BASE)
            bootstrap_filename = os.path.join(BASE_DIR, default_bootstrap_filename)
            log("Using " + arg + " as base directory")
        if opt in ("--bootstrap-file",):
            bootstrap_filename = os.path.abspath(arg)
            log("Using main bootstrap file " + bootstrap_filename)
        if opt in ("--local-bootstrap-file",):
            local_bootstrap_filename = os.path.abspath(arg)
            log("Using local bootstrap file " + local_bootstrap_filename)
        if opt in ("--use-tar",):
            USE_TAR = True
        if opt in ("--use-unzip",):
            USE_UNZIP = True
        if opt in ("--repo-snapshots",):
            create_repo_snapshots = True
            log("Will create repository snapshots")
        if opt in ("--fallback-url",):
            FALLBACK_URL = arg
        if opt in ("--force-fallback",):
            force_fallback = True
            log("Using fallback URL to fetch all libraries")
        if opt in ("--debug-output",):
            DEBUG_OUTPUT = True

    if platform.system() != "Windows":
        # Unfortunately some IDEs do not have a proper PATH environment variable set,
        # so we search manually for the required tools in some obvious locations.
        paths_to_search = os.environ["PATH"].split(":") + ["/usr/local/bin", "/opt/local/bin", "/usr/bin"]
        TOOL_COMMAND_PYTHON = findToolCommand(TOOL_COMMAND_PYTHON, paths_to_search, required = True)
        TOOL_COMMAND_GIT = findToolCommand(TOOL_COMMAND_GIT, paths_to_search, required = True)
        TOOL_COMMAND_HG = findToolCommand(TOOL_COMMAND_HG, paths_to_search, required = True)
        TOOL_COMMAND_SVN = findToolCommand(TOOL_COMMAND_SVN, paths_to_search, required = True)
        TOOL_COMMAND_PATCH = findToolCommand(TOOL_COMMAND_PATCH, paths_to_search, required = True)
        TOOL_COMMAND_TAR = findToolCommand(TOOL_COMMAND_TAR, paths_to_search, required = USE_TAR)
        TOOL_COMMAND_UNZIP = findToolCommand(TOOL_COMMAND_UNZIP, paths_to_search, required = USE_UNZIP)

    if base_dir_path:
        os.chdir(base_dir_path)

    if name_files:
        for name_file in name_files:
            try:
                with open(name_file) as f:
                    opt_names_local = [l for l in (line.strip() for line in f) if l]
                    opt_names_local = [l for l in opt_names_local if l[0] != '#']
                    opt_names += opt_names_local
                    dlog("Name file contains: " + ", ".join(opt_names_local))
            except:
                log("ERROR: cannot parse name file " + name_file)
                return -1

    if force_fallback and not FALLBACK_URL:
        log("Error: cannot force usage of the fallback location without specifying a fallback URL")
        return -1;

    state_filename = os.path.join(os.path.dirname(os.path.splitext(bootstrap_filename)[0]), \
                                  "." + os.path.basename(os.path.splitext(bootstrap_filename)[0])) \
                     + os.path.splitext(bootstrap_filename)[1]

    dlog("bootstrap_filename = " + bootstrap_filename)
    dlog("state_filename     = " + state_filename)

    # read canonical libraries data
    data = readJSONData(bootstrap_filename)
    if data is None:
        return -1

    # some sanity checking
    for library in data:
        if library.get('name', None) is None:
            log("ERROR: Invalid schema: library object does not have a 'name'")
            return -1

    # read local libraries data, if available
    local_data = None
    if local_bootstrap_filename:
        local_data = readJSONData(local_bootstrap_filename)

        if local_data is None:
            return -1

        # some sanity checking
        for local_library in local_data:
            if local_library.get('name', None) is None:
                log("ERROR: Invalid schema: local library object does not have a 'name'")
                return -1

    # merge canonical and local library data, if applicable; local libraries take precedence
    if local_data is not None:
        for local_library in local_data:
            local_name = local_library.get('name', None)
            found_canonical_library = False
            for n, library in enumerate(data):
                name = library.get('name', None)
                if local_name == name:
                    data[n] = local_library # overwrite library
                    found_canonical_library = True
            if not found_canonical_library:
                data.append(local_library)

    if list_libraries:
        listLibraries(data)
        return 0

    sdata = []
    if os.path.exists(state_filename):
        sdata = readJSONData(state_filename)

    # create source directory
    if not os.path.isdir(SRC_DIR):
        log("Creating directory " + SRC_DIR)
        os.mkdir(SRC_DIR)

    # create archive files directory
    if not os.path.isdir(ARCHIVE_DIR):
        log("Creating directory " + ARCHIVE_DIR)
        os.mkdir(ARCHIVE_DIR)

    failed_libraries = []

    for library in data:
        name = library.get('name', None)
        source = library.get('source', None)
        post = library.get('postprocess', None)

        if (opt_names) and (not name in opt_names):
            continue

        lib_dir = os.path.join(SRC_DIR, name)

        dlog("********** LIBRARY " + name + " **********")
        dlog("lib_dir = " + lib_dir + ")")

        # compare against cached state
        cached_state_ok = False
        if not opt_clean:
            for slibrary in sdata:
                sname = slibrary.get('name', None)
                if sname is not None and sname == name and slibrary == library and os.path.exists(lib_dir):
                    cached_state_ok = True
                    break

        if cached_state_ok:
            log("Cached state for " + name + " equals expected state; skipping library")
            continue
        else:
            # remove cached state for library
            sdata[:] = [s for s in sdata if s.get('name', None) != name]

        # create library directory, if necessary
        if opt_clean:
            log("Cleaning directory for " + name)
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
        if not os.path.exists(lib_dir):
            os.mkdir(lib_dir)

        try:
            # download source
            if source is not None:
                if 'type' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'type'")
                    return -1
                if 'url' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'url'")
                    return -1
                src_type = source['type']
                src_url = source['url']

                if src_type == "sourcefile":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadFile(src_url, ARCHIVE_DIR, name, sha1, force_download = opt_clean_archives, user_agent = user_agent)
                        filename_rel = os.path.basename(src_url)
                        shutil.copyfile( os.path.join(ARCHIVE_DIR, filename_rel), os.path.join(lib_dir, filename_rel) )
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")

                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1] # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel, p[3], p[4], p[5]])
                            downloadFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download = True)
                            shutil.copyfile( os.path.join(ARCHIVE_DIR, filename_rel), os.path.join(lib_dir, filename_rel) )
                        else:
                            shutil.rmtree(lib_dir)
                            raise
                elif src_type == "archive":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadAndExtractFile(src_url, ARCHIVE_DIR, name, sha1, force_download = opt_clean_archives, user_agent = user_agent)
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")

                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1] # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel, p[3], p[4], p[5]])
                            downloadAndExtractFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download = True)
                        else:
                            raise

                else:
                    revision = source.get('revision', None)

                    archive_name = name + ".tar.gz" # for reading or writing of snapshot archives
                    if revision is not None:
                        archive_name = name + "_" + revision + ".tar.gz"

                    try:
                        if force_fallback:
                            raise RuntimeError
                        cloneRepository(src_type, src_url, name, revision)

                        if create_repo_snapshots:
                            log("Creating snapshot of library repository " + name)
                            repo_dir = os.path.join(SRC_DIR, name)
                            archive_filename = os.path.join(SNAPSHOT_DIR, archive_name)

                            dlog("Snapshot will be saved as " + archive_filename)
                            createArchiveFromDirectory(repo_dir, archive_filename, revision is None)

                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Cloning of repository " + src_url + " failed; trying fallback")

                            # copy archived snapshot from fallback location
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1], p[2] + "/" + SNAPSHOT_DIR_BASE + "/" + archive_name, p[3], p[4], p[5]])
                            dlog("Looking for snapshot " + fallback_src_url + " of library repository " + name)

                            # create snapshots files directory
                            downloadAndExtractFile(fallback_src_url, SNAPSHOT_DIR, name, force_download = True)

                            # reset repository state to particular revision (only using local operations inside the function)
                            cloneRepository(src_type, src_url, name, revision, True)
                        else:
                            raise
            else:
                # set up clean directory for potential patch application
                shutil.rmtree(lib_dir)
                os.mkdir(lib_dir)

            # post-processing
            if post is not None:
                if 'type' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'type'")
                    return -1
                if 'file' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'file'")
                    return -1
                post_type = post['type']
                post_file = post['file']

                if post_type == "patch":
                    applyPatchFile(post_file, name, post.get('pnum', DEFAULT_PNUM))
                elif post_type == "script":
                    runPythonScript(post_file)
                else:
                    log("ERROR: Unknown post-processing type '" + post_type + "' for " + name)
                    return -1

            # add to cached state
            sdata.append(library)

            # write out cached state
            writeJSONData(sdata, state_filename)
        except:
            log("ERROR: Failure to bootstrap library " + name + " (reason: " + str(sys.exc_info()[0]) + ")")
            traceback.print_exc()
            failed_libraries.append(name)

    if failed_libraries:
        log("***************************************")
        log("FAILURE to bootstrap the following libraries:")
        log(', '.join(failed_libraries))
        log("***************************************")
        return -1

    log("Finished")

    return 0
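The fallback branches above all rebuild the download URL the same way: keep only the filename from the failing source URL and splice it under the fallback location with urlunparse. A self-contained sketch of that rewrite, with an assumed ARCHIVE_DIR_BASE of 'archives' and made-up URLs:

import os
from urllib.parse import urlparse, urlunparse

ARCHIVE_DIR_BASE = 'archives'  # assumed value for illustration

def fallback_url(src_url, fallback_base):
    # keep only the original filename from the failing source URL
    filename_rel = os.path.split(urlparse(src_url).path)[1]
    p = urlparse(fallback_base)
    # re-assemble: fallback host + /<archive dir>/<filename>, preserving the other parts
    return urlunparse([p[0], p[1], p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel,
                       p[3], p[4], p[5]])

print(fallback_url('https://example.org/downloads/zlib-1.2.13.tar.gz',
                   'https://mirror.example.net/bootstrap'))
# -> https://mirror.example.net/bootstrap/archives/zlib-1.2.13.tar.gz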
Ejemplo n.º 44
0
def is_valid(url):
    #Checks whether `url` is a valid URL.
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
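A few spot checks of what this validator accepts and rejects (the function is repeated here so the example runs on its own); note that a bare hostname ends up in path rather than netloc, so it is rejected:

from urllib.parse import urlparse

def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

print(is_valid('https://example.com/page'))  # True
print(is_valid('//example.com/page'))        # False (netloc, but no scheme)
print(is_valid('www.example.com'))           # False (everything ends up in path)
print(is_valid('mailto:user@example.com'))   # False (scheme, but no netloc)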
Ejemplo n.º 45
0
def valid_url(url):
    parsed_url = urlparse(url)
    return bool(parsed_url.scheme)
Ejemplo n.º 46
0
 def test_netloc(self):
     url = 'foo://example.com'
     a = fetch.urlparse(url).netloc
     b = request.urlparse(url).netloc
     self.assertEqual(a, b)
Ejemplo n.º 47
0
def is_valid(url):
    ''' Check whether 'url' is a valid URL. '''
    if 'None' in url:
        return False
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
Ejemplo n.º 48
0
 def modify_urls(self, url_list):
     return [
         self._base_url + urlparse(url).query.split("=")[1].rstrip() + "/"
         for url in url_list
     ]
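The split on '=' above assumes the wanted ID is the first query parameter; a slightly more defensive sketch using parse_qs (the base URL and the 'id' parameter name are assumptions for illustration):

from urllib.parse import urlparse, parse_qs

BASE_URL = 'https://example.com/items/'  # hypothetical base URL

def modify_urls(url_list, param='id'):   # 'id' is an assumed parameter name
    out = []
    for url in url_list:
        qs = parse_qs(urlparse(url).query)
        if param in qs:
            out.append(BASE_URL + qs[param][0].strip() + '/')
    return out

print(modify_urls(['https://example.com/view?id=42']))
# -> ['https://example.com/items/42/']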
Ejemplo n.º 49
0
def sanitize_url(url):
    p = urlparse(url)
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4],
                      p[5]])  # quote special characters in the path
    return url
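A quick usage example of the path quoting above; only the path component is escaped, while the scheme, host, and query are left untouched:

from urllib.parse import urlparse, urlunparse, quote

def sanitize_url(url):
    p = urlparse(url)
    return urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])

print(sanitize_url('https://example.com/files/my report.pdf?v=2'))
# -> https://example.com/files/my%20report.pdf?v=2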
Ejemplo n.º 50
0
if __name__ == '__main__':
    import re
    from urllib.request import urlopen
    from urllib.parse import urlparse  # urlparse lives in urllib.parse, not urllib.request

    # Download the page and decode the bytes into a string
    text = urlopen(input()).read().decode()

    urls = set()

    # Find the links in the text
    for url in re.findall(r'''<a.+href=["'](.+?)['"].*>''',
                          text,
                          flags=re.MULTILINE):
        # Split the url string into its components
        result = urlparse(url)
        result = result.path if not result.netloc else result.netloc

        # Strip the port, if any
        if ':' in result:
            result = result.split(':')[0]

        # Skip relative links
        if result.startswith('../'):
            continue

        urls.add(result)

    for url in sorted(urls):
        print(url)
Ejemplo n.º 51
0
def getFilename(url):
    a = urlparse(url)
    b = os.path.basename(a.path)
    print(b)
    return b
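Going through urlparse first keeps the query string and fragment out of the returned filename, unlike a naive split on '/':

import os
from urllib.parse import urlparse

url = 'https://example.com/media/photo.jpg?size=large#preview'
print(os.path.basename(urlparse(url).path))  # photo.jpg
print(url.split('/')[-1])                    # photo.jpg?size=large#preview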
Ejemplo n.º 52
0
def get_links(page_url):
	host = urlparse(page_url)[1]
	page = download_page(page_url)
	links = extract_links(page)
	return [link for link in links if urlparse(link)[1] == host]
Ejemplo n.º 53
0
# import libraries
from __future__ import print_function
import urllib.request as urllib_request
from bs4 import BeautifulSoup
from builtins import input
from txt2pdf import txt2pdf
book = ""

# testing default
#quote_page = 'http://fullbooks.net/a-court-of-mist-and-fury/page-1-1076467.html'

# asks for the url of the book
quote_page = input("Insert the URL of the first page, example: 'https://novel22.net/a-court-of-thorns-and-roses/page-1-1076370.html': \n")

#parses the url to get information out
url = urllib_request.urlparse(quote_page)
data = url.path.split("/")

#get the book name out of data
name_of_book = data[1]

#gets the important numbers of the url
info = (data[2].replace(".html","")).split("-")

hash_number = int(info[-1]) -1
page_1 = 1

total_pages = int(input("Insert the total of pages:"))

print("Downloading {}".format((name_of_book.replace("-"," ")).title()))
Ejemplo n.º 54
0
    parser.add_argument("-m",
                        "--max-urls",
                        help="Number of max URLs to crawl, default is 30.",
                        default=30,
                        type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls
    crawl(url, max_urls)
    extract = tldextract.extract(url)

    #print the output: TLD, Domain, Hostname, Path, Links
    print('TLD: ' + extract.suffix)
    if extract.subdomain == '':
        print('Domain: ' + urlparse(url).netloc)
    else:
        print('Domain: ' + extract.domain + '.' + extract.suffix)
    print('Hostname: ' + urlparse(url).netloc)
    print('Path: ' + urlparse(url).path)
    print('LINKS:')
    print('\t' + "Same hostname: ")
    for link in same_host:
        print('\t\t' + link)
    print('\n')
    print('\t' + "Same domain: ")
    for link in same_domain:
        print('\t\t' + link)
    print('\n')
    print('\t' + "Different domain: ")
    for link in different_domain:
Ejemplo n.º 55
0
 def is_valid_url(self, url):  # pylint: disable=no-self-use
     parsed = urlparse(url)
     return bool(parsed.netloc) and bool(parsed.scheme)
Ejemplo n.º 56
0
 def test_netloc_no_scheme(self):
     url = '//example.com'
     a = fetch.urlparse(url).netloc
     b = request.urlparse(url).netloc
     self.assertEqual(a, b)
Ejemplo n.º 57
0
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
Ejemplo n.º 58
0
    'User-Agent':
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

colorama.init()

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

print(pyfiglet.figlet_format("Makdi", font='whimsy'))
print(
    "The program is configured to send the discovered links to the proxy, posting minimal output to stdout \n\n"
)

get_domain = str(input("Input a URL with http/https prepended: "))

domain = urlparse(get_domain).netloc  #extract domain

# variables that store final and buffer values
ilink = []
elink = []
in_links = set()  #stores internal links
ex_links = set()  #stores external links through passive discovery


def valid_url():
    # url validation
    try:
        global ip
        initial = requests.get(get_domain, headers=headers, verify=False)
        print("Connected. Proceeding further \n\n")
        crawl(get_domain)
Ejemplo n.º 59
0
def getQueryContent(alamatURL, strQuery):
    
    parsed = urlparse(alamatURL)
    QueryContent = str(urllib.parse.parse_qs(parsed.query)[strQuery][0])
    #QueryContent = str(urlparse.parse_qs(parsed.query)[strQuery][0])
    return QueryContent
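A short usage example, repeated in self-contained form; parse_qs returns a list per key (hence the [0]), and a missing parameter raises KeyError:

from urllib.parse import urlparse, parse_qs

def getQueryContent(alamatURL, strQuery):
    parsed = urlparse(alamatURL)
    return str(parse_qs(parsed.query)[strQuery][0])

print(getQueryContent('https://example.com/watch?v=abc123&t=42s', 'v'))  # abc123
print(getQueryContent('https://example.com/watch?v=abc123&t=42s', 't'))  # 42s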
Ejemplo n.º 60
0
def process(in_file, out_file):
    http = urllib3.PoolManager()
    url = request.pathname2url(in_file)
    mimetype = mimetypes.guess_type(url)[0]
    basename = ntpath.basename(in_file)

    with open(in_file, mode='rb') as fp:
        file_data = fp.read()

    r = http.request('POST',
                     UPLOAD_URL,
                     fields={'files': (basename, file_data, mimetype)})
    json_txt = r.data.decode('utf-8')
    json_obj = json.loads(json_txt)
    width = json_obj['dim']['cols']
    height = json_obj['dim']['rows']

    file_id = json_obj['id']
    im = Image.open(in_file)
    im = im.convert('RGB')
    draw = ImageDraw.Draw(im)

    width_ratio = im.width / width
    height_ratio = im.height / height

    for i in range(json_obj['balloonCount']):
        ballon = json_obj[str(i)]
        ballon_url = request.urlparse(ballon['originalURL'])
        fname = ntpath.basename(ballon_url.path)
        r = http.request('POST',
                         TRANSLATE_URL,
                         fields={
                             'fname': fname,
                             'id': file_id,
                             'lang': 'ja'
                         })
        json_translated = json.loads(r.data.decode('utf-8'))
        translatedText = json_translated['translatedText']
        if not translatedText:
            continue

        boundingRect = ballon['boundingRect']
        x0 = boundingRect['x']
        y0 = boundingRect['y']
        x1 = x0 + boundingRect['width']
        y1 = y0 + boundingRect['height']

        textRectCount = ballon['textRectCount']

        for rect in range(textRectCount):
            textRect = ballon['textRect'][str(rect)]
            x0 = math.floor(width_ratio * textRect['x'] + .5)
            y0 = math.floor(height_ratio * textRect['y'] + .5)
            x1 = x0 + math.floor(width_ratio * textRect['width'] + .5)
            y1 = y0 + math.floor(height_ratio * textRect['height'] + .5)
            draw.rectangle((x0, y0, x1, y1), fill=(255, 255, 255))

        currentTextRect = 0
        textRect = ballon['textRect'][str(currentTextRect)]
        target_x = math.floor(textRect['x'] * width_ratio + .5)
        target_y = math.floor(textRect['y'] * height_ratio + .5)
        target_width = math.floor(textRect['width'] * width_ratio + .5)
        target_height = math.floor(textRect['height'] * height_ratio + .5)
        start_x = target_x + target_width
        start_y = target_y
        linemaxsize = 0

        for ch in translatedText:
            ch_w, ch_h = draw.textsize(ch, spacing=0, font=FONT)
            if linemaxsize < ch_w:
                linemaxsize = ch_w
            if start_y + ch_h > target_y + target_height:
                start_x -= linemaxsize + LINE_SPACING
                start_y = target_y

            if start_x - ch_w < target_x:
                currentTextRect += 1
                if currentTextRect >= textRectCount:
                    break
                textRect = ballon['textRect'][str(currentTextRect)]
                target_x = math.floor(textRect['x'] * width_ratio + .5)
                target_y = math.floor(textRect['y'] * height_ratio + .5)
                target_width = math.floor(textRect['width'] * width_ratio + .5)
                target_height = math.floor(textRect['height'] * height_ratio +
                                           .5)
                start_x = target_x + target_width
                start_y = target_y
                linemaxsize = ch_w

            ch_x = start_x - ch_w
            ch_y = start_y
            start_y += ch_h + SPACING
            draw.text((ch_x, ch_y), ch, font=FONT, fill=(0, 0, 0))

    im.save(out_file)