Ejemplo n.º 1
0
def update_link(old_link, new_link, site_directory, file):
    """Rewrite ``file`` in place so that it references a locally stored asset.

    Every occurrence of ``old_link`` in the file is replaced with
    ``new_link``; afterwards every occurrence of ``site_directory`` is
    removed from the resulting text.

    Args:
        old_link: Link text to look for in the file.
        new_link: Replacement text. Backslashes are converted to forward
            slashes before use.
        site_directory: Directory prefix to strip from the content.
            Backslashes are converted to forward slashes and a single
            trailing slash is dropped before use.
        file: Path of the text file to rewrite (read and written as UTF-8).

    Returns:
        True once the file has been rewritten.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    new_link = new_link.replace('\\', '/')
    site_directory = site_directory.replace('\\', '/')
    log.log("Saving Asset | \"" + old_link + "\" [as] \"" + new_link + "\"",
            "asset_storage")
    if site_directory.endswith("/"):
        site_directory = site_directory[:-1]

    # Context managers guarantee the handle is closed even when reading or
    # writing fails part-way through.
    with open(file, 'r', encoding="utf8") as index:
        content = index.read()

    # Swap in the local link first, then strip the site directory prefix.
    new_content = content.replace(old_link, new_link)
    new_content = new_content.replace(site_directory, '')

    with open(file, 'w', encoding="utf8") as index:
        index.write(new_content)
    return True
Ejemplo n.º 2
0
def find_asset_urls(asset_tags):
    """Collect purified asset URLs from the attributes of the given tags.

    Each attribute value of each tag is converted to a string and checked
    for a known asset file extension. Matching values are passed through
    ``aass.purify_url_string`` and appended to the result.

    The extension check is case-insensitive, so ``.webp``, ``.WEBP`` and
    ``.PNG`` are all recognised. The pattern is not anchored: the extension
    may appear anywhere inside the attribute value.

    Args:
        asset_tags: Iterable of tag objects exposing ``attrs`` (iterated for
            attribute names) and ``get(name)``. Presumably BeautifulSoup
            tags; verify against the caller.

    Returns:
        A list of purified URL strings in discovery order. Duplicates are
        not filtered out.
    """
    # Compiled once per call instead of once per attribute.
    asset_pattern = re.compile(
        r"\.(ico|png|webP|jpg|jpeg|gif|bmp|js|css|scss|sass|woff|svg|json|pdf|txt)",
        re.IGNORECASE)
    asset_urls = []
    for asset in asset_tags:
        for attr in asset.attrs:
            val = str(asset.get(attr))
            if not asset_pattern.search(val):
                continue
            log.log('Found Asset | [' + attr + '] = ' + val,
                    'found_assets')
            # Strip CSS wrappers, quotes, query strings, etc. from the value.
            href = aass.purify_url_string(val)
            log.log(
                'Confirmed Asset Purified | [' + val + '] = ' + href,
                'confirmed_assets--purified')
            asset_urls.append(href)

    return asset_urls
Ejemplo n.º 3
0
def download_from_url(url, file_path, mobile_device):
    """Download ``url`` to ``file_path`` using a browser-like User-Agent.

    A desktop User-Agent is sent when ``mobile_device`` equals the string
    ``"false"``; any other value selects the mobile User-Agent. The opener
    carrying that header is installed globally via
    ``urllib.request.install_opener``.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path handed to ``urlretrieve``.
        mobile_device: The string ``"false"`` for the desktop User-Agent;
            anything else for the mobile one.

    Returns:
        True when the download succeeded; False when the server answered
        404 or 500, or when a ``URLError`` occurred (each case is logged).

    Raises:
        urllib.error.HTTPError: For HTTP status codes other than 404 and 500.
        Exception: Any other error raised by ``urlretrieve`` propagates
            unchanged.
    """
    log.log("Downloading URL | \"" + url + "\"", "url_download_attempt")

    desktop_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
    mobile_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>'
    user_agent = desktop_agent if mobile_device == "false" else mobile_agent

    # A browser-like User-Agent keeps servers from answering 403.
    url_opener = urllib.request.build_opener()
    url_opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(url_opener)

    try:
        urllib.request.urlretrieve(url, file_path)
    except urllib.error.HTTPError as http_error:
        status = http_error.code
        # Only 404 and 500 are treated as soft failures; the rest bubble up.
        if status not in (404, 500):
            raise
        if status == 404:
            log.log("404 Error - File not found | \"" + url + "\"", "error")
        else:
            log.log("500 Error - Internal Server Error | \"" + url + "\"",
                    "error")
        return False
    except urllib.error.URLError:
        log.log("URL Error -  | \"" + url + "\"", "URL-Error")
        return False

    return True
Ejemplo n.º 4
0
def build_url(asset_url, base_url):
    """Turn an asset reference into an absolute URL.

    The first matching rule wins:

    1. Starts with ``//``: ``https:`` is prepended.
    2. Starts with ``/``: the leading slash is dropped and the rest is
       appended to ``base_url``.
    3. Starts with ``http``: returned unchanged.
    4. Contains ``../`` anywhere: every ``../`` is removed and the rest is
       appended to ``base_url``.
    5. Starts with a lowercase ASCII letter: appended to ``base_url``.
    6. Anything else: ``https://`` is prepended.

    Args:
        asset_url: Asset reference as found in the page.
        base_url: Prefix for relative references. It is concatenated
            as-is, so it presumably ends with ``/``; verify against the
            caller.

    Returns:
        The resulting URL string.
    """
    log.log("Received Asset URL | \"" + asset_url + "\"", "asset_url_builder")
    if asset_url.startswith("//"):
        return 'https:' + asset_url
    if asset_url.startswith("/"):
        return base_url + asset_url[1:]
    if asset_url.startswith("http"):
        return asset_url
    if "../" in asset_url:
        return base_url + asset_url.replace("../", '')
    # "http" prefixes were already returned above, so no extra scheme check
    # is needed here; the value also cannot start with "/" at this point.
    if re.search(r"^[a-z]", asset_url):
        return base_url + asset_url
    # NOTE(review): references starting with "./", a digit or an uppercase
    # letter land here and are treated as a host name - confirm intended.
    return "https://" + asset_url
Ejemplo n.º 5
0
def purify_url_string(url_string):
    """Strip CSS/HTML decoration from a raw attribute value, leaving a URL.

    One leading space is dropped first. Then the following are removed:
    a leading ``//``; everything from the first ``?``, ``#`` or space to the
    end of that line; the ``url`` token of a CSS ``url(...)`` wrapper;
    parentheses; the text ``background-image:``; single and double quotes;
    and semicolons. Surrounding whitespace is stripped from the result.

    Only a ``url`` that is immediately followed by ``(`` is removed, so the
    letters "url" inside a path (``/url/x.png``, ``curl.js``) are kept.

    Args:
        url_string: Raw attribute value.

    Returns:
        The cleaned string.
    """
    url_string = re.sub(r"^ ", '', url_string)
    pure_string = re.sub(
        r"((^\/\/)|((\?|\#| ).*)|url(?=\()|\(|background-image:|\)|\"|\'|;)",
        '', url_string).strip()
    log.log('Link Purification | ' + url_string + ' becomes ' + pure_string , 'purified_links')
    return pure_string
Ejemplo n.º 6
0
def determine_root_url(url_str):
    """Return the ``scheme://netloc/`` root of ``url_str`` and log it.

    Args:
        url_str: URL to take the root from.

    Returns:
        The scheme and network location of ``url_str`` joined as
        ``scheme://netloc/``.
    """
    url_parts = urlparse(url_str)
    root_template = '{uri.scheme}://{uri.netloc}/'
    root = root_template.format(uri=url_parts)
    message = "Root URL | \"" + root + "\""
    log.log(message, "root_url")
    return root