def update_link(old_link, new_link, site_directory, file):
    """Rewrite asset references inside ``file`` so they point at local copies.

    Every occurrence of ``old_link`` in the file is replaced with
    ``new_link`` (backslashes normalised to ``/``). Afterwards every
    occurrence of ``site_directory`` (also normalised, with one trailing
    ``/`` dropped) is removed from the content, and the result is written
    back to the same file.

    Args:
        old_link: Link text as it currently appears in the file.
        new_link: Replacement link; backslashes are converted to ``/``.
        site_directory: Directory prefix to strip from the rewritten content.
        file: Path of the UTF-8 text file to update in place.

    Returns:
        True once the file has been rewritten.

    Raises:
        Any exception raised while opening, reading, decoding or writing
        the file propagates unchanged to the caller.
    """
    # normalise Windows-style separators so the replacements below match
    new_link = new_link.replace('\\', '/')
    site_directory = site_directory.replace('\\', '/')

    # log message to console
    log.log("Saving Asset | \"" + old_link + "\" [as] \"" + new_link + "\"", "asset_storage")

    # drop a single trailing slash so the separator that follows the
    # directory prefix survives the removal below
    if site_directory.endswith("/"):
        site_directory = site_directory[:-1]

    # read the current content; the context manager closes the handle even
    # when reading/decoding fails
    with open(file, 'r', encoding="utf8") as index:
        content = index.read()

    # replace original asset link with local equivalent, then strip the
    # site directory prefix
    new_content = content.replace(old_link, new_link)
    new_content = new_content.replace(site_directory, '')

    # replace file content with new version
    with open(file, 'w', encoding="utf8") as index:
        index.write(new_content)

    return True
def find_asset_urls(asset_tags):
    """Collect purified asset references from the attributes of ``asset_tags``.

    Each attribute value of each tag is converted to ``str`` and checked for
    a known asset file extension. Matching values are passed through
    ``aass.purify_url_string`` and appended to the result, in encounter
    order. Duplicates are NOT removed here.

    Args:
        asset_tags: Iterable of tag objects exposing ``attrs`` (iterable of
            attribute names) and ``get(name)``.
            NOTE(review): presumably parsed HTML tags; confirm with caller.

    Returns:
        list of purified asset reference strings.
    """
    # Compiled once instead of per attribute. IGNORECASE is deliberate: the
    # previous case-sensitive pattern spelled the WebP extension "webP" and
    # therefore never matched ordinary ".webp" (or ".PNG", ".JPG") files.
    asset_pattern = re.compile(
        r"\.(ico|png|webp|jpg|jpeg|gif|bmp|js|css|scss|sass|woff|svg|json|pdf|txt)",
        re.IGNORECASE)

    asset_urls = []

    # loop through all stored tags in asset_tags variable
    for asset in asset_tags:
        # loop through attributes and match regex for any desired filetype
        for attr in asset.attrs:
            # attribute values are not guaranteed to be strings
            val = str(asset.get(attr))

            if not asset_pattern.search(val):
                continue

            log.log('Found Asset | [' + attr + '] = ' + val, 'found_assets')

            # strip all unnecessary chars from the raw attribute value
            href = aass.purify_url_string(val)
            log.log(
                'Confirmed Asset Purified | [' + val + '] = ' + href,
                'confirmed_assets--purified')

            asset_urls.append(href)

    return asset_urls
def download_from_url(url, file_path, mobile_device):
    """Download ``url`` to ``file_path`` using a desktop or mobile User-Agent.

    Side effect: installs a process-wide urllib opener carrying the chosen
    User-Agent header (``urllib.request.install_opener``).

    Args:
        url: Address to fetch.
        file_path: Destination path handed to ``urlretrieve``.
        mobile_device: The exact string ``"false"`` selects the desktop
            User-Agent; any other value selects the mobile one.

    Returns:
        True when the download completed; False after an HTTP 404, an
        HTTP 500 or a ``URLError`` (each is logged).

    Raises:
        urllib.error.HTTPError: For any status other than 404 or 500.
        Any other exception from ``urlretrieve`` propagates unchanged.
    """
    log.log("Downloading URL | \"" + url + "\"", "url_download_attempt")

    desktop_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
    # NOTE(review): this string still contains literal template placeholders
    # (<Android Version>, <Build Tag etc.>, ...); confirm this is intended.
    mobile_agent = 'Mozilla/5.0 (Linux; <Android Version>; <Build Tag etc.>) AppleWebKit/<WebKit Rev> (KHTML, like Gecko) Chrome/<Chrome Rev> Mobile Safari/<WebKit Rev>'
    ua = desktop_agent if mobile_device == "false" else mobile_agent

    # set user agent to prevent 403 errors
    agent_opener = urllib.request.build_opener()
    agent_opener.addheaders = [('User-agent', ua)]
    urllib.request.install_opener(agent_opener)

    # HTTP statuses that are logged and reported as a failed download
    # instead of being raised
    tolerated_statuses = {
        404: "404 Error - File not found | \"",
        500: "500 Error - Internal Server Error | \"",
    }

    # try and download asset
    try:
        urllib.request.urlretrieve(url, file_path)
    except urllib.error.HTTPError as http_error:
        message_prefix = tolerated_statuses.get(http_error.code)
        if message_prefix is None:
            raise
        log.log(message_prefix + url + "\"", "error")
        return False
    except urllib.error.URLError:
        log.log("URL Error - | \"" + url + "\"", "URL-Error")
        return False

    return True
def build_url(asset_url, base_url):
    """Turn an asset reference into a URL, anchoring relative ones on ``base_url``.

    Rules, checked in order:
      1. ``//host/path``      -> ``https:`` + asset_url
      2. ``/path``            -> base_url + path without the leading slash
      3. starts with ``http`` -> returned unchanged
      4. contains ``../``     -> base_url + asset_url with every ``../`` removed
      5. starts with a-z      -> base_url + asset_url
      6. anything else        -> ``https://`` + asset_url

    Args:
        asset_url: Asset reference string.
        base_url: Prefix for relative references; it is concatenated as-is,
            so it is expected to end with ``/``.

    Returns:
        The resulting URL string.
    """
    log.log("Received Asset URL | \"" + asset_url + "\"", "asset_url_builder")

    # protocol-relative reference
    if asset_url.startswith("//"):
        return 'https:' + asset_url

    # root-relative reference: drop the leading slash before joining
    if asset_url.startswith("/"):
        return base_url + asset_url[1:]

    # already absolute
    if asset_url.startswith("http"):
        return asset_url

    # parent-relative reference: all "../" segments are simply discarded
    if "../" in asset_url:
        return base_url + asset_url.replace("../", "")

    # plain relative reference beginning with a lowercase ASCII letter
    # (the slice is empty for an empty string, which fails the comparison)
    if "a" <= asset_url[:1] <= "z":
        return base_url + asset_url

    # fallback: treat the value as a host/path lacking a scheme
    return "https://" + asset_url
def purify_url_string(url_string):
    """Strip CSS/HTML wrapping and trailing noise from an asset reference.

    One leading space is removed first. Then the following are deleted:
      * a leading ``//``
      * everything from the first ``?``, ``#`` or space to the end of the line
      * the CSS tokens ``background-image:`` and ``url`` (only when ``url``
        is directly followed by ``(``)
      * the characters ``(``, ``)``, ``"``, ``'`` and ``;``
    Finally surrounding whitespace is trimmed.

    Args:
        url_string: Raw attribute value or CSS fragment.

    Returns:
        The cleaned reference string.
    """
    url_string = re.sub(r"^ ", '', url_string)

    # "url" is only removed as part of a CSS url(...) wrapper. Removing it
    # unconditionally corrupted paths that merely contain those letters
    # (e.g. "/js/curl.js" became "/js/c.js").
    pure_string = re.sub(
        r"((^\/\/)|((\?|\#| ).*)|url(?=\()|\(|background-image:|\)|\"|\'|;)",
        '',
        url_string).strip()

    log.log('Link Purification | ' + url_string + ' becomes ' + pure_string , 'purified_links')
    return pure_string
def determine_root_url(url_str):
    """Return the ``scheme://netloc/`` root of ``url_str`` and log it.

    Args:
        url_str: URL to analyse with ``urlparse``.

    Returns:
        The scheme and network location of ``url_str`` joined as
        ``<scheme>://<netloc>/``.
    """
    parts = urlparse(url_str)
    root_url = '{}://{}/'.format(parts.scheme, parts.netloc)

    log.log("Root URL | \"" + root_url + "\"", "root_url")
    return root_url