def scrape(self, url, folder, timeout=1):
    """Fetch ``url`` through a ``WebPage``, save it completely and return its path.

    Args:
        url: Address passed to ``config.setup_config`` and to ``WebPage.get``.
        folder: Second positional argument of ``config.setup_config``
            (presumably the destination directory -- confirm against the
            config API).
        timeout: Value handed to ``join`` for every sub thread that is still
            alive after the save was started.

    Returns:
        The ``file_path`` attribute of the page, i.e. the location of the
        html file written.
    """
    config.setup_config(url, folder)

    page = WebPage()
    page.get(url)

    # Kick off the saving process.
    page.save_complete()

    # Lazily pick the sub threads that are still alive, so each liveness
    # check happens right before the corresponding join.
    unfinished = (worker for worker in page._threads if worker.is_alive())
    for worker in unfinished:
        worker.join(timeout)

    return page.file_path
def scrape(url, folder, timeout=1):
    """Save the page at ``url`` via ``WebPage.save_complete`` and return its path.

    Args:
        url: Address passed to ``config.setup_config`` and ``WebPage.get``.
        folder: Second positional argument of ``config.setup_config``
            (presumably the output directory -- confirm against the config
            API).
        timeout: Argument given to ``join`` on each sub thread that is
            still alive once saving has been started.

    Returns:
        The ``file_path`` attribute of the ``WebPage`` instance.
    """
    config.setup_config(url, folder)

    webpage = WebPage()
    webpage.get(url)
    webpage.save_complete()

    for sub_thread in webpage._threads:
        # Threads that have already finished need no join.
        if not sub_thread.is_alive():
            continue
        sub_thread.join(timeout)

    return webpage.file_path
def crawl(self, project_name):
    """Configure a crawl of https://www.thedailystar.net/ and run it.

    The configuration passed to ``config.setup_config`` is fixed except for
    the project name: css, images and javascript loading are switched off,
    ``bypass_robots`` is ``False`` and ``over_write`` is ``True``.

    Args:
        project_name: Forwarded as the ``project_name`` configuration value.

    Returns:
        None.
    """
    config.setup_config(
        project_url='https://www.thedailystar.net/',
        project_folder='thedailystar',
        project_name=project_name,
        bypass_robots=False,
        load_css=False,
        load_images=False,
        load_javascript=False,
        over_write=True
    )

    crawler = Crawler()
    crawler.crawl()
def crawl(url, folder, timeout=1):
    """Crawl starting from ``url`` with a ``Crawler`` and return its ``file_path``.

    Args:
        url: Address passed to ``config.setup_config`` and ``Crawler.get``.
        folder: Second positional argument of ``config.setup_config``
            (presumably the output directory -- confirm against the config
            API).
        timeout: Argument given to ``join`` on each sub thread that is
            still alive after the crawl call returns.

    Returns:
        The ``file_path`` attribute of the crawler, i.e. the location of the
        html file written.
    """
    config.setup_config(url, folder)

    crawler = Crawler()
    crawler.get(url)

    # Start the saving process.
    crawler.crawl()

    # Join whichever sub threads have not finished yet.
    sub_threads = crawler._threads
    for running in sub_threads:
        if running.is_alive():
            running.join(timeout)

    return crawler.file_path
# Known issue: this does not work with wlvpn.com -- images, .js and .css
# files are not downloaded.
from pywebcopy import WebPage, config

# The same address is used for the project configuration and for the fetch.
site_url = 'https://wlvpn.com/'

config.setup_config(site_url, "e:\\Upwork", "Upp")

wp = WebPage()
wp.get(site_url)
wp.save_complete()
from pywebcopy import Crawler, config

# NOTE(review): `kwargs` is built here but is not passed to
# `config.setup_config` below -- confirm whether that is intended.
kwargs = dict(
    zip_project_folder=False,
    allowed_file_ext=[
        '.html', '.php', '.asp', '.aspx', '.htm', '.xhtml', '.css',
        '.json', '.js', '.xml', '.svg', '.gif', '.ico', '.jpeg', '.pdf',
        '.jpg', '.png', '.ttf', '.eot', '.otf', '.woff', '.woff2', '.pwcf'
    ]
)

config.setup_config(
    project_url='https://rednoise.org/teaching/wdm/',
    project_folder='./downloads2',
    project_name='wdm'
)

crawler = Crawler()
crawler.crawl()

# Disabled options kept for reference:
#   allowed_file_ext=['.html', '.php', '.asp', '.aspx', '.htm', '.xhtml',
#                     '.css', '.json', '.js', '.xml', '.svg', '.gif', '.ico',
#                     '.jpeg', '.pdf', '.jpg', '.png', '.ttf', '.eot', '.otf',
#                     '.woff', '.woff2', '.pwcf'],
#   'over_write': True,  <- did not work properly