from pywebcopy import save_website


def _run_copy(project_url, dest_path):
    """Mirror the site at project_url into dest_path, ignoring robots.txt."""
    save_website(
        url=project_url,
        project_folder=dest_path,
        bypass_robots=True,
    )
from datetime import datetime

from pywebcopy import save_website


def downloadWebPage(url, dirDownload):
    """
    Download a whole webpage from the specified url.

    url: The base url to download.
    dirDownload: Directory path where the webpage should be downloaded to.
    """
    beginTime = datetime.now()
    kwargs = {
        "bypass_robots": True,
        "project_name": "recognisable-name",
        "load_css": False,
        "load_images": False,
        "load_javascript": False,
    }
    try:
        save_website(url=url, project_folder=dirDownload, **kwargs)
    except Exception:
        print("Downloading webpage from '{0}' failed".format(url))
    print("Total run time taken by script: {0}".format(datetime.now() - beginTime))
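# A minimal usage sketch for downloadWebPage() above; the URL and the
# target directory are illustrative assumptions, not values from the
# original snippet.
downloadWebPage("https://example.com/", "./downloads")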
from pywebcopy import save_website

kwargs = {'project_name': 'website downloaded'}

save_website(url='https://www.tutorialspoint.com',
             project_folder='',  # an empty path is fragile; point this at a real, writable directory
             **kwargs)
import os
from urllib.request import urlopen  # Python 3 replacement for the original urllib2
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from pywebcopy import save_website


def crawl(pages, depth=1):
    """Collect absolute http(s) links reachable from the seed pages."""
    indexed_url = []
    for _ in range(depth):
        for page in pages:
            try:
                c = urlopen(page)
            except Exception:
                print("Could not open %s" % page)
                continue
            soup = BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')  # finding all the sub_links
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # strip in-page anchors
                    if url[0:4] == 'http':
                        indexed_url.append(url)
        pages = indexed_url  # crawl the discovered links on the next pass
    return indexed_url


pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
# print(urls)

url = 'http://www.reirab.com/comp551.html'
download_folder = os.getcwd()

kwargs = {'bypass_robots': True, 'project_name': 'PyWebCopy'}
save_website(url, download_folder, **kwargs)
from pywebcopy import save_website

kwargs = {'project_name': 'wdm'}

save_website(
    url='https://rednoise.org/teaching/wdm/',
    project_folder='./downloads',
    **kwargs
)
#!pip install pywebcopy
from pywebcopy import save_website

print("Which website do you want to mirror?\n")
_url = input()

print("And where do you want to save it? Insert the path to the folder:\n")
_project_folder = input()

save_website(
    url=_url,
    project_folder=_project_folder
)
# Import save_website from pywebcopy
from pywebcopy import save_website

# Set some download settings
kwargs = {'project_name': 'some-fancy-name'}

# Specify the URL and the destination folder
save_website(
    url='https://hackertyper.net/',
    project_folder="path/to/downloads",
    **kwargs
)
'''Copy a full website and a single webpage, test.'''
from pywebcopy import save_website, save_webpage

save_webpage(
    url='http://example.com/',
    project_folder='webpage/',
)

save_website(
    url='http://example.com/',  # a scheme is required; bare 'example.com/' fails
    project_folder='website/',
)
"""Usando una libreria para clonar sitios web""" import os from pywebcopy import save_website BASE_DIR = os.path.dirname(os.path.abspath(__file__)) path = os.path.join(BASE_DIR, 'web_clonadas') kwargs = {'project_name': 'usac'} save_website(url='https://portal.ingenieria.usac.edu.gt/', project_folder=path, **kwargs)
# Excerpt from a class: save_website (pywebcopy) and create_zip are
# imported/defined elsewhere in the module.
def pars(self) -> None:
    save_website(url=self.__url,
                 project_folder=self.__path,
                 zip_project_folder=False)
    create_zip(self.__path, self.__id)
from pywebcopy import save_website
import logging
import os

# get rid of logging
logging.basicConfig()

# download website
website = input('URL for website to be copied: \n')

# set a directory to work in
directory = os.getcwd()

# set project name
name = input('What do you want to name the folder with the information?\n')

# save the webpage (uncomment later, it is already saved)
save_website(url=website, project_folder=directory, project_name=name)

# get a list of the files and append to it
html_files = []
scraped_website_folder = f"{directory}/{name}"
#!/usr/bin/env python3
from pywebcopy import save_website

kwargs = {'project_name': 'webmirrors'}

save_website(
    url='http://help.sumologic.com/',
    project_folder='/var/tmp/',
    **kwargs
)
import threading

import pywebcopy

'''
Copying a whole website puts load on the servers of the site and in rare
cases could be illegal, so check everything before you proceed.
Choose a method and uncomment the one you like.
'''

# method 1:
'''
pywebcopy.config.setup_config(project_url='http://localhost:5000/',
                              project_folder='e://tests/',
                              project_name='LocalHost')
crawler = pywebcopy.Crawler('http://localhost:5000/')
crawler.crawl()
'''

# method 2:
'''
pywebcopy.save_website(page_url, download_folder)
'''

# pywebcopy.save_webpage(page_url, download_folder)
# pywebcopy.WebPage(page_url, html).save_html('e://tests//index.html')

# wp = pywebcopy.webpage.WebPage()
# wp.url = 'http://localhost:5000'
# wp.get('http://google.com/')
# wp.set_source(handle)
# pywebcopy.config.setup_config(wp.url, download_folder, 'LocalHost')
# wp.save_complete()

# Wait for any download threads to finish before exiting.
for thread in threading.enumerate():
    if thread == threading.main_thread():
        continue
    else:
        thread.join()
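# "Method 1" above, uncommented into a runnable sketch: it assumes a local
# server at http://localhost:5000/ and a writable ./tests folder, and uses
# the setup_config/Crawler calls exactly as they appear in the comments.
import pywebcopy

pywebcopy.config.setup_config(project_url='http://localhost:5000/',
                              project_folder='./tests/',
                              project_name='LocalHost')
crawler = pywebcopy.Crawler('http://localhost:5000/')
crawler.crawl()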
from pywebcopy import save_website
import globals  # project-local module, unused in this excerpt

save_website(
    url="http://www.stqc.gov.in",
    project_name="target_website",
    project_folder='./',
)
from pywebcopy import save_website

dir_name = 'login'
site_url = 'http://brandio.io/envato/iofrm/html/'

kwargs = {'project_name': dir_name}
save_website(url=site_url, project_folder=dir_name, **kwargs)
import os
import sys

from pywebcopy import save_webpage, save_website

# Excerpt from a command-line dispatcher; the original began mid-branch.
# print_usage() is defined elsewhere in the script (a hypothetical stub
# follows this excerpt), and the '-p' branch header is assumed by
# symmetry with the '-c' branch.
args = sys.argv[1:]

if args[0] == '-p':
    if len(args) < 2:
        print_usage()
        sys.exit(1)
    if len(args) == 2:
        print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
        save_webpage(args[1], os.getcwd())
    elif len(args) == 4 and args[2] == '-d':
        print("Saving {!r} in {!r}".format(args[1], args[3]))
        save_webpage(args[1], args[3])
    else:
        print_usage()
        sys.exit(1)
elif args[0] == '-c':
    if len(args) < 2:
        print_usage()
        sys.exit(1)
    if len(args) == 2:
        print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
        save_website(args[1], os.getcwd())
    elif len(args) == 4 and args[2] == '-d':
        print("Saving {!r} in {!r}".format(args[1], args[3]))
        save_website(args[1], args[3])
    else:
        print_usage()
        sys.exit(1)
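# print_usage() is referenced but not shown above; a hypothetical stub
# whose wording and '-p' flag are assumed from the branches the
# dispatcher handles, not taken from the original script:
def print_usage():
    print("usage: script (-p | -c) URL [-d FOLDER]")
    print("  -p  save a single webpage")
    print("  -c  copy the whole website")
    print("  -d  destination folder (defaults to the current directory)")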
# Excerpt from a CLI wrapper: 'parser' and 'args' come from the argparse
# setup earlier in the file (see the sketch below).
if args.location and not isinstance(args.location, six.string_types):
    parser.error("--location option requires 1 string type argument")
if args.name and not isinstance(args.name, six.string_types):
    parser.error("--name option requires 1 string type argument")

if args.page:
    save_webpage(
        url=args.url,
        project_folder=args.location,
        bypass_robots=args.bypass_robots,
        open_in_browser=args.pop,
        debug=not args.quite,  # note: the flag is spelled 'quite' in this CLI
        delay=args.delay,
        threaded=args.threaded,
    )
elif args.site:
    save_website(
        url=args.url,
        project_folder=args.location,
        bypass_robots=args.bypass_robots,
        open_in_browser=args.pop,
        debug=not args.quite,
        delay=args.delay,
        threaded=args.threaded,
    )
elif args.tests:
    os.system('%s -m unittest discover -s pywebcopy/tests' % sys.executable)
else:
    parser.print_help()
    sys.exit(1)
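# For reference, a sketch of the argument parser the dispatch above
# assumes. Only the option names are inferred from the attribute accesses
# (args.url, args.location, args.page, ...); the help strings and defaults
# are assumptions, not the library's actual CLI definition.
import argparse

parser = argparse.ArgumentParser(prog='pywebcopy')
parser.add_argument('url', nargs='?', help='URL of the page or site to copy')
parser.add_argument('--location', default='', help='folder to store the copy in')
parser.add_argument('--name', default='', help='project name for the copy')
parser.add_argument('--page', action='store_true', help='copy a single webpage')
parser.add_argument('--site', action='store_true', help='copy the whole website')
parser.add_argument('--bypass_robots', action='store_true', help='ignore robots.txt')
parser.add_argument('--pop', action='store_true', help='open the result in a browser')
parser.add_argument('--quite', action='store_true', help='suppress debug output')
parser.add_argument('--delay', type=float, default=None, help='delay between requests')
parser.add_argument('--threaded', action='store_true', help='download in parallel threads')
parser.add_argument('--tests', action='store_true', help='run the test suite')
args = parser.parse_args()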