def scrape(self, url, folder, timeout=1):
    """Save a complete copy of the page at *url* and return its file path.

    Args:
        url: Address of the page to fetch; also passed to
            ``config.setup_config``.
        folder: Second argument to ``config.setup_config`` (presumably the
            destination folder -- confirm against the pywebcopy docs).
        timeout: Maximum number of seconds to wait on each still-running
            worker thread.

    Returns:
        ``file_path`` of the page object, i.e. the location of the HTML
        file that was written.
    """
    config.setup_config(url, folder)

    page = WebPage()
    page.get(url)

    # Kick off the saving process.
    page.save_complete()

    # Wait for the sub threads; ones that already finished are skipped.
    for worker in page._threads:
        if not worker.is_alive():
            continue
        worker.join(timeout)

    return page.file_path
def scrape(url, folder, timeout=1):
    """Save a complete copy of the page at *url* and return its file path.

    Args:
        url: Address of the page to fetch; also passed to
            ``config.setup_config``.
        folder: Second argument to ``config.setup_config`` (presumably the
            destination folder -- confirm against the pywebcopy docs).
        timeout: Maximum number of seconds to wait on each still-running
            worker thread.

    Returns:
        The ``file_path`` attribute of the page object after saving.
    """
    config.setup_config(url, folder)

    page = WebPage()
    page.get(url)
    page.save_complete()

    # The generator is lazy, so each thread is checked for liveness right
    # before it is joined.
    still_running = (worker for worker in page._threads if worker.is_alive())
    for worker in still_running:
        worker.join(timeout)

    return page.file_path
# Known issue: this does not work with wlvpn.com -- images, .js and .css
# files are not downloaded.
from pywebcopy import WebPage, config

# The same address is used for both the configuration and the fetch.
target_url = 'https://wlvpn.com/'

# NOTE(review): the second and third arguments look like the destination
# folder and a project name -- confirm against the pywebcopy docs.
config.setup_config(target_url, "e:\\Upwork", "Upp")

page = WebPage()
page.get(target_url)
page.save_complete()
import logging
import os

from pywebcopy import WebPage

logging.basicConfig()

# URLs of the pages to download.
sites = [
    'https://www.entredeveloperslab.com/',
    'https://www.entredeveloperslab.com/testimonials',
    'https://www.entredeveloperslab.com/tune-in',
    'https://www.entredeveloperslab.com/wine-juicer',
]

# Directory to work in; the process switches into it before downloading.
directory = 'home'
os.chdir(directory)

# Save every page with a fresh WebPage object.
# NOTE: an earlier comment said the pages were already saved and that this
# loop was meant to be toggled by commenting it out.
for page_url in sites:
    page = WebPage()
    page.get(page_url)
    page.save_complete()
from pywebcopy import WebPage

# Base Wikipedia URL; each singer name is appended to it.
url = 'https://en.wikipedia.org/wiki/'

# Download folder for the singers' HTML pages.
download_folder = 'C:/Users/Daniele/Desktop/Università/Web Information Retrivial/progetto v2/ricercaCoppie/'

# Text file containing the list of singers, one name per line.
singerfile = 'C:/Users/Daniele/Desktop/Università/Web Information Retrivial/progetto v2/ricercaListe/Singers.txt'

# Read the singer names from the text file into a Python list.
# Only the line terminator is stripped: slicing with [:-1] would also cut
# the final character of a last line that has no trailing newline.
# Empty lines are skipped so the bare base URL is never requested.
with open(singerfile, encoding="utf8") as f:
    singerlist = [name for name in (line.rstrip('\n') for line in f) if name]

# Download and save the HTML page of every singer.
for singer in singerlist:
    wp = WebPage(url + singer, download_folder)
    wp.save_html()