Example #1
0
 def scrape(self, url, folder, timeout=1):
     """Save a complete copy of ``url`` and return the saved HTML location.

     Args:
         url: Address of the page to fetch; also handed to
             ``config.setup_config``.
         folder: Second argument of ``config.setup_config`` (presumably the
             download directory -- confirm against the pywebcopy version in
             use).
         timeout: Maximum number of seconds to wait on each sub-thread that
             is still running after the save was started.

     Returns:
         The ``file_path`` attribute of the ``WebPage`` object, i.e. the
         location of the HTML file that was written.
     """
     config.setup_config(url, folder)
     page = WebPage()
     page.get(url)
     # Start the saving process.
     page.save_complete()
     # Wait for the sub-threads; finished ones need no join.
     for worker in page._threads:
         if not worker.is_alive():
             continue
         # The timeout applies per thread, so a thread that is still busy
         # after ``timeout`` seconds is simply left running.
         worker.join(timeout)
     return page.file_path
    def scrape(url, folder, timeout=1):
        """Download ``url`` completely and return where its HTML was saved.

        Args:
            url: Page address; passed to ``config.setup_config`` and to
                ``WebPage.get``.
            folder: Second argument of ``config.setup_config`` (presumably
                the target directory -- confirm against pywebcopy).
            timeout: Seconds to wait, per thread, for each sub-thread that
                is still alive once the save has been started.

        Returns:
            The ``file_path`` attribute of the ``WebPage`` object.
        """
        config.setup_config(url, folder)

        webpage = WebPage()
        webpage.get(url)
        webpage.save_complete()

        # Lazy on purpose: liveness is checked right before each join, so a
        # thread that finished during an earlier wait is skipped.
        still_running = (t for t in webpage._threads if t.is_alive())
        for pending in still_running:
            pending.join(timeout)

        return webpage.file_path
Example #3
0
# Does not work with wlvpn.com: images, .js and .css files are not downloaded.
from pywebcopy import WebPage, config
# The same address is used for the configuration and for the fetch, so it is
# named once.
target_url = 'https://wlvpn.com/'

config.setup_config(target_url, "e:\\Upwork", "Upp")

# Fetch the page, then save it with WebPage.save_complete().
wp = WebPage()
wp.get(target_url)
wp.save_complete()
Example #4
0
from pywebcopy import WebPage
import logging
import os

# Configure the root logger with the logging module's defaults.
logging.basicConfig()

# Directory to work in; the script changes into it before saving anything.
directory = 'home'
os.chdir(directory)

# Pages to download, in the order they are fetched.
sites = [
    'https://www.entredeveloperslab.com/',
    'https://www.entredeveloperslab.com/testimonials',
    'https://www.entredeveloperslab.com/tune-in',
    'https://www.entredeveloperslab.com/wine-juicer'
]

# Fetch every page with a fresh WebPage object and save it.
for page_url in sites:
    wp = WebPage()
    wp.get(page_url)
    wp.save_complete()
Example #5
0
from pywebcopy import WebPage

# Wikipedia base URL; a singer name is appended to it to form the page address
url = 'https://en.wikipedia.org/wiki/'

# Folder the singers' HTML pages are downloaded into
download_folder = 'C:/Users/Daniele/Desktop/Università/Web Information Retrivial/progetto v2/ricercaCoppie/'

# Text file containing the list of singers, one name per line
singerfile = 'C:/Users/Daniele/Desktop/Università/Web Information Retrivial/progetto v2/ricercaListe/Singers.txt'

# Load the singer names from the text file into a Python list.
# rstrip("\n") is used instead of slicing off the last character so that a
# final line without a trailing newline keeps its last letter. Blank lines
# are skipped because an empty name would request the bare base URL.
with open(singerfile, encoding="utf8") as f:
    singerlist = [name for name in (line.rstrip("\n") for line in f) if name]

# Download and save the HTML page of every singer.
for singer in singerlist:
    wp = WebPage(url + singer, download_folder)
    wp.save_html()