Beispiel #1
0
 def scrape(self, url, folder, timeout=1):
     """Download a complete copy of *url* into *folder*.

     Gives every still-running background worker up to *timeout*
     seconds to finish before returning.

     :returns: location of the HTML file written to disk.
     """
     config.setup_config(url, folder)

     page = WebPage()
     page.get(url)

     # kick off the asynchronous save
     page.save_complete()

     # wait briefly for the download worker threads
     for worker in page._threads:
         if worker.is_alive():
             worker.join(timeout)

     return page.file_path
    def scrape(url, folder, timeout=1):
        """Save a full offline copy of *url* under *folder*.

        Waits up to *timeout* seconds per worker thread, then
        returns the path of the HTML file that was written.
        """
        config.setup_config(url, folder)

        page = WebPage()
        page.get(url)

        page.save_complete()

        # let the background threads wind down
        for thread in page._threads:
            if thread.is_alive():
                thread.join(timeout)

        return page.file_path
Beispiel #3
0
    def crawl(self, project_name):
        """Crawl www.thedailystar.net into the ``thedailystar`` folder.

        Only plain HTML is fetched: CSS, images and JavaScript are
        disabled, robots.txt is respected, and existing output is
        overwritten. *project_name* labels the run.
        """
        config.setup_config(
            project_url='https://www.thedailystar.net/',
            project_folder='thedailystar',
            project_name=project_name,
            bypass_robots=False,
            load_css=False,
            load_images=False,
            load_javascript=False,
            over_write=True,
        )

        crawler = Crawler()
        crawler.crawl()
Beispiel #4
0
def crawl(url, folder, timeout=1):
    """Recursively mirror *url* into *folder*.

    Each background worker thread gets up to *timeout* seconds to
    finish before the function returns.

    :returns: location of the HTML file written to disk.
    """
    config.setup_config(url, folder)

    crawler = Crawler()
    crawler.get(url)

    # begin the recursive saving process
    crawler.crawl()

    # give the worker threads a chance to complete
    for worker in crawler._threads:
        if worker.is_alive():
            worker.join(timeout)

    return crawler.file_path
Beispiel #5
0
# NOTE: does not work fully with wlvpn.com — images, .js and .css
# files are not downloaded.
from pywebcopy import WebPage, config

config.setup_config('https://wlvpn.com/', "e:\\Upwork", "Upp")

page = WebPage()
page.get('https://wlvpn.com/')
page.save_complete()
Beispiel #6
0
from pywebcopy import Crawler, config

# NOTE(review): the original example built a ``kwargs`` dict
# (``zip_project_folder=False`` plus an ``allowed_file_ext``
# whitelist) but never passed it to ``setup_config``, so it had no
# effect — that dead code has been removed. To restrict file types,
# pass the option explicitly, e.g.:
#   allowed_file_ext=['.html', '.php', '.asp', '.aspx', '.htm',
#                     '.xhtml', '.css', '.json', '.js', '.xml',
#                     '.svg', '.gif', '.ico', '.jpeg', '.pdf',
#                     '.jpg', '.png', '.ttf', '.eot', '.otf',
#                     '.woff', '.woff2', '.pwcf']
#   over_write=True   # <- reported by the author as not working properly

# Mirror the course site into ./downloads2 under project name 'wdm'.
config.setup_config(project_url='https://rednoise.org/teaching/wdm/',
                    project_folder='./downloads2',
                    project_name='wdm')

crawler = Crawler()
crawler.crawl()