def sitemaper(self):
    """Crawl the configured target URL, write sitemap.txt, then hand the
    downloaded URL list to ``self.read_ref`` for further processing."""
    from pysitemap import crawler

    # Normalise the root URL so it always ends with a slash.
    if self.inputtargeturl.endswith('/'):
        self.root_url = self.inputtargeturl
    else:
        self.root_url = self.inputtargeturl + '/'

    sitemap_path = os.path.join(self.OSGETCWD, 'sitemap.txt')
    crawler(self.root_url, out_file=sitemap_path, out_format='txt')

    # Read the crawl output back as text lines (bytes decoded explicitly).
    with open(sitemap_path, 'rb') as target:
        self.all_lines = [raw.decode('utf-8') for raw in target.readlines()]
    self.read_ref(self.all_lines, '''Downloading HTML files...''')
def sitemap_gen(link, link_key):
    """Crawl ``link`` and write the result to ``sitemap_<link_key>.xml``.

    When ``--iocp`` is present on the command line, installs a Windows
    ProactorEventLoop before crawling (and strips the flag from argv).
    """
    if '--iocp' in sys.argv:
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        loop = windows_events.ProactorEventLoop()
        events.set_event_loop(loop)
    crawler(link, out_file=f'sitemap_{link_key}.xml')
def generate_site_maps(end_date, section_list):
    """Crawl each Economist section and save its sitemap under ./output.

    URLs containing any date path from 1950-01-01 through ``end_date`` are
    excluded, as are pagination ('?page') and session-variable (';var') links.
    """
    excluded = [day.strftime('%Y/%m/%d')
                for day in pd.date_range('19500101', end_date)]
    excluded += ['?page', ';var']

    for section in section_list:
        root_url = f"https://www.economist.com/{section}"
        print(f"Processing section : {section} | URL = {root_url}")
        crawler(root_url,
                out_file=f"./output/sitemap_{section}.xml",
                exclude_urls=excluded)

    return ('Site maps generated and saved to the output folder')
import sys
import logging, requests

from pysitemap import crawler

# Target site to crawl; can be overridden by editing this constant.
root_url = 'https://pixinvent.com/materialize-material-design-admin-template/html/ltr/vertical-modern-menu-template/'

if __name__ == '__main__':
    # Optional Windows IOCP event loop support via the --iocp flag.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)
    crawler(root_url, out_file='sitemap.txt', out_format='txt')


def createsitemap():
    """Install a Windows IOCP event loop when '--iocp' was passed.

    NOTE(review): the created loop is never installed here (the
    set_event_loop call is absent in the original), so this currently
    only strips the flag and logs.
    """
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Command-line sitemap crawler: crawl --domain with --task workers."""
import argparse
import logging

from pysitemap import crawler

parser = argparse.ArgumentParser(description='Python SiteMap Crawler')
parser.add_argument('--domain', default="https://www.lipsum.com",
                    help="Crawler Domain")
parser.add_argument('--task', type=int, default=10, help="Task count")
arg = parser.parse_args()

if __name__ == '__main__':
    crawler(arg.domain, out_file='sitemap.xml', maxtasks=arg.task)
"""Crawl franceinter.fr and write sitemap.xml, skipping binary assets."""
import sys
import logging

from pysitemap import crawler

if __name__ == '__main__':
    # Optional Windows IOCP event loop support via the --iocp flag.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        events.set_event_loop(windows_events.ProactorEventLoop())
    root_url = 'https://www.franceinter.fr/emissions/very-good-trip/'
    crawler(root_url,
            out_file='sitemap.xml',
            exclude_urls=[".pdf", ".jpg", ".zip"])
default=5, help= '''Duration (in seconds) between retries if a network request ends in failure.''' ) parser.add_argument('--retry-amount', default=5, help='''Number of retry attempts before giving up.''') return parser.parse_args() if __name__ == '__main__': ARGS = parse_args() if ARGS.iocp: from asyncio import events, windows_events sys.argv.remove('--iocp') logging.info('using iocp') el = windows_events.ProactorEventLoop() events.set_event_loop(el) load = ARGS.load_data # root_url = sys.argv[1] #root_url = 'https://www.hellofresh.com/recipes' root_url = 'https://www.allrecipes.com/' crawler(root_url, out_file='sitemap.xml', batch_size=10000, prefix='https://www.allrecipes.com/recipe/', load=load)
"""Crawl quizlet.com and write the result to sitemap.xml."""
import sys
import logging

from pysitemap import crawler

if __name__ == '__main__':
    # Optional Windows IOCP event loop support via the --iocp flag.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        events.set_event_loop(windows_events.ProactorEventLoop())
    crawler('https://www.quizlet.com', out_file='sitemap.xml')
# neflix.py
"""Crawl netflix.com and write a date-stamped URL list as plain text."""
import sys
import logging
from datetime import date

from pysitemap import crawler

if __name__ == '__main__':
    # Optional Windows IOCP event loop support via the --iocp flag.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        events.set_event_loop(windows_events.ProactorEventLoop())
    root_url = 'https://www.netflix.com/'
    stamp = date.today().strftime("%m.%d.%y")
    crawler(root_url, out_file=f'../outputs/netflix{stamp}.txt', out_format='txt')