Example No. 1
def sitemaper(self):
    import os
    from pysitemap import crawler

    # Make sure the root URL ends with a trailing slash.
    self.root_url = self.inputtargeturl if self.inputtargeturl.endswith('/') else self.inputtargeturl + '/'
    # self.root_url = 'https://unityfreepaidassets.com/'

    # Crawl the site and write the discovered URLs to sitemap.txt.
    crawler(self.root_url, out_file=os.path.join(self.OSGETCWD, 'sitemap.txt'), out_format='txt')

    # Read the generated sitemap back in and pass the URLs to read_ref.
    with open(os.path.join(self.OSGETCWD, 'sitemap.txt'), 'rb') as target:
        self.all_lines = [x.decode('utf-8') for x in target.readlines()]
    self.read_ref(self.all_lines, '''Downloading HTML files...''')
Example No. 2
import sys
import logging
from pysitemap import crawler


def sitemap_gen(link, link_key):
    # Use the Windows IOCP event loop when '--iocp' is passed on the command line.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    root_url = link
    crawler(root_url, out_file=f'sitemap_{link_key}.xml')
Example No. 3
import pandas as pd
from pysitemap import crawler


def generate_site_maps(end_date, section_list):
    # Exclude dated archive paths (1950-01-01 through end_date), pagination
    # links and ';var' query variants from the crawl.
    exclusion_list = [
        d.strftime('%Y/%m/%d') for d in pd.date_range('19500101', end_date)
    ]
    exclusion_list.extend(['?page', ';var'])

    for s in section_list:
        root_url = f"https://www.economist.com/{s}"
        print(f"Processing section : {s} | URL = {root_url}")
        crawler(root_url,
                out_file=f"./output/sitemap_{s}.xml",
                exclude_urls=exclusion_list)

    return 'Site maps generated and saved to the output folder'
Example No. 4
import sys
import logging
from pysitemap import crawler
root_url = 'https://pixinvent.com/materialize-material-design-admin-template/html/ltr/vertical-modern-menu-template/'
if __name__ == '__main__':
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    # root_url = sys.argv[1]
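    # Crawl root_url (defined at module level above) and save the result as plain text.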
    crawler(root_url, out_file='sitemap.txt', out_format='txt')


def createsitemap():
    # Same crawl as the __main__ block above, wrapped in a reusable function.
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    root_url = 'https://unityfreepaidassets.com/'
    crawler(root_url, out_file='sitemap.txt', out_format='txt')
Example No. 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import logging
from pysitemap import crawler

parser = argparse.ArgumentParser(description='Python SiteMap Crawler')
parser.add_argument('--domain',
                    default="https://www.lipsum.com",
                    help="Crawler Domain")
parser.add_argument('--task', type=int, default=10, help="Task count")

arg = parser.parse_args()

#logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')
#logging.info("Start the crawling process domain: %s", arg.domain)

if __name__ == '__main__':
    # root_url = sys.argv[1]
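    # Crawl the --domain target with the requested task count (maxtasks).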
    crawler(arg.domain, out_file='sitemap.xml', maxtasks=arg.task)
Example No. 6
import sys
import logging
from pysitemap import crawler

if __name__ == '__main__':
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    # root_url = sys.argv[1]
    root_url = 'https://www.franceinter.fr/emissions/very-good-trip/'
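    # Skip links to PDFs, images and ZIP archives via exclude_urls.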
    crawler(root_url, out_file='sitemap.xml', exclude_urls=[".pdf", ".jpg", ".zip"])
Example No. 7
import sys
import logging
import argparse
from pysitemap import crawler


def parse_args():
    # NOTE: the original snippet starts mid-definition; everything above the
    # 'default=5' argument is a reconstruction, and '--retry-wait' is an
    # assumed name for the truncated option.
    parser = argparse.ArgumentParser(description='Python SiteMap Crawler')
    parser.add_argument('--iocp', action='store_true',
                        help='''Use the Windows IOCP event loop.''')
    parser.add_argument('--load-data', action='store_true',
                        help='''Resume from previously saved crawl data.''')
    parser.add_argument(
        '--retry-wait',
        default=5,
        help=
        '''Duration (in seconds) between retries if a network request ends in failure.'''
    )
    parser.add_argument('--retry-amount',
                        default=5,
                        help='''Number of retry attempts before giving up.''')

    return parser.parse_args()


if __name__ == '__main__':
    ARGS = parse_args()

    if ARGS.iocp:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    load = ARGS.load_data

    # root_url = sys.argv[1]
    #root_url = 'https://www.hellofresh.com/recipes'
    root_url = 'https://www.allrecipes.com/'
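    # Kick off the crawl, passing the batching and prefix options along with the parsed load flag.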
    crawler(root_url,
            out_file='sitemap.xml',
            batch_size=10000,
            prefix='https://www.allrecipes.com/recipe/',
            load=load)
Example No. 8
import sys
import logging
from pysitemap import crawler

if __name__ == '__main__':
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    # root_url = sys.argv[1]
    root_url = 'https://www.quizlet.com'
    crawler(root_url, out_file='sitemap.xml')
Example No. 9
# netflix.py
import sys
from datetime import date
import logging
from pysitemap import crawler

if __name__ == '__main__':
    if '--iocp' in sys.argv:
        from asyncio import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    # root_url = sys.argv[1]
    root_url = 'https://www.netflix.com/'
    today = date.today()
    d = today.strftime("%m.%d.%y")
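    # Date-stamp the output file, e.g. ../outputs/netflix03.15.21.txt for 15 March 2021.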
    crawler(root_url, out_file=f'../outputs/netflix{d}.txt', out_format='txt')