def run(conf, env, type):

    global tasks
    pool = Threadpool(multiprocessing.cpu_count())

    while tasks.get(type):
        pool.add_task(partial(tasks[type].pop(), conf, env))

    pool.wait_completion()
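# The run()/validate()/initialize() snippets in this file all lean on a
# Threadpool exposing add_task() and wait_completion().  Below is a minimal,
# purely illustrative sketch of such a pool built on queue.Queue and
# threading; it is an assumption about the interface, not the project's
# actual implementation.
import threading
from queue import Queue


class MiniThreadpool(object):

    def __init__(self, num_threads):
        self.queue = Queue()
        for _ in range(num_threads):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self):
        while True:
            func, args, kwargs = self.queue.get()
            try:
                func(*args, **kwargs)
            finally:
                self.queue.task_done()

    def add_task(self, func, *args, **kwargs):
        # enqueue a callable; a daemon worker picks it up as soon as one is free
        self.queue.put((func, args, kwargs))

    def wait_completion(self):
        # block until every queued task has been processed
        self.queue.join()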
def validate(paths, jobs):
    """Validates a list of urls using up to N threads.

    :param paths: a list of HTML files where we search for a-href's
    :param jobs: number of threads used to send I/O requests"""

    ahref = re.compile(r'<a [^>]*href="([^"]+)"[^>]*>.*?</a>')
    visited, urls = set(), collections.defaultdict(list)

    def check(url, path):
        """A HEAD request to URL. If HEAD is not allowed, we try GET."""
        try:
            get(url, timeout=10)
        except HTTPError as e:
            if e.code == 405:
                try:
                    get(url, path, 'GET', True)
                except URLError as e:
                    print ' ' + yellow(e.reason), url
                    print white(' -- ' + path)
            else:
                print ' ' + red(e.code), url
                print white(' -- ' + path)
        except URLError as e:
            print ' ' + yellow(e.reason), url
            print white(' -- ' + path)

    # -- validation
    for path in paths:

        with io.open(path, 'r') as fp:
            data = fp.read()

        for match in ahref.finditer(data):
            a = match.group(1)
            if a.startswith(('http://', 'https://')):
                if a not in visited:
                    visited.add(a)
                    urls[path].append(a)

    print
    print "Trying", blue(len(visited)), "links..."
    print

    pool = Threadpool(jobs)
    for path in urls:
        for url in urls[path]:
            pool.add_task(check, *[unescape(url), path])

    try:
        pool.wait_completion()
    except KeyboardInterrupt:
        sys.exit(1)
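# check() above delegates to a project-internal get() helper whose exact
# signature is not shown here.  The following standard-library sketch only
# illustrates the HEAD-then-GET-on-405 fallback it performs; head_or_get()
# is a hypothetical name, not part of the project.
from urllib.error import HTTPError
from urllib.request import Request, urlopen


def head_or_get(url):
    try:
        urlopen(Request(url, method='HEAD'), timeout=10)
    except HTTPError as e:
        if e.code == 405:  # server refuses HEAD, retry with a full GET
            urlopen(Request(url, method='GET'), timeout=10)
        else:
            raise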
def run(conf, env, options):
    """Subcommand: ping -- notify external resources via Pingback etc."""

    commands.initialize(conf, env)
    entrylist = [entry for entry in readers.load(conf)[0] if not entry.draft]

    if options.file:
        try:
            entrylist = [filter(lambda e: e.filename == options.file, entrylist)[0]]
        except IndexError:
            raise AcrylamidException("no such post!")

    if options.service == 'twitter':
        if twitter is None:
            raise AcrylamidException("'twitter' egg not found")

        for entry in entrylist if options.all else entrylist[:options.max or 1]:
            tweet(entry, conf, options.dryrun)

        return

    # XXX we should search for actual hrefs not random grepping, but this
    # requires access to the cache at non-runtime which is unfortunately
    # not possible yet.
    patterns = [
        r'(?<=\n)\[.*?\]:\s?(https?://.+)$',        # referenced markdown
        r'\[[^\]]+\]\((https?://[^\)]+)\)',         # inline markdown
        r'(?<=\n)\.\.\s+[^:]+:\s+(https?://.+)$',   # referenced docutils
        r'`[^<]+ <(https?://[^>]+)>`_',             # inline docutils
    ]

    pool = Threadpool(options.jobs)
    ping = lambda src, dest: pingback(
        helpers.joinurl(conf['www_root'], src), dest, options.dryrun)

    for entry in entrylist if options.all else entrylist[:options.max or 1]:
        for href in sum([re.findall(pat, entry.source, re.M) for pat in patterns], []):
            pool.add_task(ping, *[entry.permalink, href])

    try:
        pool.wait_completion()
    except KeyboardInterrupt:
        sys.exit(1)
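# The pingback() helper used above is not shown.  Per the Pingback
# specification, a client discovers the target's XML-RPC endpoint (via the
# X-Pingback response header or a <link rel="pingback"> element) and then
# calls pingback.ping(sourceURI, targetURI).  The sketch below outlines that
# protocol with the standard library only; it is not the project's code and
# skips the <link> fallback.
from urllib.request import urlopen
from xmlrpc.client import Fault, ServerProxy


def pingback_sketch(source, target):
    endpoint = urlopen(target, timeout=10).headers.get('X-Pingback')
    if not endpoint:
        return
    try:
        ServerProxy(endpoint).pingback.ping(source, target)
    except Fault as e:
        print(e.faultString)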
def validate(paths, jobs):
    """Validates a list of urls using up to N threads.

    :param paths: a list of HTML files where we search for a-href's
    :param jobs: number of threads used to send I/O requests"""

    ahref = re.compile(r'<a [^>]*href="([^"]+)"[^>]*>.*?</a>')
    visited, urls = set(), collections.defaultdict(list)

    def check(url, path):
        """A HEAD request to URL. If HEAD is not allowed, we try GET."""
        try:
            get(url, timeout=10)
        except HTTPError as e:
            if e.code == 405:
                try:
                    get(url, path, 'GET', True)
                except URLError as e:
                    print(' ' + yellow(e.reason), url)
                    print(white(' -- ' + path))
            else:
                print(' ' + red(e.code), url)
                print(white(' -- ' + path))
        except URLError as e:
            print(' ' + yellow(e.reason), url)
            print(white(' -- ' + path))

    # -- validation
    for path in paths:

        with io.open(path, 'r', encoding='utf-8') as fp:
            data = fp.read()

        for match in ahref.finditer(data):
            a = match.group(1)
            if a.startswith(('http://', 'https://')):
                if a not in visited:
                    visited.add(a)
                    urls[path].append(a)

    print()
    print("Trying", blue(len(visited)), "links...")
    print()

    pool = Threadpool(jobs)
    for path in urls:
        for url in urls[path]:
            pool.add_task(check, *[unescape(url), path])

    try:
        pool.wait_completion()
    except KeyboardInterrupt:
        sys.exit(1)
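# Hypothetical driver for validate(): the output directory and thread count
# below are assumptions, not defaults of the project.
import glob

html_files = glob.glob('output/**/*.html', recursive=True)
validate(html_files, jobs=10)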
def run(conf, env, options):
    """Subcommand: ping -- notify external resources via Pingback etc."""

    commands.initialize(conf, env)
    entrylist = [entry for entry in readers.load(conf)[0] if not entry.draft]

    if options.file:
        try:
            entrylist = [filter(lambda e: e.filename == options.file, entrylist)[0]]
        except IndexError:
            raise AcrylamidException("no such post!")

    if options.service == 'twitter':
        if twitter is None:
            raise AcrylamidException("'twitter' egg not found")

        for entry in entrylist if options.all else entrylist[:options.max or 1]:
            tweet(entry, conf, options.dryrun)

        return

    # XXX we should search for actual hrefs not random grepping, but this
    # requires access to the cache at non-runtime which is unfortunately
    # not possible yet.
    patterns = [
        r'(?<=\n)\[.*?\]:\s?(https?://.+)$',        # referenced markdown
        r'\[[^\]]+\]\((https?://[^\)]+)\)',         # inline markdown
        r'(?<=\n)\.\.\s+[^:]+:\s+(https?://.+)$',   # referenced docutils
        r'`[^<]+ <(https?://[^>]+)>`_',             # inline docutils
    ]

    pool = Threadpool(options.jobs)
    ping = lambda src, dest: pingback(
        helpers.joinurl(conf['www_root'], src), dest, options.dryrun)

    for entry in entrylist if options.all else entrylist[:options.max or 1]:
        for href in sum([re.findall(pat, entry.source, re.M) for pat in patterns], []):
            pool.add_task(ping, *[entry.permalink, href])

    try:
        pool.wait_completion()
    except KeyboardInterrupt:
        sys.exit(1)
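# Quick illustration of what the inline-markdown pattern above extracts from
# an entry body; the body text and URL are made up.
import re

body = "Read [the announcement](https://example.org/post) for details.\n"
print(re.findall(r'\[[^\]]+\]\((https?://[^\)]+)\)', body, re.M))
# -> ['https://example.org/post']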
def initialize(conf, env):

    global pool

    hooks, blocks = conf.get('hooks', {}), not conf.get('hooks_mt', True)
    pool = Threadpool(1 if blocks else multiprocessing.cpu_count(), wait=blocks)

    force = env.options.force
    normalize = lambda path: path.replace(conf['output_dir'], '')

    for pattern, action in iteritems(hooks):

        if isinstance(action, (types.FunctionType, string_types)):
            event.register(
                callback=partial(simple, pool, pattern, normalize, action),
                to=['create', 'update'] if not force else event.events)
        else:
            event.register(
                callback=partial(advanced, pool, pattern, force, normalize, *action),
                to=event.events)

    discover([conf.get('HOOKS_DIR', 'hooks/')], lambda x: x)
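# Illustrative shape of conf['hooks'] as consumed by initialize() above: a
# callable or command string is dispatched to the `simple` handler, anything
# else is unpacked as extra arguments for `advanced`.  The concrete patterns,
# commands and argument shapes are assumptions, not documented configuration.
hooks = {
    r'.+\.html$': 'tidy -quiet -modify',        # string   -> simple
    r'.+\.css$': lambda path: None,             # callable -> simple
    # r'.+\.jpg$': ('mogrify -resize 640x>',),  # other iterable -> advanced
}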