def schedule_jobs(self, task):
    # Group queries by domain so requests to the same host stay together.
    domains = {}
    for query in task['queries']:
        domain = util.extract_domain(query['url'])
        if domain not in domains:
            domains[domain] = []
        domains[domain].append(query)

    result = []
    for domain, jobs in domains.iteritems():
        l = len(jobs)
        if l < 2:
            result.append(jobs)
        elif l < 4:
            # Two or three jobs: split roughly in half.
            part = int(l / 2)
            result.append(jobs[0:part])
            result.append(jobs[part:])
        else:
            # Four or more jobs: split into four chunks, the last one
            # absorbing any remainder.
            part = int(l / 4)
            for i in range(0, 4):
                if i < 3:
                    result.append(jobs[part * i:part * (i + 1)])
                else:
                    result.append(jobs[part * i:])
    # Currently each chunk (at most four per domain) is handed to a single worker.
    return result
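# A hedged illustration of how schedule_jobs partitions a task (scheduler below is a
# hypothetical instance of the class this method belongs to; the URLs are made up):
#
#   task = {'queries': [{'url': 'http://a.com/1'}, {'url': 'http://a.com/2'},
#                       {'url': 'http://a.com/3'}, {'url': 'http://b.org/1'}]}
#   scheduler.schedule_jobs(task)
#   # a.com has three queries (fewer than four), so they are split roughly in half:
#   #   [{'url': 'http://a.com/1'}], [{'url': 'http://a.com/2'}, {'url': 'http://a.com/3'}]
#   # b.org has a single query, so it stays in one chunk:
#   #   [{'url': 'http://b.org/1'}]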
def is_page_internal(self, url=None):
    if not url:
        url = self.url
    if self.base_domain not in extract_domain(url):
        return False
    return True
def phantom_js(query, base_headers, timeout):
    arguments = {
        "url": query['url'],
        "domain": util.extract_domain(query['url']),
        "headers": base_headers,
        "timeout": timeout,
    }
    used_cookies = None
    options = query.get('options', {})
    if options.get('login', False):
        resource_name = options['source']
        cookies = resource_manager.get_resource(resource_name)
        arguments['cookies'] = cookies['cookies']
        used_cookies = cookies
    try:
        infd, in_path = tempfile.mkstemp()
        # Close the output file descriptor right away; PhantomJS writes to it by path.
        outfd, out_path = tempfile.mkstemp()
        os.fdopen(outfd).close()
        in_params = os.fdopen(infd, 'wb')
        # Pass the arguments to render.js through the input temp file.
        try:
            in_params.write(json.dumps(arguments))
        finally:
            in_params.close()
        time_left = const.RENDER_TIMEOUT_LIMIT
        RENDER_PATH = os.path.join(const.BASE_DIR, 'agent', 'render.js')
        # PhantomJS switches must precede the script path; otherwise they are passed
        # to render.js as script arguments instead of being applied.
        proc = subprocess.Popen(['phantomjs',
                                 '--disk-cache=yes',
                                 '--max-disk-cache-size=100000',
                                 '--web-security=no',
                                 RENDER_PATH, in_path, out_path])
        while proc.poll() is None:
            time.sleep(const.UPDATE_INTERVAL)
            time_left -= const.UPDATE_INTERVAL
            if time_left < 0:
                proc.kill()
                raise RenderTimeout()
        if proc.returncode != 0:
            raise NoZeroError()
        ret = None
        # Read the rendered result back from the output temp file.
        with open(out_path) as f:
            ret = json.loads(f.read())
        if used_cookies:
            ret['resource_id'] = used_cookies['id']
        ret['state'] = 'ok'
        return ret
    finally:
        os.unlink(in_path)
        os.unlink(out_path)
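# A hedged usage sketch for phantom_js (the URL and header values are illustrative;
# const.RENDER_TIMEOUT_LIMIT, agent/render.js and resource_manager are assumed to
# exist in the surrounding project):
#
#   query = {'url': 'http://example.com/page', 'options': {}}
#   headers = {'User-Agent': 'Mozilla/5.0'}
#   result = phantom_js(query, headers, timeout=30)
#   # result is the JSON object written by render.js to the output temp file,
#   # augmented with state='ok' (and resource_id when login cookies were used).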
def __init__(self, start_url, sitemap_url=None):
    self.visited_urls = set()
    self.intermediate_urls = set()
    self.logger = logging.getLogger(__name__)
    self.base_domain = extract_domain(start_url)
    self.base_site = extract_base_site(start_url)
    self.non_visited_urls = {
        _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)}
    self.added_count = 1
    self.idle_ping = 0
    # self.coop = task.Cooperator()
    self.start_idle_counter = False
    self.sitemap_url = '{}/sitemap.xml'.format(self.base_site) if not sitemap_url else sitemap_url
def __init__(self, start_url, sitemap_url=None,
             max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
    self.visited_urls = set()
    self.intermediate_urls = set()
    self.base_domain = extract_domain(start_url)
    self.base_site = extract_base_site(start_url)
    self.base_page = _get_client_page(start_url, None, start_url, self.base_domain,
                                      DOMAINS_TO_BE_SKIPPED)
    self.non_visited_urls = {self.base_page}
    self.added_count = 1
    self.idle_ping = 0
    self.start_idle_counter = False
    self.sitemap_url = u'{}/sitemap.xml'.format(self.base_site) if not sitemap_url else sitemap_url
    self.max_concurrent_connections = max_concurrent_connections
    self.page_queue = JoinableQueue()
    self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
    self.start = time.time()
    self.skip_count = 0
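# A minimal, self-contained sketch of the concurrency pattern the attributes above
# suggest: pages drained from a JoinableQueue by workers, with a BoundedSemaphore
# capping how many requests to one server are in flight. This assumes gevent
# primitives (the original could use a different library); fetch() and the URLs
# are hypothetical stand-ins, not the crawler's real code.
import gevent
from gevent.queue import JoinableQueue
from gevent.lock import BoundedSemaphore

MAX_CONCURRENT_REQUESTS_PER_SERVER = 5

page_queue = JoinableQueue()
semaphore = BoundedSemaphore(MAX_CONCURRENT_REQUESTS_PER_SERVER)

def fetch(url):
    # Placeholder for the real download / parse step.
    gevent.sleep(0.1)
    return url

def worker():
    while True:
        url = page_queue.get()
        try:
            with semaphore:          # never more than N requests in flight
                fetch(url)
        finally:
            page_queue.task_done()   # lets page_queue.join() return once drained

for i in range(20):
    page_queue.put('http://example.com/{}'.format(i))

workers = [gevent.spawn(worker) for _ in range(MAX_CONCURRENT_REQUESTS_PER_SERVER)]
page_queue.join()        # block until every queued page has been processed
gevent.killall(workers)  # workers loop forever, so stop them explicitly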