def schedule_jobs(self, task):
    # Group the task's queries by the domain of their URL.
    domains = {}
    for query in task['queries']:
        domain = util.extract_domain(query['url'])
        domains.setdefault(domain, []).append(query)
    result = []
    for jobs in domains.values():
        n = len(jobs)
        if n < 2:
            result.append(jobs)
        elif n < 4:
            # Two roughly equal chunks.
            half = n // 2
            result.append(jobs[:half])
            result.append(jobs[half:])
        else:
            # Four chunks; the last one absorbs the remainder.
            part = n // 4
            for i in range(4):
                if i < 3:
                    result.append(jobs[part * i:part * (i + 1)])
                else:
                    result.append(jobs[part * i:])
    # Currently each chunk is handed to a single worker, so one domain
    # is processed by at most four workers.
    return result
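A small, hypothetical driver for schedule_jobs. The task shape follows the code above; `scheduler` and the URLs are stand-ins for illustration only:

# Illustrative only: five queries for one domain split into chunks of
# sizes 1, 1, 1 and 2 (the final chunk absorbs the remainder).
task = {'queries': [{'url': 'https://example.com/p{}'.format(i)}
                    for i in range(5)]}
chunks = scheduler.schedule_jobs(task)  # `scheduler` is a stand-in instance
print([len(c) for c in chunks])         # [1, 1, 1, 2]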
    def is_page_internal(self, url=None):
        # Default to the page's own URL when none is given.
        if not url:
            url = self.url

        # Internal means the crawler's base domain appears in the
        # domain extracted from the URL.
        return self.base_domain in extract_domain(url)
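A minimal sketch of how the containment check behaves, assuming extract_domain returns the host portion of the URL (the project's real helper may differ); note that plain substring matching also accepts hosts that merely embed the base domain:

from urllib.parse import urlparse

def extract_domain(url):  # stand-in for the project's helper
    return urlparse(url).netloc

base_domain = 'example.com'
print(base_domain in extract_domain('https://blog.example.com/post'))  # True
print(base_domain in extract_domain('https://other.org/'))             # False
print(base_domain in extract_domain('https://example.com.evil.net/'))  # True (!)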
def phantom_js(query, base_headers, timeout):
    arguments = {
        "url": query['url'],
        "domain": util.extract_domain(query['url']),
        "headers": base_headers,
        "timeout": timeout,
    }
    used_cookies = None
    options = query.get('options', {})
    if options.get('login', False):
        # Logged-in rendering: borrow a cookie set from the resource
        # manager and remember it so its id can be attached to the result.
        resource_name = options['source']
        cookies = resource_manager.get_resource(resource_name)
        arguments['cookies'] = cookies['cookies']
        used_cookies = cookies

    # Create both temp files before entering the try block so the
    # cleanup in `finally` always has valid paths to unlink.
    infd, in_path = tempfile.mkstemp()
    outfd, out_path = tempfile.mkstemp()
    try:
        # Close the output descriptor right away; PhantomJS writes the
        # rendered result to out_path itself.
        os.close(outfd)

        # Pass the arguments to the render script via the input file.
        in_params = os.fdopen(infd, 'w')
        try:
            in_params.write(json.dumps(arguments))
        finally:
            in_params.close()
        time_left = const.RENDER_TIMEOUT_LIMIT

        RENDER_PATH = os.path.join(const.BASE_DIR, 'agent', 'render.js')

        # PhantomJS expects its switches before the script path (anything
        # after the script is passed to the script as an argument), and
        # switch values attach with '='.
        proc = subprocess.Popen([
            'phantomjs',
            '--disk-cache=yes',
            '--max-disk-cache-size=100000',
            '--web-security=no',
            RENDER_PATH, in_path, out_path,
        ])

        # Poll until PhantomJS exits, enforcing an overall render budget.
        while proc.poll() is None:
            time.sleep(const.UPDATE_INTERVAL)
            time_left -= const.UPDATE_INTERVAL
            if time_left < 0:
                proc.kill()
                raise RenderTimeout()
        if proc.returncode != 0:
            raise NoZeroError()
        # Read the rendered result written by the script.
        with open(out_path) as f:
            ret = json.loads(f.read())

        if used_cookies:
            ret['resource_id'] = used_cookies['id']
        ret['state'] = 'ok'
        return ret
    finally:
        os.unlink(in_path)
        os.unlink(out_path)
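A hedged sketch of a possible call site for phantom_js; the query shape and header values follow the code above, but the concrete names are assumptions, not the project's actual usage:

# Hypothetical invocation -- values are illustrative.
query = {'url': 'https://example.com/page', 'options': {}}
base_headers = {'User-Agent': 'render-agent/1.0'}

try:
    result = phantom_js(query, base_headers, timeout=30)
    print(result['state'])      # 'ok' on success
except RenderTimeout:
    print('render exceeded const.RENDER_TIMEOUT_LIMIT; process killed')
except NoZeroError:
    print('phantomjs exited with a non-zero status')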
def __init__(self, start_url, sitemap_url=None):
    self.visited_urls = set()
    self.intermediate_urls = set()
    self.logger = logging.getLogger(__name__)
    self.base_domain = extract_domain(start_url)
    self.base_site = extract_base_site(start_url)
    self.non_visited_urls = {
        _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)}
    self.added_count = 1
    self.idle_ping = 0
    # self.coop = task.Cooperator()
    self.start_idle_counter = False
    self.sitemap_url = sitemap_url or '{}/sitemap.xml'.format(self.base_site)
    def __init__(self, start_url, sitemap_url=None, max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):

        self.visited_urls = set()
        self.intermediate_urls = set()
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)
        self.base_page = _get_client_page(start_url, None, start_url, self.base_domain, DOMAINS_TO_BE_SKIPPED)
        self.non_visited_urls = {self.base_page}
        self.added_count = 1
        self.idle_ping = 0
        self.start_idle_counter = False
        self.sitemap_url = sitemap_url or u'{}/sitemap.xml'.format(self.base_site)
        self.max_concurrent_connections = max_concurrent_connections

        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
        self.start = time.time()
        self.skip_count = 0
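The JoinableQueue and BoundedSemaphore point to a gevent-style crawl loop. A minimal sketch under that assumption; `Crawler`, `fetch`, and the page attributes are placeholders rather than the project's real API:

import gevent

def worker(crawler):  # hypothetical worker greenlet
    while True:
        page = crawler.page_queue.get()
        try:
            with crawler.semaphore:   # cap concurrent requests per server
                fetch(page.url)       # placeholder fetch/parse step
        finally:
            crawler.page_queue.task_done()

crawler = Crawler('https://example.com')  # hypothetical class name
for _ in range(crawler.max_concurrent_connections):
    gevent.spawn(worker, crawler)
crawler.page_queue.join()  # block until every queued page is processed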