Example No. 1
def test_readline_size_zero():
    pr, pw = pipe()

    def writer():
        try:
            print "writing lines..."
            while True:
                pw.write("hello\n")
        except IOError:
            pass
        print "writer bye bye"

    def reader():
        print "reading line of size 0..."
        assert pr.readline(0) == ""
        print "reading line of size 1..."
        assert pr.readline(1) == "h"
        print "reading line normal..."
        assert pr.readline() == "ello\n"
        print "reader close"
        pr.close()
        print "reader bye bye"

    p = Pool()
    p.spawn(reader)
    p.spawn(writer)
    p.join(raise_error=True)
Example No. 2
    def test_raw_events_queue_consumer_waits_streams(self, _get_all_streams_exhausted, _get_min_timestamp, sleep):
        _get_min_timestamp.side_effect = [5, 5, 6, 7, 8, 9, 10]
        _get_all_streams_exhausted.side_effect = [
            False,
            False,
            False,
            False,
            False,
            True,
            True
        ]
        self.aws.stream_status = {('A', 'B'): self.aws.ACTIVE,
                                  ('A', 'C'): self.aws.EXHAUSTED}
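        # Events are put on the raw queue out of order; the consumer under test
        # is expected to emit them sorted by timestamp and to finish with
        # NO_MORE_EVENTS once all streams are exhausted.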
        self.aws.raw_events_queue.put((8, {'message': 'Hello 8'}))
        self.aws.raw_events_queue.put((7, {'message': 'Hello 7'}))
        self.aws.raw_events_queue.put((9, {'message': 'Hello 9'}))
        self.aws.raw_events_queue.put((6, {'message': 'Hello 6'}))

        pool = Pool(size=1)
        pool.spawn(self.aws._raw_events_queue_consumer)
        pool.join()
        self.assertEqual(self.aws.events_queue.get(), 'Hello 6\n')
        self.assertEqual(self.aws.events_queue.get(), 'Hello 7\n')
        self.assertEqual(self.aws.events_queue.get(), 'Hello 8\n')
        self.assertEqual(self.aws.events_queue.get(), 'Hello 9\n')
        self.assertEqual(self.aws.events_queue.get(), NO_MORE_EVENTS)
        self.assertTrue(self.aws.events_queue.empty())

        self.assertEqual(sleep.call_args_list, [call(0.3), call(0.3)])
Example No. 3
class GServer(ProtoBufRPCServer):
    def __init__(self, host, port, service, poolsize=128):
        self.gpool = Pool(poolsize)
        self.stop_event = Event()
        context = zmq.Context()
        self.port = port
        self.socket = context.socket(zmq.ROUTER)
        self.socket.bind("tcp://%s:%s" % (host, port))
        self.service = service

    def serve_forever(self,):
        while not self.stop_event.is_set():
            try:
                msg = self.socket.recv_multipart()
            except zmq.ZMQError:
                if self.socket.closed:
                    break
                raise
            self.gpool.spawn(self.handle_request, msg)

    def shutdown(self,):
        self.socket.close()
        self.stop_event.set()

    def handle_request(self, msg):
        assert len(msg) == 3
        (id_, null, request) = msg
        assert null == ''
        response = self.handle(request)
        self.socket.send_multipart([id_, null, response.SerializeToString()])
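A minimal startup sketch for this server (my_service is a hypothetical protobuf service instance, and gevent is assumed to be imported):

server = GServer("127.0.0.1", 5555, my_service, poolsize=64)
loop = gevent.spawn(server.serve_forever)
# ... later: close the socket so recv_multipart() unblocks and the loop exits
server.shutdown()
loop.join()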
Example No. 4
def test_big_data():
    pr, pw = pipe()

    big = "x" * 1024 * 962
    big += "END"
    print "big data size", len(big)

    def writer():
        print "writing..."
        pw.write(big)
        print "writer bye bye"

    def reader():
        print "reading..."
        data = ""
        for x in xrange(len(big) / 4096):
            data += pr.read(4096)
        data += pr.read(len(big) % 4096)
        assert len(data) == len(big)
        assert data[-3:] == "END"
        print "reader bye bye"

    p = Pool()
    p.spawn(reader)
    p.spawn(writer)
    p.join(raise_error=True)
Example No. 5
 def test_raw_events_queue_consumer_exit_if_exhausted(self):
     self.aws.stream_status = {('A', 'B'): self.aws.EXHAUSTED}
     pool = Pool(size=1)
     pool.spawn(self.aws._raw_events_queue_consumer)
     pool.join()
     self.assertEqual(self.aws.events_queue.get(), NO_MORE_EVENTS)
     self.assertTrue(self.aws.events_queue.empty())
Example No. 6
def main():
    options, args = parseCommandLine()
    
    bucket_name = 'stamped.com.static.images'
    conn = S3Connection(keys.aws.AWS_ACCESS_KEY_ID, keys.aws.AWS_SECRET_KEY)
    rs = conn.get_all_buckets()
    rs = filter(lambda b: b.name == bucket_name, rs)
    
    if 1 != len(rs):
        utils.log("error finding bucket to warm cache with")
        return
    
    pool   = Pool(64)
    count  = 0
    
    bucket = rs[0]
    result = list(bucket.list(prefix='search/v2/'))
    utils.log("warming %d keys" % len(reslt))
    
    for key in result:
        pool.spawn(_warm, key, options)
        
        count += 1
        if 0 == (count % 100):
            utils.log("warmed %d keys" % count)
    
    pool.join()
Example No. 7
    def start_subscribers(self):
        processing_queue = gevent.queue.Queue(None)
        subscribers_pool = GPool(self.num_of_processes)
        subscriber_jobs = [
            subscribers_pool.spawn(self.start_listening_subscriber,
                                   self.sockets[x], processing_queue)
            for x in xrange(self.num_of_processes)
        ]

        processing_pool = GPool(5000)
        processing_jobs = [
            processing_pool.spawn(self.send_msg_to_user_socket, processing_queue)
            for x in xrange(5000)
        ]
Example No. 8
def main():
    """spawn"""

    val = rclient.get('f1')
    print(val)

    pool = Pool(20)

    start('f1')

    # loop forever
    while True:
        # print(time.time())
        pool.spawn(func1)
        # print pool.wait_available()
        print(pool.free_count())

        # sleep
        gevent.sleep(2)
Example No. 9
def concu():
    p = Pool(10)  # limit concurrency to 10

    for url in urls:
        p.spawn(down, url)

    p.join()
Example No. 10
def gevent_case():
    pool = Pool(8)
    pool.map(ping, range(8))

    # print(result)
    gevent_list = [gevent.spawn(ping, str(i)) for i in range(10)]
    gevent.joinall(gevent_list)
Example No. 11
def watch_pr():
    state_key = 'github_pr.last_updated'
    date_format = '%Y-%m-%dT%H:%M:%SZ'

    st = State()
    last_updated = st.get(state_key)
    st.set(state_key, datetime.utcnow().strftime(date_format))

    if last_updated is None:
        # first run
        return
    else:
        last_updated = datetime.strptime(last_updated, date_format)

    watched = defaultdict(list)
    for chat_id in get_module_chat_ids(module_name):
        wl = get_chat_conf(chat_id, module_name, default_options)['watch_list']
        for repo in wl:
            watched[tuple(repo)].append(chat_id)

    if not watched:
        return

    pool = Pool(10)
    res = pool.imap_unordered(partial(get_new_pr, last_updated), watched)

    for prs in res:
        for pr in prs:
            repo = get_repo(pr['base']['repo']['full_name'])
            pr['chat_ids'] = watched[repo]
            yield pr
Example No. 12
def crawl_listing(addrs):
    pool = Pool(30*len(addrs))
    clients = [zerorpc.Client(addr) for addr in addrs]
    for item in col_cats.find({'leaf': 1, 'num': {'$exists': True}}, fields=['url', 'catstr']):
        pool.spawn(random.choice(clients).crawl_listing, item['url'], item['catstr'])
        progress() 
    pool.join()
Example No. 13
def process(s):
    global urls
    urls = {}
    # Hackity hack.
    s = s.split('class="attach')[0].split('<')
    s.pop()
    s = '<'.join(s)
    # Cut out bad tags.
    for t in SKIP_TAGS:
        s = re.sub(FLAGS + '\s*<(?P<tag>' + t + ').*?</(?P=tag)>\s*', '', s)
    # Apply simple rules.
    for (k, r) in SIMPLE_RULES:
        s = re.sub(FLAGS + k, r, s)
    # Close tags that should be closed, leave already closed as-is
    for t in CLOSED_TAGS:
        s = re.sub(FLAGS + r'<({0}[^>]*?)/?>'.format(t), r'<\1/>', s)
        # Maybe this is overkill, but why not.
        s = s.replace('</{0}>'.format(t), '')
    # Apply complex rules.
    (s, n) = ntag_re.subn(proctag, s)
    m, n = n, 1
    while n > 0:
        (s, n) = ptag_re.subn(proctag, s)
        m += n
    # Strip out any HTML leftovers.
    s = re.sub('<[^>]+>','',s)
    if m > 0:
        print('Replaced {0} tags'.format(m))
    
    if not args.no_rehost and len(urls) > 0:
        def print_urls(a, b):
            if a != b:
                print('{0} >> {1}'.format(a, b))
        print('Processing {0} URLs...'.format(len(urls)))
        # Rehost images.
        if gevent:
            pool = Pool(POOL_SIZE)
            def fin(h, url):
                def f(g):
                    urls[h] = g.value
                    print_urls(url, g.value)
                return f
            for h, url in urls.iteritems():
                j = pool.spawn(rehost, url, image=True, referer=target_root)
                j.link_value(fin(h, url))
            pool.join()
        else:
            for h, url in urls.iteritems():
                new_url = rehost(url, image=True, referer=target_root)
                urls[h] = new_url
                print_urls(url, new_url)
    # Bring URLs back in places.
    imgs = 0
    for p, url in urls.iteritems():
        if hashurl(url) != p:
            imgs += 1
        s = s.replace(p, urls[p])
    if imgs > 0:
        print('Found and replaced {0} images'.format(imgs))
    return decode_html_entities(s).strip()
Example No. 14
def test_proxies(proxies, timeout=10, single_url=None, many_urls=None, call_back=None):
    """
    Test proxies, or process html source using callback in the meantime.

    :type proxies: list
    :param proxies:  proxies
    :param timeout: response timeout
    :param single_url: The URL for testing
    :param many_urls: The list of URLs for testing. Pick one of them when perform request.
    :param call_back: Process the html source if status code is 200. callback(url, source)
    :return:
    """

    proxies = set(proxies)
    errors = set()
    pool = Pool(100)

    def test(proxy):
        code = None
        url = random.choice(many_urls) if many_urls is not None else single_url

        start_time = time.time()
        try:
            with gevent.Timeout(seconds=timeout, exception=Exception('[Connection Timeout]')):
                _headers['User-Agent'] = random.choice(_user_agents)

                res = requests.get(url,
                                   proxies={'http': 'http://{}'.format(proxy.strip()),
                                            'https': 'https://{}'.format(proxy.strip())},
                                   headers=_headers
                                   )
                code = res.status_code
                source = res.text

            _log('[Proxy: {:d} {:s}]'.format(code, proxy))

            # callback
            if source is not None and call_back is not None and code == 200:
                call_back(url, source)

            if code != 200:
                errors.add(proxy)

        except Exception as e:
            # log(e.args)
            errors.add(proxy)

        end_time = time.time()
        escaped = end_time - start_time if code else None

        store_in_db(proxy, escaped=escaped, status_code=code)  # store in db

    for proxy in proxies:
        pool.spawn(test, proxy)
    pool.join()

    proxies = proxies - errors
    _log('[HTTP Proxies] Available:{:d} Deprecated:{:d}'.format(len(proxies), len(errors)))

    return list(proxies)
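A minimal invocation sketch (the proxy addresses, target URL and callback below are hypothetical):

def save_page(url, source):
    # hypothetical callback: just record the page size
    print(url, len(source))

alive = test_proxies(['1.2.3.4:8080', '5.6.7.8:3128'],
                     timeout=5,
                     single_url='http://example.com/',
                     call_back=save_page)
print('usable proxies:', alive)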
Example No. 15
class Task:
    def __init__(self, queue, pool_max=100):
        self.work = None
        self.pool_max = pool_max
        self.pool = Pool(pool_max)
        self.queue = queue

    def initTaskWork(self, func):
        self.work = func

    def start(self):
        while True:
            if not self.queue.empty():
                t = self.queue.pop()
                self.pool.spawn(self.work, *t)
            elif self.pool.free_count() == self.pool.size or self.queue.isLock:
                # print 'queue is empty'
                # print self.pool.free_count(), self.pool.size
                break
            else:
                # print 'queue is empty but...'
                sleep(0)

    def stop(self):
        # only allow items to be pushed onto the queue, not popped
        self.queue.lock(True)
        for item in self.pool:
            self.queue.push(list(item.args))
            # print item
            # self.pool.killone(item)

        # self.pool.kill()
        # print 'starting the save triggered by stop'
        self.queue.save()
        self.queue.clear()
Example No. 16
def parallel_map(func, iterable, args=None, kwargs=None, workers=None):
    """Map func on a list using gevent greenlets.

    :param func: function applied on iterable elements
    :type func: function
    :param iterable: elements to map the function over
    :type iterable: iterable
    :param args: arguments of func
    :type args: tuple
    :param kwargs: keyword arguments of func
    :type kwargs: dict
    :param workers: limit the number of greenlets
                    running in parallel
    :type workers: int
    """
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}
    if workers is not None:
        pool = Pool(workers)
    else:
        pool = Group()
    iterable = [pool.spawn(func, i, *args, **kwargs) for i in iterable]
    pool.join(raise_error=True)
    for idx, i in enumerate(iterable):
        i_type = type(i.get())
        i_value = i.get()
        if issubclass(i_type, BaseException):
            raise i_value
        iterable[idx] = i_value
    return iterable
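A short usage sketch (fetch_status is a hypothetical worker function):

def fetch_status(url, timeout):
    # pretend to probe the URL; a per-call exception would be re-raised by parallel_map
    return (url, timeout)

statuses = parallel_map(fetch_status,
                        ['http://a.example', 'http://b.example'],
                        args=(5,), workers=10)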
Example No. 17
def postcommit_after_request(response, base_status_error_code=500):
    if response.status_code >= base_status_error_code:
        _local.postcommit_queue = OrderedDict()
        _local.postcommit_celery_queue = OrderedDict()
        return response
    try:
        if postcommit_queue():
            number_of_threads = 30  # one db connection per greenlet, let's share
            pool = Pool(number_of_threads)
            for func in postcommit_queue().values():
                pool.spawn(func)
            pool.join(timeout=5.0, raise_error=True)  # 5 second timeout and reraise exceptions

        if postcommit_celery_queue():
            if settings.USE_CELERY:
                for task_dict in postcommit_celery_queue().values():
                    task = Signature.from_dict(task_dict)
                    task.apply_async()
            else:
                for task in postcommit_celery_queue().values():
                    task()

    except AttributeError as ex:
        if not settings.DEBUG_MODE:
            logger.error('Post commit task queue not initialized: {}'.format(ex))
    return response
Example No. 18
    def test_wait(self): 
        p = Pool()
        w = p.spawn(self.run_wait)
        f = p.spawn(self.fire_event)
        p.join()

        assert w.value, 'Event was not fired while it was being waited on.'
Example No. 19
    def test_cancel(self):
        p = Pool()
        w = p.spawn(self.run_cancel)
        f = p.spawn(self.fire_event)
        p.join()

        assert w.value, 'Event fired while it was canceled.'
Example No. 20
    def validate_character(self, server_id, character_name, linkshell_names):

        results = []

        pool = Pool()
        # Finds all linkshell URLs

        def find_linkshell_url(linkshell):
            return linkshell, self.find_linkshell_url(server_id, linkshell)

        for linkshell, linkshell_url in itertools.imap(find_linkshell_url, linkshell_names):
            if linkshell_url:
                results.append(dict(ls_name=linkshell,
                                    ls_url=linkshell_url,
                                    char_url=None))
            else:
                results.append(dict(ls_name=linkshell,
                                    ls_url=None,
                                    char_url=None))

        def find_character_url(linkshell):
            # Finds all characters URLs
            linkshell['char_url'] = self.find_character_url(character_name, linkshell['ls_url'])

        for linkshell in results:
            if linkshell.get('ls_url'):
                pool.spawn(find_character_url, linkshell)

        pool.join()

        return results
Example No. 21
def getgome(cat):
    for i in range(3):
        try:
            url = ''.join(('http://www.gome.com.cn/p/json?module=async_search&paramJson={%22pageNumber%22%3A', '1', '%2C%22envReq%22%3A{%22catId%22%3A%22', str(
                cat), '%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22%2C%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}'))
            r = requests.get(url)
            totalpage = int(r.json()['num']['totalPage'])
            urls = [''.join(('http://www.gome.com.cn/p/json?module=async_search&paramJson={%22pageNumber%22%3A', str(i), '%2C%22envReq%22%3A{%22catId%22%3A%22', str(
                cat), '%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22%2C%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}')) for i in xrange(1, totalpage + 1)]

            def ff(url):
                while 1:
                    try:
                        r = requests.get(url, timeout=3)
                        return '\n'.join([i['pId'] for i in r.json()['products']])
                    except:
                        continue
            pp = Pool(30)
            ss = pp.map(ff, urls)
            global jishu
            jishu += 1
            sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r')
            return '\n'.join(ss) + '\n'
        except:
            continue
Example No. 22
def main():
    num_worker_threads = UPTO
    pool = Pool(num_worker_threads)
    for n in xrange(1, UPTO):
        pool.apply_async(process, args=(n,))
    pool.join()
    print cnt
Example No. 23
class Zerg:

    def __init__(self, hosts, username, key, max_threads=2):
        self.hosts = hosts
        self.username = username
        self.key = paramiko.RSAKey.from_private_key_file(key)
        self.pool = Pool(max_threads)
        self.connections = []

    def _connect(self, host):
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.WarningPolicy())
        client.connect(host, username=self.username, pkey=self.key)
        self.connections.append(client)

    def connect(self):
        self.pool.map(self._connect, self.hosts)

    def _command(self, cmd, conn):
        stdin, stdout, stderr = conn.exec_command(cmd)
        rc = stdout.channel.recv_exit_status()
        lines = stdout.read().splitlines()
        return rc, lines

    def command(self, cmd):
        out = self.pool.map(lambda c: self._command(cmd, c), self.connections)
        return out
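A hypothetical usage sketch (hosts, user and key path are made up):

zerg = Zerg(['10.0.0.1', '10.0.0.2'], 'deploy', '/home/deploy/.ssh/id_rsa', max_threads=4)
zerg.connect()
for rc, lines in zerg.command('uptime'):
    print(rc, lines)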
Example No. 24
 def tracks(self):
     tracks = {}
     
     def lookupTrack(key):
         result = self.spotify.lookup(key, 'trackdetail', priority='low', timeout=MERGE_TIMEOUT)
         track_list = result['album']['tracks']
         
         for track in track_list:
             track_key = track['href']
             
             if track_key not in tracks:
                 data = {
                     'key': track_key,
                     'name': track['name'],
                 }
                 
                 try:
                     # (travis): as of 4/3/12, track length is only sometimes returned by spotify
                     data['length'] = int(track['length'])
                 except KeyError:
                     pass
                 
                 tracks[track_key] = data
     
     size = min(1 + len(self.albums), 20)
     pool = Pool(size)
     
     for album in self.albums:
         key = album['key']
         pool.spawn(lookupTrack, key)
     
     pool.join()
     return list(tracks.values())
Example No. 25
def sc_process(pid, p_start, p_end):
    # init 
    file_200 = open(res_folder + list_200_pre + str(pid) + file_ext, 'w')
    file_302 = open(res_folder + list_302_pre + str(pid) + file_ext, 'w')
    file_max = open(res_folder + list_max_pre + str(pid) + file_ext, 'w')
    file_others = open(res_folder + list_others_pre + str(pid) + file_ext, 'w')
    file_log = open(res_folder + list_log_pre + str(pid) + file_ext, 'w')
    file_list = [file_200, file_302, file_max, file_others, file_log]

    s = requests.Session()
    pool = Pool(pool_size) 
    for i in xrange(p_start, p_end):
        if i % step == 0:
            file_log.write('%i/%i\n' % (i, p_end))
            file_log.flush()
        shareid = i
        url = url_tpl % shareid
        pool.spawn(sc_worker, pid, s, shareid, url, file_list)
    pool.join()
    
    # finalize
    file_200.close()
    file_302.close()
    file_max.close()
    file_others.close()
    file_log.close()
Example No. 26
    def handle(self, *args, **options):
        if len(args) < 1 or not(args[0] in self.ZEITRAUM.keys()):
            self.stdout.write("Usage: manage.py fetch {24H|48H|7D|1M|3M|6M|1Y}")
            sys.exit(0)

        gevent.monkey.patch_socket()

        params = {}
        self.s = requests.Session()
        self.tz = timezone.get_current_timezone()
        response, soup = self.post(params)
        stations = soup.find(id=self.STATION_ID).find_all('option')
        for station in stations:
            self.STATIONEN[station.string] = station['value']
        params[self.TARGET_KEY] = self.STATION_KEY

        stationPool = Pool(len(self.STATIONEN))
        self.inv_stations = self.invert_dict(self.STATIONEN)
        self.inv_schadstoff = self.invert_dict(self.SCHADSTOFFE)
        # self.inv_schadstoff['109;2'] = 'PM2.5' # csv uses PM2.5

        for station in self.STATIONEN.keys():
            tmp = params.copy()
            tmp[self.STATION_KEY] = self.STATIONEN[station]
            stationPool.spawn(self.fetchStation, tmp, args[0])

        stationPool.join()
Example No. 27
 def gen():
     try:
         pool = Pool(len(source_functions))
         sources = []
         
         def _helper(source_function):
             source = source_function()
             if source is not None:
                 sources.append(source)
         
         for source_function in source_functions:
             pool.spawn(_helper, source_function)
         
         pool.join(timeout=initial_timeout)
         
         offset = 0
         found  = True
         
         while found:
             found = False
             
             for source in sources:
                 cur = source(offset, 1)
                 
                 for item in cur:
                     found = True
                     yield item
             
             offset += 1
     except GeneratorExit:
         pass
Example No. 28
def api_jobs():
    '''
        The Main JSON view which gives a list of all tasks in all projects.
        This route uses a gevent pool go get all the project tasks in parallel
        (31 at a time), which makes life a lot quicker.  (Down to 7 or 8 seconds
        load time for us.)
    '''
    try:
        asana = SimpleAsana(app.config['API_KEY'])
        my_project_tasks = lambda p: get_project_tasks(app.config['API_KEY'], p)

        projects = asana.workspace_projects(
                                        app.config['WORKSPACE'],
                                        cachetime=4000,
                                        as_type='dict',
                                        opt_fields='name,team,archived,notes')
        all_tasks = []

        pool = Pool(31)

        lists = pool.map(my_project_tasks,
                         [p for p in projects if not p['archived']])


        for project_tasks in lists:
            all_tasks += project_tasks

        return jsonify({"tasks": all_tasks})

    except Exception as e: # pylint: disable=broad-except
        return str(e)
Example No. 29
def test_close_writer():
    pr, pw = pipe()

    big = "x" * 1024 * 50
    print "big data size", len(big)

    def writer():
        print "writing, first round..."
        pw.write(big)
        print "writing, second round..."
        pw.write(big)
        print "writing, end tag..."
        pw.write("END")
        print "writter close"
        pw.close()
        print "writer bye bye"

    def reader():
        print "reading all..."
        data = pr.read()
        assert len(data) == len(big) * 2 + 3
        assert data[-3:] == "END"
        print "reader bye bye"

    p = Pool()
    p.spawn(reader)
    p.spawn(writer)
    p.join(raise_error=True)
Example No. 30
class GEvent2Worker(Worker):
    
    base_env = {
        'GATEWAY_INTERFACE': 'CGI/1.1',
        'SERVER_SOFTWARE': 'gevent/%s gunicorn/%s' % (gevent.__version__,
                                                    gunicorn.__version__),
        'SCRIPT_NAME': '',
        'wsgi.version': (1, 0),
        'wsgi.url_scheme': 'http',
        'wsgi.multithread': False,
        'wsgi.multiprocess': True,
        'wsgi.run_once': False
    }
    
    def __init__(self, *args, **kwargs):
        super(GEvent2Worker, self).__init__(*args, **kwargs)
        self.worker_connections = self.cfg.worker_connections
        self.pool = None
    
    @classmethod
    def setup(cls):
        from gevent import monkey
        monkey.patch_all(dns=False)
   
    def handle_request(self, req):
        self.pool.spawn(self.handle, req)
       
    def handle(self, req):
        handle = WSGIHandler(req)
        handle.handle(self)
        
    def run(self):
        self.socket.setblocking(1)
        env = self.base_env.copy()
        
        env.update({
            'SERVER_NAME': self.address[0],
            'SERVER_PORT': str(self.address[1]) 
        })
        self.base_env = env
        
        http = core.http()
        http.set_gencb(self.handle_request)
        self.pool = Pool(self.worker_connections)
        
        self.application = self.wsgi
        acceptor = gevent.spawn(http.accept, self.socket.fileno())
        
        try:
            while self.alive:
                self.notify()
            
                if self.ppid != os.getppid():
                    self.log.info("Parent changed, shutting down: %s" % self)
                    gevent.kill(acceptor)
                    break
                gevent.sleep(0.1)            
            self.pool.join(timeout=self.timeout)
        except KeyboardInterrupt:
            pass
Example No. 31
    def init(self, inventory, config, initial_limit=None):
        # Config validation
        #

        # If no config, create one using the defaults
        if config is None:
            config = Config()

        # Error if our min version is not met
        if config.MIN_PYINFRA_VERSION is not None:
            # TODO: remove this
            if config.REQUIRE_PYINFRA_VERSION is None:
                config.REQUIRE_PYINFRA_VERSION = '>={0}'.format(
                    config.MIN_PYINFRA_VERSION)
                logger.warning(
                    '`MIN_PYINFRA_VERSION` is deprecated, please use `REQUIRE_PYINFRA_VERSION`.',
                )
            else:
                logger.warning(
                    'Ignoring legacy `MIN_PYINFRA_VERSION` because '
                    '`REQUIRE_PYINFRA_VERSION` also exists.', )

        if config.REQUIRE_PYINFRA_VERSION is not None:
            running_version = parse_version(__version__)
            required_versions = Requirement.parse(
                'pyinfra{0}'.format(config.REQUIRE_PYINFRA_VERSION), )

            if running_version not in required_versions:
                raise PyinfraError(('pyinfra version requirement not met '
                                    '(requires {0}, running {1})').format(
                                        config.REQUIRE_PYINFRA_VERSION,
                                        __version__,
                                    ))

        if config.REQUIRE_PACKAGES is not None:
            if isinstance(config.REQUIRE_PACKAGES, (list, tuple)):
                requirements = config.REQUIRE_PACKAGES
            else:
                with open(path.join(self.deploy_dir,
                                    config.REQUIRE_PACKAGES)) as f:
                    requirements = [
                        line.split('#egg=')[-1]
                        for line in f.read().splitlines()
                    ]

            try:
                require(requirements)
            except ResolutionError as e:
                raise PyinfraError(
                    'Deploy requirements ({0}) not met: {1}'.format(
                        config.REQUIRE_PACKAGES,
                        e,
                    ))

        if not config.PARALLEL:
            # TODO: benchmark this
            # In my own tests the optimum number of parallel SSH processes is
            # ~20 per CPU core - no science here yet, needs benchmarking!
            cpus = cpu_count()
            ideal_parallel = cpus * 20

            config.PARALLEL = (min(ideal_parallel, len(inventory),
                                   MAX_PARALLEL) if MAX_PARALLEL is not None
                               else min(ideal_parallel, len(inventory)))

        # If explicitly set, just issue a warning
        elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
            logger.warning((
                'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
                '    Max recommended value: {2}').format(
                    config.PARALLEL, nofile_limit, MAX_PARALLEL))

        # Actually initialise the state object
        #

        self.callback_handlers = []

        # Setup greenlet pools
        self.pool = Pool(config.PARALLEL)
        self.fact_pool = Pool(config.PARALLEL)

        # Connection storage
        self.ssh_connections = {}
        self.sftp_connections = {}

        # Private keys
        self.private_keys = {}

        # Assign inventory/config
        self.inventory = inventory
        self.config = config

        # Hosts we've activated at any time
        self.activated_hosts = set()
        # Active hosts that *haven't* failed yet
        self.active_hosts = set()
        # Hosts that have failed
        self.failed_hosts = set()

        # Limit hosts changes dynamically to limit operations to a subset of hosts
        self.limit_hosts = initial_limit

        # Op basics
        self.op_line_numbers_to_hash = {}
        self.op_meta = {}  # maps operation hash -> names/etc
        self.ops_run = set()  # list of ops which have been started/run

        # Op dict for each host
        self.ops = {host: {} for host in inventory}

        # Facts dict for each host
        self.facts = {host: {} for host in inventory}

        # Meta dict for each host
        self.meta = {
            host: {
                'ops': 0,  # one function call in a deploy file
                'commands': 0,  # actual # of commands to run
                'op_hashes': set(),
            }
            for host in inventory
        }

        # Results dict for each host
        self.results = {
            host: {
                'ops': 0,  # success_ops + failed ops w/ignore_errors
                'success_ops': 0,
                'error_ops': 0,
                'commands': 0,
            }
            for host in inventory
        }

        # Assign state back references to inventory & config
        inventory.state = config.state = self
        for host in inventory:
            host.state = self

        self.initialised = True

        # Flag to track added users (via `server.user` operation calls). This is
        # specifically to address users not existing during fact gathering phase
        # causing failures with su_user/sudo_user. If we expect to add the user
        # those facts should not fail but default.
        self.will_add_users = []
Example No. 32
            def ff(url):
                while 1:
                    try:
                        r = requests.get(url, timeout=3)
                        return '\n'.join([i['pId'] for i in r.json()['products']])
                    except:
                        continue
            pp = Pool(30)
            ss = pp.map(ff, urls)
            try:
                pp.close()
                pp.join()
            except:
                pass
            global jishu
            jishu += 1
            sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r')
            return '\n'.join(ss) + '\n'
        except:
            continue

with open('allcategory.txt') as f:
    allcategory = [i.strip() for i in f.readlines()]
zongshu = str(len(allcategory))
jishu = 0
with open('allids.txt', 'w') as f:
    # Another Pool is opened here. At first I looped over the categories single-threaded, since each category already spawns its own pool, but that turned out to take a very long time.
    p1 = Pool(50)
    ss = p1.map(getgome, allcategory)
    f.writelines(ss)
Example No. 33
class DoubanSpider(DBMixin):
    """" 豆瓣爬虫 """
    def __init__(self, proxy_manager=None):
        self.result_page = self.db.result_page
        self.result_topic = self.db.result_topic
        self.cache = self.db.cache_page

        self.group_list = GROUP_LIST
        self.rules = RULES
        self.interval = WATCH_INTERVAL

        self.pool = Pool(size=POOL_SIZE)
        self.page_queue = Queue()
        self.topic_queue = Queue()

        self.proxy_manager = proxy_manager

    def fetch(self, url, timeout=10, retury_num=10):
        """发起HTTP请求

        @url, str, URL
        @timeout, int, 超时时间
        @retury_num, int, 重试次数
        """
        kwargs = {
            "headers": {
                "User-Agent": USER_AGENT,
                "Referer": "https://www.douban.com/"
            },
        }
        kwargs["timeout"] = timeout
        resp = None
        proxy = None
        for i in range(retury_num):
            try:
                # use a proxy if a proxy manager is configured
                if self.proxy_manager is not None:
                    proxy = self.proxy_manager.get_proxy()
                    kwargs["proxies"] = {
                        "http": 'http://%s' % proxy,
                        "https": 'https://%s' % proxy
                    }
                    # print('proxies: ', kwargs['proxies'])
                resp = requests.get(url, **kwargs)
                if resp.status_code != 200:
                    raise HTTPError(resp.status_code, url)
                break
            except Exception as exc:
                logger.warn("%s %d failed!\n%s", url, i, str(exc))
                if self.proxy_manager is not None:
                    self.proxy_manager.remove(proxy)
                time.sleep(2)
                continue

        if resp is None:
            raise URLFetchError(url)
        return resp.content.decode('utf-8')

    def extract(self, regx, body, multi=False):
        """解析元素,xpath语法

        @regx, str, 解析表达式
        @body, str or element, 网页源码或元素
        @multi, bool, 是否取多个
        """
        if isinstance(body, str):
            body = etree.HTML(body)
        res = body.xpath(regx)
        if multi:
            return res
        return res[0] if res else None

    def run(self):
        """run
        """
        all_greenlet = []
        # crawl on a schedule
        for group_url in self.group_list:
            # timer = Timer(random.randint(0, self.interval), self.interval)
            timer = Timer(random.randint(0, 2), self.interval)
            greenlet = gevent.spawn(timer.run, self._init_page_tasks,
                                    group_url)
            all_greenlet.append(greenlet)
        # 生产 & 消费
        all_greenlet.append(gevent.spawn(self._page_loop))
        all_greenlet.append(gevent.spawn(self._topic_loop))
        # reload proxies every 10 minutes
        proxy_timer = Timer(PROXY_INTERVAL, PROXY_INTERVAL)
        all_greenlet.append(gevent.spawn(proxy_timer.run, self.reload_proxies))
        gevent.joinall(all_greenlet)

    def reload_proxies(self):
        """重新加载代理
        """
        self.proxy_manager.reload_proxies()

    def _init_page_tasks(self, group_url):
        """初始化页面任务

        @group_url, str, 小组URL
        """
        for page in range(MAX_PAGE):
            base_url = "%s%s" % (group_url, GROUP_SUFFIX)
            url = base_url % (page * 25)
            self.page_queue.put(url)

    def _page_loop(self):
        """page loop
        """
        while 1:
            page_url = self.page_queue.get(block=True)
            gevent.sleep(1)
            self.pool.spawn(self._crawl_page, page_url)

    def _topic_loop(self):
        """topic loop
        """
        while 1:
            topic_url = self.topic_queue.get(block=True)
            self.pool.spawn(self._crawl_detail, topic_url)

    def _crawl_page(self, url):
        """爬取帖子

        @url, str, 当前页面URL
        """
        logger.info("processing page: %s", url)
        html = self.fetch(url)
        topic_urls = self.extract(self.rules["url_list"], html, multi=True)

        # find newly added topic URLs
        diff_urls = self._diff_urls(topic_urls)
        if not diff_urls:
            logger.info("%s no update ...", url)
            return

        logger.info("%s new add : %d", url, len(diff_urls))
        topic_list = self.extract(self.rules["topic_item"], html, multi=True)

        # collect the topic info on this page
        topics = self._get_page_info(topic_list)

        # split topics into new ones and previously seen ones
        new_topics, old_topics = self._filter_topics(topics, diff_urls)

        # save the new topics
        self.result_page.insert(new_topics)

        # update reply time and reply count of existing topics
        self._update_old_topics(old_topics)

        # queue detail-crawling tasks for the new topics
        self._init_topic_tasks(diff_urls)

        # update the cache
        self._update_cache(diff_urls)

    def _get_page_info(self, topic_list):
        """获取每一页的帖子基本信息

        @topic_list, list, 当前页的帖子项
        """
        topics = []
        # the first row is the table header, skip it
        for topic_item in topic_list[1:]:
            topic = {}
            topic["title"] = self.extract(self.rules["title"], topic_item)
            topic["author"] = self.extract(self.rules["author"], topic_item)
            topic["reply"] = self.extract(self.rules["reply"], topic_item) or 0
            topic["last_reply_time"] = self.extract(
                self.rules["last_reply_time"], topic_item)
            topic["url"] = self.extract(self.rules["url"], topic_item)
            now = time.time()
            topic["got_time"] = now
            topic["last_update_time"] = now
            # print('page info topic: {}'.format(topic))
            if not self._is_intermediary(topic['author'], topic['title'],
                                         None):
                topics.append(topic)
        return topics

    @staticmethod
    def _filter_topics(topics, diff_urls):
        """过滤帖子,找出新增的和老的帖子

        @topics, list, 当前页所有帖子信息
        @diff_urls, list, 新增的帖子URL
        """
        new_topics, old_topics = [], []
        for topic in topics:
            if topic["url"] in diff_urls:
                new_topics.append(topic)
            else:
                old_topics.append(topic)
        return new_topics, old_topics

    def _diff_urls(self, topic_urls):
        """过滤重复帖子URL

        @topic_urls, list, 当前页所有帖子URL
        """
        # compare against the cache
        cache_urls = []
        cursor = self.cache.find()
        for item in cursor:
            cache_urls.extend(item["urls"])
        # keep only the new URLs
        diff_urls = list(set(topic_urls) - set(cache_urls))
        return diff_urls

    def _update_old_topics(self, old_topics):
        """更新老帖子的信息,标题,回应时间和回复数量

        @old_topics, list, 老帖子列表
        """
        for topic in old_topics:
            new_info = {
                "title": topic["title"],
                "reply": topic["reply"],
                "last_reply_time": topic["last_reply_time"],
                "last_update_time": time.time()
            }
            self.result_page.update({"url": topic["url"]}, {"$set": new_info})
            logger.info("%s updated ...", topic["url"])

    def _init_topic_tasks(self, topic_urls):
        """初始化帖子任务

        @topic_urls, list, 当前页面帖子的URL
        """
        for url in topic_urls:
            self.topic_queue.put(url)

    def _update_cache(self, diff_urls):
        """更新缓存

        @diff_urls, list, 新增的帖子URL
        """
        self.cache.insert({"got_time": time.time(), "urls": diff_urls})

    def _crawl_detail(self, url):
        """爬取每个帖子的详情

        @url, str, 每个帖子的URL
        """
        logger.info("processing topic: %s", url)
        html = self.fetch(url)

        # extract the topic details
        topic = self._get_detail_info(html, url)

        if not topic:
            # self.topic_queue.put(url)
            return

        topic["url"] = url
        topic["got_time"] = time.time()

        # 不存在 & 保存帖子的信息
        if self.result_topic.find_one({"url": url}):
            return
        self.result_topic.insert(topic)

    def _get_detail_info(self, html, url):
        """获取帖子详情

        @html, str, 页面
        """
        if "机器人" in html:
            logger.warn("%s 403.html", url)
            return None

        topic = {}
        title = self.extract(self.rules["detail_title_sm"], html) \
            or self.extract(self.rules["detail_title_lg"], html)

        if title is None:
            return None

        topic["title"] = title.strip()

        topic["create_time"] = self.extract(self.rules["create_time"], html)

        topic["author"] = self.extract(self.rules["detail_author"], html)
        topic["content"] = '\n'.join(
            self.extract(self.rules["content"], html, multi=True))
        # print('detail topic: {}'.format(topic))

        if self._is_intermediary(topic['author'], topic['title'],
                                 topic['content']):
            return None

        return topic

    def _is_intermediary(self, author, title, content=None):
        """根据关键词, 内容和豆瓣用户名等判断是否为中介"""
        full_text = title
        if content is not None:
            if len(content) < 20 or len(content) > 500 or content == title:
                return True
            full_text += content

        # '豆友' is Douban's default username prefix, '直租' means "direct rental"
        if author.startswith('豆友') or author.find('直租') != -1:
            return True

        exclamation_count = full_text.count('!') + full_text.count('!')
        if exclamation_count >= 3:
            return True

        for kw in INTERMEDIARY_KEYWORDS:
            if full_text.find(kw) != -1:
                return True
        return False
Example No. 34
from gtwisted.core.asyncresultfactory import AsyncResultFactory
from gtwisted.core.error import RPCDataTooLongError
from gevent.timeout import Timeout
from gevent.pool import Pool
from gfirefly.server.logobj import logger
import struct
import rpc_pb2
import marshal

ASK_SIGNAL = "ASK"  # signal asking for a result
NOTICE_SIGNAL = "NOTICE"  # notification-only signal, no return value expected
ANSWER_SIGNAL = "ANSWER"  # signal carrying the returned result
DEFAULT_TIMEOUT = 60  # default timeout for waiting on a result
RPC_DATA_MAX_LENGTH = 1024 * 1024  # maximum allowed size of an RPC packet

GEVENT_POOL = Pool(500)
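# Shared module-level pool: at most 500 greenlets spawned through it run concurrently.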


def _write_parameter(proto, arg):
    if isinstance(arg, str):
        proto.proto_param = arg
    elif isinstance(arg, bool):
        proto.bool_param = arg
    elif isinstance(arg, unicode):
        proto.string_param = arg
    elif isinstance(arg, int) or isinstance(arg, long):
        proto.int_param = arg
    elif isinstance(arg, float):
        proto.float_param = arg
    elif arg is None:
        proto.is_null = True
Example No. 35
    dest = utils.parse_mongo_url(args.dest)
    dest_client = utils.mongo_connect(dest['host'], dest['port'],
                                      max_pool_size=POOL_SIZE,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    if source == dest:
        raise ValueError("source and destination cannot be the same!")

    # periodically print stats
    stats = Stats()
    stats_greenlet = gevent.spawn(stats_worker, stats)

    # copy documents!
    pool = Pool(POOL_SIZE)
    with open(args.mismatches_file) as mismatches_file:
        lines = mismatches_file.readlines()  # copy everything into memory -- hopefully that isn't huge
    stats.total = len(lines)
    for line in lines:
        query_doc = {'_id': MismatchLogger.decode_mismatch_id(line)}
        pool.spawn(copy_document_worker,
                   query_doc=query_doc,
                   source_collection=source_collection,
                   dest_collection=dest_collection,
                   stats=stats)

    # wait for everything to finish
    gevent.sleep()
    pool.join()
    stats_greenlet.kill()
Example No. 36
    def handle_download(self, directory, container, threads, verbose):
        @self.requires_auth
        def _download(i, files, directory, errors):
            if verbose:
                print_('Starting thread %s' % i)
            s = requests.Session()
            directory = os.path.abspath(directory)
            for filename in files:
                if verbose > 1:
                    print_('Downloading %s' % filename)
                try:
                    path = os.path.join(directory, filename)
                    try:
                        os.makedirs(os.path.dirname(path), 493)  # 0755
                    except OSError as e:
                        if e.errno != 17:
                            raise
                    with open(path, 'wb+') as f:
                        r = s.get('%s/%s/%s' %
                                  (self.endpoint, container, filename),
                                  headers={'X-Auth-Token': self.token},
                                  stream=True)
                        if r.status_code == 401:
                            raise AuthenticationError
                        for block in r.iter_content(4096):
                            if not block:
                                break
                            f.write(block)
                except:
                    e = sys.exc_info()[1]
                    errors.append({
                        'name': filename,
                        'container': container,
                        'exception': str(e)
                    })
                else:
                    if r.status_code != 200:
                        errors.append({
                            'name': filename,
                            'container': container,
                            'status_code': r.status_code,
                            'headers': r.headers,
                            'response': json.loads(r.text)
                        })
            if verbose:
                print_('Completed thread %s' % i)

        files = collections.defaultdict(list)
        thread_mark = threads
        files_per_thread = len(self.objects) / threads / 3
        i = 0
        for o in self.objects:
            files[i].append(o['name'])
            i += 1
            if len(files[thread_mark - 1]) == files_per_thread:
                thread_mark += threads
                files_per_thread = files_per_thread / 2
                i = 0
            if i == thread_mark:
                i = 0

        pool = Pool(size=threads)
        errors = []
        for i, file_chunk in iteritems(files):
            pool.spawn(_download, i, file_chunk, directory, errors)
        pool.join()
        return errors
Example No. 37
    def run(self):
        servers = []
        ssl_args = {}

        if self.cfg.is_ssl:
            ssl_args = dict(server_side=True,
                            do_handshake_on_connect=False,
                            **self.cfg.ssl_options)

        for s in self.sockets:
            s.setblocking(1)
            pool = Pool(self.worker_connections)
            if self.server_class is not None:
                server = self.server_class(s,
                                           application=self.wsgi,
                                           spawn=pool,
                                           log=self.log,
                                           handler_class=self.wsgi_handler,
                                           **ssl_args)
            else:
                hfun = partial(self.handle, s)
                server = StreamServer(s, handle=hfun, spawn=pool, **ssl_args)

            server.start()
            servers.append(server)

        pid = os.getpid()
        try:
            while self.alive:
                self.notify()

                if pid == os.getpid() and self.ppid != os.getppid():
                    self.log.info("Parent changed, shutting down: %s", self)
                    break

                gevent.sleep(1.0)

        except KeyboardInterrupt:
            pass

        try:
            # Stop accepting requests
            [server.stop_accepting() for server in servers]

            # Handle current requests until graceful_timeout
            ts = time.time()
            while time.time() - ts <= self.cfg.graceful_timeout:
                accepting = 0
                for server in servers:
                    if server.pool.free_count() != server.pool.size:
                        accepting += 1

                # if no server is accepting a connection, we can exit
                if not accepting:
                    return

                self.notify()
                gevent.sleep(1.0)

            # Force kill all the active handlers
            self.log.warning("Worker graceful timeout (pid:%s)" % self.pid)
            [server.stop(timeout=1) for server in servers]
        except:
            pass
Example No. 38
 def _run(self, *args, **kwargs):
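     # If "concurrency" is not configured, Pool(size=None) places no limit on the pool.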
     pool = Pool(size=self._config.get("concurrency", None))
     for task in self._tasks:
         pool.start(Greenlet(task))
     pool.join(raise_error=True)
Example No. 39
    def query_activities(self,
                         activity_ids=None,
                         limit=None,
                         after=None,
                         before=None,
                         only_ids=False,
                         summaries=True,
                         streams=False,
                         owner_id=False,
                         build_index=True,
                         pool=None,
                         out_queue=None,
                         cache_timeout=CACHE_ACTIVITIES_TIMEOUT,
                         **kwargs):

        if self.indexing():
            return [{
                "error":
                "Building activity index for {}".format(self.id) +
                "...<br>Please try again in a few seconds.<br>"
            }]

        # convert date strings to datetimes, if applicable
        if before or after:
            try:
                after = self.__class__.to_datetime(after)
                if before:
                    before = self.__class__.to_datetime(before)
                    assert (before > after)
            except AssertionError:
                return [{"error": "Invalid Dates"}]

        # app.logger.info("query_activities called with: {}".format({
        #     "activity_ids": activity_ids,
        #     "limit": limit,
        #     "after": after,
        #     "before": before,
        #     "only_ids": only_ids,
        #     "summaries": summaries,
        #     "streams": streams,
        #     "owner_id": owner_id,
        #     "build_index": build_index,
        #     "pool": pool,
        #     "out_queue": out_queue
        # }))

        def import_streams(client, queue, activity):
            # app.logger.debug("importing {}".format(activity["id"]))

            stream_data = Activities.import_streams(client, activity["id"],
                                                    STREAMS_TO_CACHE,
                                                    cache_timeout)

            data = {
                s: stream_data[s]
                for s in STREAMS_OUT + ["error"] if s in stream_data
            }
            data.update(activity)
            queue.put(data)
            # app.logger.debug("importing {}...queued!".format(activity["id"]))
            gevent.sleep(0)

        pool = pool or Pool(CONCURRENCY)
        client = self.client()

        #  If out_queue is not supplied then query_activities is blocking
        put_stopIteration = False
        if not out_queue:
            out_queue = Queue()
            put_stopIteration = True

        index_df = None
        if (summaries or limit or only_ids or after or before):
            activity_index = self.get_index()

            if activity_index:
                index_df = activity_index["index_df"]
                elapsed = (datetime.utcnow() -
                           activity_index["dt_last_indexed"]).total_seconds()

                # update the index if we need to
                if (not OFFLINE) and (elapsed > INDEX_UPDATE_TIMEOUT):
                    index_df = self.update_index(index_df)

                if (not activity_ids):
                    # only consider activities with a summary polyline
                    ids_df = (index_df[index_df.summary_polyline.notnull(
                    )].set_index("ts_local").sort_index(ascending=False).id)

                    if limit:
                        ids_df = ids_df.head(int(limit))

                    elif before or after:
                        #  get ids of activities in date-range
                        if after:
                            ids_df = ids_df[:after]
                        if before:
                            ids_df = ids_df[before:]

                    activity_ids = ids_df.tolist()

                index_df = index_df.astype(
                    Users.index_df_out_dtypes).set_index("id")

                if only_ids:
                    out_queue.put(activity_ids)
                    out_queue.put(StopIteration)
                    return out_queue

                def summary_gen():
                    for aid in activity_ids:
                        A = {"id": int(aid)}
                        if summaries:
                            A.update(index_df.loc[int(aid)].to_dict())
                        # app.logger.debug(A)
                        yield A

                gen = summary_gen()

            elif build_index:
                # There is no activity index and we are to build one
                if only_ids:
                    return ["build"]

                else:
                    gen = Queue()
                    gevent.spawn(self.build_index, gen, limit, after, before,
                                 activity_ids)
            else:
                # Finally, if there is no index and, rather than building one,
                # we are asked to get the summary data directly from Strava
                # app.logger.info(
                #     "{}: getting summaries from Strava without build"
                #     .format(self))
                gen = (Activities.strava2dict(a)
                       for a in self.client().get_activities(
                           limit=limit, before=before, after=after))

        for A in gen:
            if "stop_rendering" in A:
                pool.join()

            if "id" not in A:
                out_queue.put(A)
                continue

            if summaries:
                if ("bounds" not in A):
                    A["bounds"] = Activities.bounds(A["summary_polyline"])

                A["ts_local"] = str(A["ts_local"])

                # TODO: do this on the client
                A.update(Activities.atype_properties(A["type"]))

            if owner_id:
                A.update({"owner": self.id, "profile": self.profile})

            if not streams:
                out_queue.put(A)

            else:
                stream_data = Activities.get(A["id"])

                if stream_data:
                    A.update(stream_data)
                    if ("bounds" not in A):
                        A["bounds"] = Activities.bounds(A["polyline"])
                    out_queue.put(A)

                elif not OFFLINE:
                    pool.spawn(Activities.import_and_queue_streams, client,
                               out_queue, A)
                gevent.sleep(0)

        # If we are using our own queue, we put a StopIteration at the end of
        #  it, so we have to wait for all import jobs to finish. If the caller
        #  supplies a queue, we can return immediately and let them handle the
        #  responsibility of adding the StopIteration.
        if put_stopIteration:
            pool.join()
            out_queue.put(StopIteration)

        return out_queue
Example No. 40
File: main.py  Project: wwsit/spider
class XiMaLaYaAllDataSpider():
    """喜马拉雅爬虫"""
    def __init__(self):
        """初始化"""
        self.basic_url = 'https://m.ximalaya.com/m-revision/common/album/queryAlbumTrackRecordsByPage?albumId=203355&page={}&pageSize=7'
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36"
        }
        self.pool = Pool(3)
        self.url_queue = Queue()
        self.list_data = []

    def fun(self, blocknum, bs, size):
        """显示下载的进度"""
        percent = blocknum * bs / size
        percent = percent * 100
        int_data = int(percent)
        if int_data % 10 == 0:
            if int_data not in self.list_data:
                print("id:%s-----download: %d%%" % (blocknum, int_data))
                self.list_data.append(int_data)

    def get_url(self):
        """获取url列表"""

        for page in range(10):
            url = self.basic_url.format(page)
            # Put the generated URL into the queue
            self.url_queue.put(url)

    def exec_task(self):
        """定义执行任务代码"""
        # Get a URL from the queue
        url = self.url_queue.get()
        resp = requests.get(url=url, headers=self.headers)
        # Extract the list data
        result = json.loads(resp.content)['data']["trackDetailInfos"]

        for sigle_data in result:
            """单个数据下的内容"""
            item = {}
            item["url"] = sigle_data["trackInfo"]["playPath"]
            item['name'] = sigle_data["trackInfo"]["title"][6:]
            print(item)
            # Download the audio file
            down_url = request.urlretrieve(url=item['url'],
                                           filename='./down_file/' +
                                           item["name"] + '.mp3',
                                           reporthook=self.fun)
        print('*' * 50)
        time.sleep(2)
        self.url_queue.task_done()

    def exec_task_finished(self, result):
        """定义任务执行代码完成后回调"""
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)

    def run(self):
        # Put all URLs into the queue
        self.get_url()

        # Run the tasks on threads from the thread pool.
        # callback is the function called back once a task finishes.
        # By default the threads in the thread pool are daemon threads.
        for i in range(3):
            self.pool.apply_async(self.exec_task,
                                  callback=self.exec_task_finished)

        self.url_queue.join()
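A minimal, self-contained sketch of the apply_async + callback re-scheduling pattern used above (the task body and queue contents are placeholders; only gevent is assumed):

from gevent import monkey
monkey.patch_all()

from gevent.pool import Pool
from gevent.queue import JoinableQueue

pool = Pool(3)
task_queue = JoinableQueue()
for n in range(9):
    task_queue.put("task-%d" % n)


def exec_task():
    # Placeholder work: take one item off the queue and "process" it.
    item = task_queue.get()
    print("processing", item)
    task_queue.task_done()


def exec_task_finished(result):
    # Each finished task schedules the next one, keeping the pool busy.
    pool.apply_async(exec_task, callback=exec_task_finished)


# Prime the pool with one task per worker; the callbacks keep it fed.
for _ in range(3):
    pool.apply_async(exec_task, callback=exec_task_finished)

task_queue.join()  # returns once every queued item has been processed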
Ejemplo n.º 41
0
import logging
import math
import os
import warnings

from flask import Flask
from gevent.pool import Pool

from opentracing_utils import trace, extract_span_from_kwargs

from app.config import MAX_QUERY_TIME_SLICE, UPDATER_CONCURRENCY
from app.extensions import db
from app.libs.zmon import query_sli

from .models import IndicatorValue, Indicator
from .models import insert_indicator_value

MIN_VAL = math.expm1(1e-10)

logger = logging.getLogger(__name__)

updater_pool = Pool(UPDATER_CONCURRENCY)


def update_all_indicators(app: Flask):
    """
    Update all indicators async!
    """
    if os.environ.get('SLR_LOCAL_ENV'):
        warnings.warn(
            'Running on local env while not setting up gevent properly!')

    for indicator in Indicator.query.all():
        try:
            if indicator.is_deleted is True:
                continue
            updater_pool.spawn(update_indicator, app, indicator)
Ejemplo n.º 42
0
    def related_activities(self,
                           activity_id,
                           streams=False,
                           pool=None,
                           out_queue=None):
        client = self.client()

        put_stopIteration = not out_queue

        out_queue = out_queue or Queue()
        pool = pool or Pool(CONCURRENCY)

        trivial_list = []

        # First we put this activity
        try:
            A = client.get_activity(int(activity_id))
        except Exception as e:
            app.logger.info("Error getting this activity: {}".format(e))
        else:
            trivial_list.append(A)

        try:
            related_activities = list(
                client.get_related_activities(int(activity_id)))

        except Exception as e:
            app.logger.info("Error getting related activities: {}".format(e))
            return [{"error": str(e)}]

        for obj in itertools.chain(related_activities, trivial_list):
            if streams:
                owner = self.__class__.get(obj.athlete.id)

                if owner:
                    # the owner is a Heatflask user
                    A = Activities.strava2dict(obj)
                    A["ts_local"] = str(A["ts_local"])
                    A["owner"] = owner.id
                    A["profile"] = owner.profile
                    A["bounds"] = Activities.bounds(A["summary_polyline"])
                    A.update(Activities.atype_properties(A["type"]))

                    stream_data = Activities.get(obj.id)
                    if stream_data:
                        A.update(stream_data)
                        out_queue.put(A)
                    else:
                        pool.spawn(Activities.import_and_queue_streams,
                                   owner.client(), out_queue, A)
            else:
                # we don't care about activity streams
                A = Activities.strava2dict(obj)

                A["ts_local"] = str(A["ts_local"])
                A["profile"] = "/avatar/athlete/medium.png"
                A["owner"] = obj.athlete.id
                A["bounds"] = Activities.bounds(A["summary_polyline"])
                A.update(Activities.atype_properties(A["type"]))
                out_queue.put(A)

        if put_stopIteration:
            out_queue.put(StopIteration)

        return out_queue
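The Pool-plus-Queue pattern above, where workers feed an output queue and StopIteration marks the end of the stream, reduces to a short self-contained sketch (the fetch worker is a stand-in for the real stream import; only gevent is assumed):

import gevent
from gevent.pool import Pool
from gevent.queue import Queue


def fetch(out_queue, item):
    # Stand-in for the real per-activity import job.
    gevent.sleep(0.1)
    out_queue.put({"id": item})


def produce(items):
    pool = Pool(10)
    out_queue = Queue()
    for item in items:
        pool.spawn(fetch, out_queue, item)
    pool.join()                    # wait for all workers to finish
    out_queue.put(StopIteration)   # sentinel: iteration over the queue ends here
    return out_queue


# gevent Queues are iterable; iteration stops at the StopIteration sentinel.
for record in produce(range(5)):
    print(record)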
Ejemplo n.º 43
0
def run_many(tests, expected=None, failfast=False):
    global NWORKERS, pool
    start = time()
    total = 0
    failed = {}

    NWORKERS = min(len(tests), NWORKERS)
    pool = Pool(NWORKERS)
    util.BUFFER_OUTPUT = NWORKERS > 1

    def run_one(cmd, **kwargs):
        result = util.run(cmd, **kwargs)
        if result:
            if failfast:
                sys.exit(1)
            # the tests containing AssertionError might have failed because
            # we spawned more workers than CPUs
            # we therefore will retry them sequentially
            failed[result.name] = [
                cmd, kwargs, 'AssertionError' in (result.output or '')
            ]

    try:
        try:
            for cmd, options in tests:
                total += 1
                spawn(run_one, cmd, **(options or {}))
            gevent.wait()
        except KeyboardInterrupt:
            try:
                if pool:
                    util.log('Waiting for currently running to finish...')
                    pool.join()
            except KeyboardInterrupt:
                util.report(total,
                            failed,
                            exit=False,
                            took=time() - start,
                            expected=expected)
                util.log('(partial results)\n')
                raise
    except:
        traceback.print_exc()
        pool.kill()  # this needed to kill the processes
        raise

    toretry = [
        key for (key, (cmd, kwargs, can_retry)) in failed.items() if can_retry
    ]
    failed_then_succeeded = []

    if NWORKERS > 1 and toretry:
        util.log('\nWill retry %s failed tests sequentially:\n- %s\n',
                 len(toretry), '\n- '.join(toretry))
        for name, (cmd, kwargs, _ignore) in list(failed.items()):
            if not util.run(cmd, buffer_output=False, **kwargs):
                failed.pop(name)
                failed_then_succeeded.append(name)

    if failed_then_succeeded:
        util.log(
            '\n%s tests failed during concurrent run but succeeded when ran sequentially:',
            len(failed_then_succeeded))
        util.log('- ' + '\n- '.join(failed_then_succeeded))

    util.log('gevent version %s from %s', gevent.__version__, gevent.__file__)
    util.report(total, failed, took=time() - start, expected=expected)
    assert not pool, pool
Ejemplo n.º 44
0
    def init(self, inventory, config, initial_limit=None):
        # Config validation
        #

        # If no config, create one using the defaults
        if config is None:
            config = Config()

        # Error if our min version is not met
        if config.MIN_PYINFRA_VERSION is not None:
            running_version = parse_version(__version__)
            needed_version = parse_version(
                # Version must be a string
                six.text_type(config.MIN_PYINFRA_VERSION), )

            if needed_version > running_version:
                raise PyinfraError(('Minimum pyinfra version not met '
                                    '(minimum={0}, running={1})').format(
                                        config.MIN_PYINFRA_VERSION,
                                        __version__,
                                    ))

        if not config.PARALLEL:
            # TODO: benchmark this
            # In my own tests the optimum number of parallel SSH processes is
            # ~20 per CPU core - no science here yet, needs benchmarking!
            cpus = cpu_count()
            ideal_parallel = cpus * 20

            config.PARALLEL = (min(ideal_parallel, len(inventory),
                                   MAX_PARALLEL) if MAX_PARALLEL is not None
                               else min(ideal_parallel, len(inventory)))

        # If explicitly set, just issue a warning
        elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
            logger.warning((
                'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
                '    Max recommended value: {2}').format(
                    config.PARALLEL, nofile_limit, MAX_PARALLEL))

        # Actually initialise the state object
        #

        # Setup greenlet pools
        self.pool = Pool(config.PARALLEL)
        self.fact_pool = Pool(config.PARALLEL)

        # Connection storage
        self.ssh_connections = {}
        self.sftp_connections = {}

        # Private keys
        self.private_keys = {}

        # Facts storage
        self.facts = {}
        self.fact_locks = {}

        # Assign inventory/config
        self.inventory = inventory
        self.config = config

        # Hosts we've activated at any time
        self.activated_hosts = set()
        # Active hosts that *haven't* failed yet
        self.active_hosts = set()
        # Hosts that are ready to be deployed to
        self.ready_hosts = set()
        # Hosts that have failed
        self.failed_hosts = set()

        # Limit hosts changes dynamically to limit operations to a subset of hosts
        self.limit_hosts = initial_limit

        # Op basics
        self.op_line_numbers_to_hash = {}
        self.op_meta = {}  # maps operation hash -> names/etc
        self.ops_run = set()  # list of ops which have been started/run

        # Op dict for each host
        self.ops = {host: {} for host in inventory}

        # Facts dict for each host
        self.facts = {host: {} for host in inventory}

        # Meta dict for each host
        self.meta = {
            host: {
                'ops': 0,  # one function call in a deploy file
                'commands': 0,  # actual # of commands to run
                'op_hashes': set(),
            }
            for host in inventory
        }

        # Results dict for each host
        self.results = {
            host: {
                'ops': 0,  # success_ops + failed ops w/ignore_errors
                'success_ops': 0,
                'error_ops': 0,
                'commands': 0,
            }
            for host in inventory
        }

        # Assign state back references to inventory & config
        inventory.state = config.state = self

        self.initialised = True
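The sizing comment above (roughly 20 parallel SSH processes per CPU core, capped by the inventory size and MAX_PARALLEL) works out as in this small worked example with assumed numbers:

# Assumed values: 4 CPU cores, 500 hosts in the inventory, MAX_PARALLEL = 100.
cpus = 4
ideal_parallel = cpus * 20                       # 80
inventory_size = 500
MAX_PARALLEL = 100

PARALLEL = min(ideal_parallel, inventory_size, MAX_PARALLEL)
print(PARALLEL)                                  # 80 greenlets in each pool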
Ejemplo n.º 45
0
    def run(self):
        servers = []
        ssl_args = {}

        if self.cfg.is_ssl:
            ssl_args = dict(server_side=True, **self.cfg.ssl_options)

        for s in self.sockets:
            s.setblocking(1)
            pool = Pool(self.worker_connections)
            if self.server_class is not None:
                environ = base_environ(self.cfg)
                environ.update({
                    "wsgi.multithread": True,
                    "SERVER_SOFTWARE": VERSION,
                })
                server = self.server_class(s,
                                           application=self.wsgi,
                                           spawn=pool,
                                           log=self.log,
                                           handler_class=self.wsgi_handler,
                                           environ=environ,
                                           **ssl_args)
            else:
                hfun = partial(self.handle, s)
                server = StreamServer(s, handle=hfun, spawn=pool, **ssl_args)

            server.start()
            servers.append(server)

        while self.alive:
            self.notify()
            gevent.sleep(1.0)

        try:
            # Stop accepting requests
            for server in servers:
                if hasattr(server, 'close'):  # gevent 1.0
                    server.close()
                if hasattr(server, 'kill'):  # gevent < 1.0
                    server.kill()

            # Handle current requests until graceful_timeout
            ts = time.time()
            while time.time() - ts <= self.cfg.graceful_timeout:
                accepting = 0
                for server in servers:
                    if server.pool.free_count() != server.pool.size:
                        accepting += 1

                # if no server is accepting a connection, we can exit
                if not accepting:
                    return

                self.notify()
                gevent.sleep(1.0)

            # Force kill all active handlers
            self.log.warning("Worker graceful timeout (pid:%s)" % self.pid)
            for server in servers:
                server.stop(timeout=1)
        except:
            pass
Ejemplo n.º 46
0
def __init__(self):
    self.mongo_pool = MongoPool()
    self.queue = Queue()
    self.coroutine_pool = Pool()
Ejemplo n.º 47
0
def main_loop(config):
    """
    Main application loop.

    :param config: configuration
    :type config: Config

    Algorithm:
     * Open a connection to tarantool.queue using the config.QUEUE_* settings.
     * Create a pool of workers.
     * Create a queue where the workers put completed tasks.
     * While the number of workers <= config.WORKER_POOL_SIZE, take a task from
       tarantool.queue and start a greenlet to process it.
     * Notify tarantool.queue that the tasks are done.
     * Sleep for config.SLEEP seconds.
    """
    logger.info('Connect to queue server on {host}:{port} space #{space}.'.format(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    ))
    queue = tarantool_queue.Queue(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    )

    logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format(
        tube=config.QUEUE_TUBE,
        take_timeout=config.QUEUE_TAKE_TIMEOUT
    ))

    tube = queue.tube(config.QUEUE_TUBE)

    logger.info('Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE))
    worker_pool = Pool(config.WORKER_POOL_SIZE)

    processed_task_queue = gevent_queue.Queue()

    logger.info('Run main loop. Worker pool size={count}. Sleep time is {sleep}.'.format(
        count=config.WORKER_POOL_SIZE, sleep=config.SLEEP
    ))

    while run_application:
        free_workers_count = worker_pool.free_count()

        logger.debug('Pool has {count} free workers.'.format(count=free_workers_count))

        for number in xrange(free_workers_count):
            logger.debug('Get task from tube for worker#{number}.'.format(number=number))

            task = tube.take(config.QUEUE_TAKE_TIMEOUT)

            if task:
                logger.info('Start worker#{number} for task id={task_id}.'.format(
                    task_id=task.task_id, number=number
                ))

                worker = Greenlet(
                    notification_worker,
                    task,
                    processed_task_queue,
                    timeout=config.HTTP_CONNECTION_TIMEOUT,
                    verify=False
                )
                worker_pool.add(worker)
                worker.start()

        done_with_processed_tasks(processed_task_queue)

        sleep(config.SLEEP)
    else:
        logger.info('Stop application loop.')
Ejemplo n.º 48
0
    def __init__(self, inventory, config=None):
        # Connection storage
        self.ssh_connections = {}
        self.sftp_connections = {}

        # Private keys
        self.private_keys = {}

        # Facts storage
        self.facts = {}
        self.fact_locks = {}

        # If no config, create one using the defaults
        if config is None:
            config = Config()

        if not config.PARALLEL:
            # Run everything in parallel if possible, otherwise cap at MAX_PARALLEL if it is defined above
            config.PARALLEL = (min(len(inventory), MAX_PARALLEL)
                               if MAX_PARALLEL is not None else len(inventory))

        # If explicitly set, just issue a warning
        elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
            logger.warning((
                'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
                '    Max recommended value: {2}').format(
                    config.PARALLEL, nofile_limit, MAX_PARALLEL))

        # Setup greenlet pools
        self.pool = Pool(config.PARALLEL)
        self.fact_pool = Pool(config.PARALLEL)

        # Assign inventory/config
        self.inventory = inventory
        self.config = config

        # Assign self to inventory & config
        inventory.state = config.state = self

        # Host tracking
        self.active_hosts = set()
        self.ready_hosts = set()
        self.connected_hosts = set()

        hostnames = [host.name for host in inventory]

        # Op basics
        self.op_order = []  # list of operation hashes
        self.op_meta = {}  # maps operation hash -> names/etc
        self.ops_run = set()  # list of ops which have been started/run

        # Op dict for each host
        self.ops = {hostname: {} for hostname in hostnames}

        # Meta dict for each host
        self.meta = {
            hostname: {
                'ops': 0,  # one function call in a deploy file
                'commands': 0,  # actual # of commands to run
                'latest_op_hash': None
            }
            for hostname in hostnames
        }

        # Results dict for each host
        self.results = {
            hostname: {
                'ops': 0,  # success_ops + failed ops w/ignore_errors
                'success_ops': 0,
                'error_ops': 0,
                'commands': 0
            }
            for hostname in hostnames
        }

        # Pipeline facts context manager attached to self
        self.pipeline_facts = PipelineFacts(self)
Ejemplo n.º 49
0
# Use a StreamHandler to log to the console
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

logger.addHandler(ch)

# Squid configuration syntax:
# forward requests to a parent proxy
# PEER_CONF = "cache_peer %s parent %s 0 no-query weighted-round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n"
PEER_CONF = "cache_peer %s parent %s 0 proxy-only no-query no-digest round-robin connect-fail-limit=10 connect-timeout=15 max-conn=10 name=proxyip-%s\n"

# Working proxies
GOOD_PROXIES = []

pool = Pool(50)


def check_proxy(proxy):
    """验证代理是否可用
    :param proxy list:[ip, port]"""
    global GOOD_PROXIES
    ip, port = proxy
    _proxies = {"http": "{}:{}".format(ip, port)}
    try:
        ip_url = "http://httpbin.org/ip"
        res = requests.get(ip_url, proxies=_proxies, timeout=10)
        assert ip in res.content
        logger.info("[GOOD] - {}:{}".format(ip, port))
        GOOD_PROXIES.append(proxy)
    except Exception as e:
Ejemplo n.º 50
0
class BaseServer(object):
    """An abstract base class that implements some common functionality for the servers in gevent.

    *listener* can either be an address that the server should bind on or a :class:`gevent.socket.socket`
    instance that is already bound (and put into listening mode in case of TCP socket).

    *spawn*, if provided, is called to create a new greenlet to run the handler. By default, :func:`gevent.spawn` is used.

    Possible values for *spawn*:

    * a :class:`gevent.pool.Pool` instance -- *handle* will be executed
      using :meth:`Pool.spawn` method only if the pool is not full.
      While it is full, all the connections are dropped;
    * :func:`gevent.spawn_raw` -- *handle* will be executed in a raw
      greenlet which has a little less overhead than :class:`gevent.Greenlet` instances spawned by default;
    * ``None`` -- *handle* will be executed right away, in the :class:`Hub` greenlet.
      *handle* cannot use any blocking functions as it means switching to the :class:`Hub`.
    * an integer -- a shortcut for ``gevent.pool.Pool(integer)``
    """
    # the number of seconds to sleep in case there was an error in accept() call
    # for consecutive errors the delay will double until it reaches max_delay
    # when accept() finally succeeds the delay will be reset to min_delay again
    min_delay = 0.01
    max_delay = 1

    # Sets the maximum number of consecutive accepts that a process may perform on
    # a single wake up. High values give higher priority to high connection rates,
    # while lower values give higher priority to already established connections.
    # Default is 100. Note, that in case of multiple working processes on the same
    # listening value, it should be set to a lower value. (pywsgi.WSGIServer sets it
    # to 1 when environ["wsgi.multiprocess"] is true)
    max_accept = 100

    _spawn = Greenlet.spawn

    # the default timeout that we wait for the client connections to close in stop()
    stop_timeout = 1

    fatal_errors = (errno.EBADF, errno.EINVAL, errno.ENOTSOCK)

    def __init__(self, listener, handle=None, spawn='default'):
        self._stop_event = Event()
        self._stop_event.set()
        self._watcher = None
        self._timer = None
        self.pool = None
        try:
            self.set_listener(listener)
            self.set_spawn(spawn)
            self.set_handle(handle)
            self.delay = self.min_delay
            self.loop = gevent.get_hub().loop
            if self.max_accept < 1:
                raise ValueError('max_accept must be positive int: %r' %
                                 (self.max_accept, ))
        except:
            self.close()
            raise

    def set_listener(self, listener):
        if hasattr(listener, 'accept'):
            if hasattr(listener, 'do_handshake'):
                raise TypeError(
                    'Expected a regular socket, not SSLSocket: %r' %
                    (listener, ))
            self.family = listener.family
            self.address = listener.getsockname()
            self.socket = listener
        else:
            self.family, self.address = parse_address(listener)

    def set_spawn(self, spawn):
        if spawn == 'default':
            self.pool = None
            self._spawn = self._spawn
        elif hasattr(spawn, 'spawn'):
            self.pool = spawn
            self._spawn = spawn.spawn
        elif isinstance(spawn, (int, long)):
            from gevent.pool import Pool
            self.pool = Pool(spawn)
            self._spawn = self.pool.spawn
        else:
            self.pool = None
            self._spawn = spawn
        if hasattr(self.pool, 'full'):
            self.full = self.pool.full
        if self.pool is not None:
            self.pool._semaphore.rawlink(self._start_accepting_if_started)

    def set_handle(self, handle):
        if handle is not None:
            self.handle = handle
        if hasattr(self, 'handle'):
            self._handle = self.handle
        else:
            raise TypeError("'handle' must be provided")

    def _start_accepting_if_started(self, _event=None):
        if self.started:
            self.start_accepting()

    def start_accepting(self):
        if self._watcher is None:
            # just stop watcher without creating a new one?
            self._watcher = self.loop.io(self.socket.fileno(), 1)
            self._watcher.start(self._do_read)

    def stop_accepting(self):
        if self._watcher is not None:
            self._watcher.stop()
            self._watcher = None
        if self._timer is not None:
            self._timer.stop()
            self._timer = None

    def do_handle(self, *args):
        spawn = self._spawn
        if spawn is None:
            self._handle(*args)
        else:
            spawn(self._handle, *args)

    def _do_read(self):
        for _ in xrange(self.max_accept):
            if self.full():
                self.stop_accepting()
                return
            try:
                args = self.do_read()
                self.delay = self.min_delay
                if not args:
                    return
            except:
                self.loop.handle_error(self, *sys.exc_info())
                ex = sys.exc_info()[1]
                if self.is_fatal_error(ex):
                    self.close()
                    sys.stderr.write('ERROR: %s failed with %s\n' %
                                     (self, str(ex) or repr(ex)))
                    return
                if self.delay >= 0:
                    self.stop_accepting()
                    self._timer = self.loop.timer(self.delay)
                    self._timer.start(self._start_accepting_if_started)
                    self.delay = min(self.max_delay, self.delay * 2)
                break
            else:
                try:
                    self.do_handle(*args)
                except:
                    self.loop.handle_error((args[1:], self), *sys.exc_info())
                    if self.delay >= 0:
                        self.stop_accepting()
                        self._timer = self.loop.timer(self.delay)
                        self._timer.start(self._start_accepting_if_started)
                        self.delay = min(self.max_delay, self.delay * 2)
                    break

    def full(self):
        return False

    def __repr__(self):
        return '<%s at %s %s>' % (type(self).__name__, hex(
            id(self)), self._formatinfo())

    def __str__(self):
        return '<%s %s>' % (type(self).__name__, self._formatinfo())

    def _formatinfo(self):
        if hasattr(self, 'socket'):
            try:
                fileno = self.socket.fileno()
            except Exception:
                ex = sys.exc_info()[1]
                fileno = str(ex)
            result = 'fileno=%s ' % fileno
        else:
            result = ''
        try:
            if isinstance(self.address, tuple) and len(self.address) == 2:
                result += 'address=%s:%s' % self.address
            else:
                result += 'address=%s' % (self.address, )
        except Exception:
            ex = sys.exc_info()[1]
            result += str(ex) or '<error>'
        try:
            handle = getfuncname(self.__dict__['handle'])
        except Exception:
            handle = None
        if handle is not None:
            result += ' handle=' + handle
        return result

    @property
    def server_host(self):
        """IP address that the server is bound to (string)."""
        if isinstance(self.address, tuple):
            return self.address[0]

    @property
    def server_port(self):
        """Port that the server is bound to (an integer)."""
        if isinstance(self.address, tuple):
            return self.address[1]

    def init_socket(self):
        """If the user initialized the server with an address rather than socket,
        then this function will create a socket, bind it and put it into listening mode.

        It is not supposed to be called by the user, it is called by :meth:`start` before starting
        the accept loop."""
        pass

    @property
    def started(self):
        return not self._stop_event.is_set()

    def start(self):
        """Start accepting the connections.

        If an address was provided in the constructor, then also create a socket,
        bind it and put it into the listening mode.
        """
        self.init_socket()
        self._stop_event.clear()
        try:
            self.start_accepting()
        except:
            self.kill()
            raise

    def close(self):
        """Close the listener socket and stop accepting."""
        self._stop_event.set()
        try:
            self.stop_accepting()
        finally:
            try:
                self.socket.close()
            except Exception:
                pass
            finally:
                self.__dict__.pop('socket', None)
                self.__dict__.pop('handle', None)
                self.__dict__.pop('_handle', None)
                self.__dict__.pop('_spawn', None)
                self.__dict__.pop('full', None)
                if self.pool is not None:
                    self.pool._semaphore.unlink(
                        self._start_accepting_if_started)

    def stop(self, timeout=None):
        """Stop accepting the connections and close the listening socket.

        If the server uses a pool to spawn the requests, then :meth:`stop` also waits
        for all the handlers to exit. If there are still handlers executing after *timeout*
        has expired (default 1 second), then the currently running handlers in the pool are killed."""
        self.close()
        if timeout is None:
            timeout = self.stop_timeout
        if self.pool:
            self.pool.join(timeout=timeout)
            self.pool.kill(block=True, timeout=1)

    def serve_forever(self, stop_timeout=None):
        """Start the server if it hasn't been already started and wait until it's stopped."""
        # add test that serve_forever exists on stop()
        if not self.started:
            self.start()
        try:
            self._stop_event.wait()
        finally:
            gevent.spawn(self.stop, timeout=stop_timeout).join()

    def is_fatal_error(self, ex):
        return isinstance(ex, _socket.error) and ex[0] in self.fatal_errors
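As the docstring above explains, passing a gevent Pool as *spawn* caps the number of concurrent handlers; a minimal sketch with an assumed echo handler:

from gevent.pool import Pool
from gevent.server import StreamServer


def echo(socket, address):
    # Echo a single read back to the client, then close the connection.
    data = socket.recv(1024)
    if data:
        socket.sendall(data)
    socket.close()


pool = Pool(100)  # at most 100 connections are handled at once
server = StreamServer(('127.0.0.1', 6000), echo, spawn=pool)
server.serve_forever()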
Ejemplo n.º 51
0
def __init__(self):
    # Create the MongoDB pool object
    self.mongo_pool = MongoPool()
    # Create the coroutine pool in __init__
    self.coroutine_pool = Pool()
Ejemplo n.º 52
0
def __init__(self, host='127.0.0.1', port=31337, max_clients=64):
    self._pool = Pool(max_clients)
    self._server = StreamServer((host, port), self.connection_handler, spawn=self._pool)
    self._protocol = ProtocolHandler()
    self._kv = {}
    self._commands = self.get_commands()
Ejemplo n.º 53
0
"""
    Coroutine pool
"""
from gevent import monkey
# Apply the monkey patch so the program switches greenlets automatically during blocking calls such as sleep and socket I/O
monkey.patch_all()
from gevent.pool import Pool
import time

# Create the coroutine pool object
p = Pool()


# Define a function that performs the task
def func(msg):
    for i in range(0, 10):
        print(msg)
        time.sleep(1)
        print(i)
        # time.sleep(1)


# Run asynchronous tasks on the coroutine pool
for i in range(10):
    p.apply_async(func, (f"coroutine-{i}", ))

# Join the coroutine tasks so the main thread waits for them to finish
p.join()
Ejemplo n.º 54
0
import argparse
import csv
import math
from operator import itemgetter

import gevent.monkey
from closeio_api import Client as CloseIO_API
from gevent.pool import Pool

gevent.monkey.patch_all()

pool = Pool(7)

parser = argparse.ArgumentParser(
    description=
    'Find duplicate contacts on a lead in your Close org via contact_name, email address, or phone number'
)
parser.add_argument('--api-key', '-k', required=True, help='API Key')
parser.add_argument(
    '--field',
    '-f',
    default='all',
    choices=['contact_name', 'email', 'phone', 'all'],
    required=False,
    help="Specify a field to compare uniqueness",
)
args = parser.parse_args()

# Initialize Close API Wrapper
api = CloseIO_API(args.api_key)
org_name = api.get('me')['organizations'][0]['name'].replace('/', '')
Ejemplo n.º 55
0
def concurrency(urls):
    ''' Open all the greenlet threads '''
    in_parallel = 100
    pool = Pool(in_parallel)
    jobs = [pool.spawn(action, url) for url in urls]
    return joinall(jobs)
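The concurrency() helper above assumes an action callable and a joinall import that are not shown in the excerpt; a hypothetical pairing (the requests-based action is an assumption):

from gevent import monkey
monkey.patch_all()  # patch sockets before requests is imported

import requests
from gevent import joinall
from gevent.pool import Pool


def action(url):
    # Hypothetical worker: fetch the URL and return its status code.
    return requests.get(url, timeout=10).status_code


pool = Pool(100)  # at most 100 requests in flight
jobs = [pool.spawn(action, url) for url in ['https://example.com'] * 5]
joinall(jobs)
print([job.value for job in jobs])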
Ejemplo n.º 56
0
class RunSpider(object):
    def __init__(self):
        # Create the MongoDB pool object
        self.mongo_pool = MongoPool()
        # Create the coroutine pool in __init__
        self.coroutine_pool = Pool()

    def get_spider_from_setting(self):
        """根据配置文件信息,获取爬虫对象列表"""
        #遍历配置文件中爬虫信息,获取每隔爬虫的全类名
        for full_class_name in PROXIES_SPIDERS:
            # Split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # Import the module by name
            module = importlib.import_module(module_name)
            # Get the class from the module by name
            cls = getattr(module, class_name)
            # Instantiate the spider
            spider = cls()

            yield spider

    def run(self):
        # 2.1 Get the spider objects from the settings
        spiders = self.get_spider_from_setting()
        for spider in spiders:
            # 2.5 Handle exceptions so an error inside one spider does not affect the others
            # 3.3 Run this method asynchronously
            self.coroutine_pool.apply_async(self.__execute_one_spider_tack,
                                            args=(spider, ))
        # Join the pool so the current thread waits for the tasks to finish
        self.coroutine_pool.join()

    def __execute_one_spider_tack(self, spider):
        # 3.2 The code for handling one proxy spider, extracted into its own method
        # Handles a single spider task
        try:
            # Iterate over the spider's get_proxies() to collect proxy IPs
            for proxy in spider.get_proxies():
                # Check the proxy IP
                proxy = check_proxy(proxy)
                # 2.4 If the proxy works, write it to the database
                # If speed is not -1, the proxy is usable
                if proxy.speed != -1:
                    # Write to the database
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """
        Use the schedule module to run a crawl task at a fixed interval.
        Define a start classmethod:
        create an instance of this class and call its run method,
        then use schedule to run the object's run method periodically.

        :return:
        """
        rs = RunSpider()
        rs.run()
        schedule.every(RUN_SPIDERS_INTERVAL).hour.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
Ejemplo n.º 57
0
class AsyncFormProcessor(object):
    def __init__(self, statedb, migrate_form):
        self.statedb = statedb
        self.migrate_form = migrate_form

    def __enter__(self):
        self.pool = Pool(POOL_SIZE)
        self.queues = PartiallyLockingQueue()
        self.retry = RetryForms(self._try_to_process_form)
        with self.statedb.pop_resume_state(type(self).__name__,
                                           []) as form_ids:
            self._rebuild_queues(form_ids)
        self.stop_status_logger = run_status_logger(
            log_status,
            self.get_status,
            status_interval=1800,  # 30 minutes
        )
        try:
            self._try_to_empty_queues()
        except Exception as err:
            self.__exit__(type(err), err, None)
            raise
        return self

    def __exit__(self, exc_type, exc, exc_tb):
        queue_ids = self.queues.queue_ids + self.retry.form_ids
        try:
            if exc_type is None:
                queue_ids = self._finish_processing_queues()
            else:
                # stop workers -> reduce chaos in logs
                self.pool.kill()
                self.retry.kill()
        finally:
            key = type(self).__name__
            self.statedb.set_resume_state(key, queue_ids)
            log.info("saved %s state (%s ids)", key, len(queue_ids))
            self.stop_status_logger()
            self.queues = self.pool = None

    def _rebuild_queues(self, form_ids):
        for chunk in chunked(form_ids, 100, list):
            for form in FormAccessorCouch.get_forms(chunk):
                self._try_to_process_form(form)

    def process_xform(self, doc):
        """Process XFormInstance document asynchronously"""
        form_id = doc["_id"]
        log.debug('Processing doc: XFormInstance(%s)', form_id)
        if doc.get('problem'):
            if str(doc['problem']).startswith(PROBLEM_TEMPLATE_START):
                doc = _fix_replacement_form_problem_in_couch(doc)
            else:
                self.statedb.add_problem_form(form_id)
                return
        try:
            wrapped_form = XFormInstance.wrap(doc)
        except Exception:
            log.exception("Error migrating form %s", form_id)
            self.statedb.save_form_diffs(doc, {})
        else:
            self._try_to_process_form(wrapped_form)
            self._try_to_empty_queues()

    def _try_to_process_form(self, wrapped_form, retries=0):
        try:
            case_ids = get_case_ids(wrapped_form)
        except Exception as err:
            self.retry.later(wrapped_form, retries + 1, err)
            return
        if self.queues.try_obj(case_ids, wrapped_form):
            self.pool.spawn(self._async_migrate_form, wrapped_form, case_ids)

    @exit_on_error
    def _async_migrate_form(self, wrapped_form, case_ids):
        self.migrate_form(wrapped_form, case_ids)
        self.queues.release_lock(wrapped_form)

    def _try_to_empty_queues(self):
        """Process forms waiting in the queue

        All items in the queue will be processed if the queue becomes
        full. This is done to ensure that no items become perpetually
        stuck in the queue. This may be masking a bug in this class or
        `PartiallyLockingQueue` since the theory of operation should
        prevent starvation. In any case draining the queue periodically
        is a good thing since there is a negative correlation between
        the number of items in the queue and `queue.pop()` performance.
        """
        queue = self.queues
        was_full = queue.full
        while True:
            form, case_ids = queue.pop()
            if form is not None:
                self.pool.spawn(self._async_migrate_form, form, case_ids)
            elif was_full and queue:
                assert queue.processing, "deadlock!"
                wait_for_one_task_to_complete(self.pool)
            else:
                break
        if self.pool:
            gevent.sleep()  # swap greenlets

    def _finish_processing_queues(self):
        update_interval = timedelta(seconds=10)
        next_check = datetime.now()
        pool = self.pool
        while self.queues:
            wrapped_form, case_ids = self.queues.pop()
            if wrapped_form:
                pool.spawn(self._async_migrate_form, wrapped_form, case_ids)
            else:
                gevent.sleep()  # swap greenlets

            now = datetime.now()
            if now > next_check:
                log.info('Waiting on %s docs', len(self.queues) + len(pool))
                next_check += update_interval

        self.retry.join()
        while not pool.join(timeout=10):
            log.info('Waiting on {} docs'.format(len(pool)))

        unprocessed = self.queues.queue_ids + self.retry.form_ids
        if unprocessed:
            log.error("Unprocessed forms (unexpected): %s", unprocessed)
        return unprocessed

    def get_status(self):
        status = self.queues.get_status()
        status["retry"] = len(self.retry)
        return status
Ejemplo n.º 58
0
class BaseServer(object):
    """
    An abstract base class that implements some common functionality for the servers in gevent.

    :param listener: Either be an address that the server should bind
        on or a :class:`gevent.socket.socket` instance that is already
        bound (and put into listening mode in case of TCP socket).

    :keyword handle: If given, the request handler. The request
        handler can be defined in a few ways. Most commonly,
        subclasses will implement a ``handle`` method as an
        instance method. Alternatively, a function can be passed
        as the ``handle`` argument to the constructor. In either
        case, the handler can later be changed by calling
        :meth:`set_handle`.

        When the request handler returns, the socket used for the
        request will be closed.

    :keyword spawn: If provided, is called to create a new
        greenlet to run the handler. By default,
        :func:`gevent.spawn` is used (meaning there is no
        artificial limit on the number of concurrent requests). Possible values for *spawn*:

        - a :class:`gevent.pool.Pool` instance -- ``handle`` will be executed
          using :meth:`gevent.pool.Pool.spawn` only if the pool is not full.
          While it is full, no new connections are accepted;
        - :func:`gevent.spawn_raw` -- ``handle`` will be executed in a raw
          greenlet which has a little less overhead than :class:`gevent.Greenlet` instances spawned by default;
        - ``None`` -- ``handle`` will be executed right away, in the :class:`Hub` greenlet.
          ``handle`` cannot use any blocking functions as it would mean switching to the :class:`Hub`.
        - an integer -- a shortcut for ``gevent.pool.Pool(integer)``

    .. versionchanged:: 1.1a1
       When the *handle* function returns from processing a connection,
       the client socket will be closed. This resolves the non-deterministic
       closing of the socket, fixing ResourceWarnings under Python 3 and PyPy.

    """
    # pylint: disable=too-many-instance-attributes,bare-except,broad-except

    #: the number of seconds to sleep in case there was an error in accept() call
    #: for consecutive errors the delay will double until it reaches max_delay
    #: when accept() finally succeeds the delay will be reset to min_delay again
    min_delay = 0.01
    max_delay = 1

    #: Sets the maximum number of consecutive accepts that a process may perform on
    #: a single wake up. High values give higher priority to high connection rates,
    #: while lower values give higher priority to already established connections.
    #: Default is 100. Note, that in case of multiple working processes on the same
    #: listening value, it should be set to a lower value. (pywsgi.WSGIServer sets it
    #: to 1 when environ["wsgi.multiprocess"] is true)
    max_accept = 100

    _spawn = Greenlet.spawn

    #: the default timeout that we wait for the client connections to close in stop()
    stop_timeout = 1

    fatal_errors = (errno.EBADF, errno.EINVAL, errno.ENOTSOCK)

    def __init__(self, listener, handle=None, spawn='default'):
        self._stop_event = Event()
        self._stop_event.set()
        self._watcher = None
        self._timer = None
        self._handle = None
        # XXX: FIXME: Subclasses rely on the presence or absence of the
        # `socket` attribute to determine whether we are open/should be opened.
        # Instead, have it be None.
        self.pool = None
        try:
            self.set_listener(listener)
            self.set_spawn(spawn)
            self.set_handle(handle)
            self.delay = self.min_delay
            self.loop = get_hub().loop
            if self.max_accept < 1:
                raise ValueError('max_accept must be positive int: %r' %
                                 (self.max_accept, ))
        except:
            self.close()
            raise

    def set_listener(self, listener):
        if hasattr(listener, 'accept'):
            if hasattr(listener, 'do_handshake'):
                raise TypeError(
                    'Expected a regular socket, not SSLSocket: %r' %
                    (listener, ))
            self.family = listener.family
            self.address = listener.getsockname()
            self.socket = listener
        else:
            self.family, self.address = parse_address(listener)

    def set_spawn(self, spawn):
        if spawn == 'default':
            self.pool = None
            self._spawn = self._spawn
        elif hasattr(spawn, 'spawn'):
            self.pool = spawn
            self._spawn = spawn.spawn
        elif isinstance(spawn, integer_types):
            from gevent.pool import Pool
            self.pool = Pool(spawn)
            self._spawn = self.pool.spawn
        else:
            self.pool = None
            self._spawn = spawn
        if hasattr(self.pool, 'full'):
            self.full = self.pool.full
        if self.pool is not None:
            self.pool._semaphore.rawlink(self._start_accepting_if_started)

    def set_handle(self, handle):
        if handle is not None:
            self.handle = handle
        if hasattr(self, 'handle'):
            self._handle = self.handle
        else:
            raise TypeError("'handle' must be provided")

    def _start_accepting_if_started(self, _event=None):
        if self.started:
            self.start_accepting()

    def start_accepting(self):
        if self._watcher is None:
            # just stop watcher without creating a new one?
            self._watcher = self.loop.io(self.socket.fileno(), 1)
            self._watcher.start(self._do_read)

    def stop_accepting(self):
        if self._watcher is not None:
            self._watcher.stop()
            self._watcher = None
        if self._timer is not None:
            self._timer.stop()
            self._timer = None

    def do_handle(self, *args):
        spawn = self._spawn
        handle = self._handle
        close = self.do_close

        try:
            if spawn is None:
                _handle_and_close_when_done(handle, close, args)
            else:
                spawn(_handle_and_close_when_done, handle, close, args)
        except:
            close(*args)
            raise

    def do_close(self, *args):
        pass

    def do_read(self):
        raise NotImplementedError()

    def _do_read(self):
        for _ in xrange(self.max_accept):
            if self.full():
                self.stop_accepting()
                return
            try:
                args = self.do_read()
                self.delay = self.min_delay
                if not args:
                    return
            except:
                self.loop.handle_error(self, *sys.exc_info())
                ex = sys.exc_info()[1]
                if self.is_fatal_error(ex):
                    self.close()
                    sys.stderr.write('ERROR: %s failed with %s\n' %
                                     (self, str(ex) or repr(ex)))
                    return
                if self.delay >= 0:
                    self.stop_accepting()
                    self._timer = self.loop.timer(self.delay)
                    self._timer.start(self._start_accepting_if_started)
                    self.delay = min(self.max_delay, self.delay * 2)
                break
            else:
                try:
                    self.do_handle(*args)
                except:
                    self.loop.handle_error((args[1:], self), *sys.exc_info())
                    if self.delay >= 0:
                        self.stop_accepting()
                        self._timer = self.loop.timer(self.delay)
                        self._timer.start(self._start_accepting_if_started)
                        self.delay = min(self.max_delay, self.delay * 2)
                    break

    def full(self):
        # copied from self.pool
        # pylint: disable=method-hidden
        return False

    def __repr__(self):
        return '<%s at %s %s>' % (type(self).__name__, hex(
            id(self)), self._formatinfo())

    def __str__(self):
        return '<%s %s>' % (type(self).__name__, self._formatinfo())

    def _formatinfo(self):
        if hasattr(self, 'socket'):
            try:
                fileno = self.socket.fileno()
            except Exception as ex:
                fileno = str(ex)
            result = 'fileno=%s ' % fileno
        else:
            result = ''
        try:
            if isinstance(self.address, tuple) and len(self.address) == 2:
                result += 'address=%s:%s' % self.address
            else:
                result += 'address=%s' % (self.address, )
        except Exception as ex:
            result += str(ex) or '<error>'

        handle = self.__dict__.get('handle')
        if handle is not None:
            fself = getattr(handle, '__self__', None)
            try:
                if fself is self:
                    # Checks the __self__ of the handle in case it is a bound
                    # method of self to prevent recursively defined reprs.
                    handle_repr = '<bound method %s.%s of self>' % (
                        self.__class__.__name__,
                        handle.__name__,
                    )
                else:
                    handle_repr = repr(handle)

                result += ' handle=' + handle_repr
            except Exception as ex:
                result += str(ex) or '<error>'

        return result

    @property
    def server_host(self):
        """IP address that the server is bound to (string)."""
        if isinstance(self.address, tuple):
            return self.address[0]

    @property
    def server_port(self):
        """Port that the server is bound to (an integer)."""
        if isinstance(self.address, tuple):
            return self.address[1]

    def init_socket(self):
        """If the user initialized the server with an address rather than socket,
        then this function will create a socket, bind it and put it into listening mode.

        It is not supposed to be called by the user, it is called by :meth:`start` before starting
        the accept loop."""
        pass

    @property
    def started(self):
        return not self._stop_event.is_set()

    def start(self):
        """Start accepting the connections.

        If an address was provided in the constructor, then also create a socket,
        bind it and put it into the listening mode.
        """
        self.init_socket()
        self._stop_event.clear()
        try:
            self.start_accepting()
        except:
            self.close()
            raise

    def close(self):
        """Close the listener socket and stop accepting."""
        self._stop_event.set()
        try:
            self.stop_accepting()
        finally:
            try:
                self.socket.close()
            except Exception:
                pass
            finally:
                self.__dict__.pop('socket', None)
                self.__dict__.pop('handle', None)
                self.__dict__.pop('_handle', None)
                self.__dict__.pop('_spawn', None)
                self.__dict__.pop('full', None)
                if self.pool is not None:
                    self.pool._semaphore.unlink(
                        self._start_accepting_if_started)

    @property
    def closed(self):
        return not hasattr(self, 'socket')

    def stop(self, timeout=None):
        """
        Stop accepting the connections and close the listening socket.

        If the server uses a pool to spawn the requests, then
        :meth:`stop` also waits for all the handlers to exit. If there
        are still handlers executing after *timeout* has expired
        (default 1 second, :attr:`stop_timeout`), then the currently
        running handlers in the pool are killed.

        If the server does not use a pool, then this merely stops accepting connections;
        any spawned greenlets that are handling requests continue running until
        they naturally complete.
        """
        self.close()
        if timeout is None:
            timeout = self.stop_timeout
        if self.pool:
            self.pool.join(timeout=timeout)
            self.pool.kill(block=True, timeout=1)

    def serve_forever(self, stop_timeout=None):
        """Start the server if it hasn't been already started and wait until it's stopped."""
        # add test that serve_forever exists on stop()
        if not self.started:
            self.start()
        try:
            self._stop_event.wait()
        finally:
            Greenlet.spawn(self.stop, timeout=stop_timeout).join()

    def is_fatal_error(self, ex):
        return isinstance(ex,
                          _socket.error) and ex.args[0] in self.fatal_errors
Ejemplo n.º 59
0
def test_proxies(proxies,
                 timeout=10,
                 single_url=None,
                 many_urls=None,
                 call_back=None):
    """
    Test proxies, or process html source using callback in the meantime.

    :type proxies: list
    :param proxies:  proxies
    :param timeout: response timeout
    :param single_url: The URL for testing
    :param many_urls: The list of URLs for testing. One of them is picked at random for each request.
    :param call_back: Process the html source if status code is 200. callback(url, source)
    :return:
    """

    proxies = set(proxies)
    errors = set()
    pool = Pool(100)

    def test(proxy):
        code = None
        url = random.choice(many_urls) if many_urls is not None else single_url

        start_time = time.time()
        try:
            with gevent.Timeout(seconds=timeout,
                                exception=Exception('[Connection Timeout]')):
                _headers['User-Agent'] = random.choice(_user_agents)

                res = requests.get(url,
                                   proxies={
                                       'http':
                                       'http://{}'.format(proxy.strip()),
                                       'https':
                                       'https://{}'.format(proxy.strip())
                                   },
                                   headers=_headers)
                code = res.status_code
                source = res.text

            _log('[Proxy: {:d} {:s}]'.format(code, proxy))

            # Callback
            if source is not None and call_back is not None and code == 200:
                call_back(url, source)

            if code != 200:
                errors.add(proxy)

        except Exception as e:
            # log(e.args)
            errors.add(proxy)

        end_time = time.time()
        escaped = end_time - start_time if code else None

        store_in_db(proxy, escaped=escaped, status_code=code)  # store in db

    for proxy in proxies:
        pool.spawn(test, proxy)
    pool.join()

    proxies = proxies - errors
    _log('[HTTP Proxies] Available:{:d} Deprecated:{:d}'.format(
        len(proxies), len(errors)))

    return list(proxies)
Ejemplo n.º 60
0
def __init__(self):
    self.pool = Pool(1000)
    self.pool.start()