Example #1
 def execute_all_nodes(self, func, nodes):
     """
     execute func on all the nodes
     """
     g = Group()
     g.map(func, nodes)
     g.join()
Example #2
def sample_no_result():
    group = Group()

    def hello_from(n):
        print 'Size of group %s' % len(group)
        print 'Hello %s from Greenlet %s' % (n, id(getcurrent()))

    group.map(hello_from, xrange(3))
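The example above calls group.map purely for its side effects; Group.map still collects each call's return value (None here) and blocks until every greenlet has finished. A minimal sketch of the same idea with the return values captured (written for Python 3; the names are illustrative):

from gevent import getcurrent
from gevent.pool import Group

group = Group()

def hello_from(n):
    # each call runs in its own greenlet and returns nothing
    print('Hello %s from Greenlet %s' % (n, id(getcurrent())))

results = group.map(hello_from, range(3))
print(results)  # [None, None, None] -- map waits for all greenlets and keeps input order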
Example #3
File: app.py Project: kyeah/gspot
def main():

    g = login_google()
    s = login_spotify()

    # Filter playlists by config and last sync
    g.playlists = [p for p in g.playlists
                   if (not config.playlists or 
                       p['name'] in config.playlists)
                   and p['name'] not in config.exclude
                   and float(p['lastModifiedTimestamp']) > float(config.since)]

    # Transfer playlists
    tasks = [(g, s, playlist) for playlist in g.playlists]
    pool = Group()
    pool.map(lambda args: transfer_playlist(*args), tasks)
Example #4
def get_conns(cred, providers):
    """Collect node data asynchronously using gevent lib."""
    cld_svc_map = {
        "aws": conn_aws,
        "azure": conn_az,
        "gcp": conn_gcp,
        "alicloud": conn_ali
    }
    sys.stdout.write("\rEstablishing Connections:  ")
    sys.stdout.flush()
    busy_obj = busy_disp_on()
    conn_fn = [[cld_svc_map[x.rstrip('1234567890')], cred[x], x]
               for x in providers]
    cgroup = Group()
    conn_res = []
    conn_res = cgroup.map(get_conn, conn_fn)
    cgroup.join()
    conn_objs = {}
    for item in conn_res:
        conn_objs.update(item)
    busy_disp_off(dobj=busy_obj)
    sys.stdout.write("\r                                                 \r")
    sys.stdout.write("\033[?25h")  # cursor back on
    sys.stdout.flush()
    return conn_objs
Example #5
    def execute_all_nodes(self, func, nodes=None):
        """
        execute func on all the nodes

        if nodes is None, func is executed on all the nodes that play a role in the minio
        deployment; if nodes is not None, it needs to be an iterable of node objects

        :param func: function to execute, func needs to accept one argument, a node object
        :type func: function
        :param nodes: list of nodes on which to execute func, defaults to None
        :type nodes: iterable, optional
        """

        if nodes is None:
            nodes = set([self.vm_node, self.vm_host])
            nodes.update(self.zerodb_nodes)

        g = Group()
        g.map(func, nodes)
        g.join()
Example #6
    def getStories(self):
        """
        return a list of story dicts
        """

        logger.debug('getStories: about to get {0} stories'.format(len(self.storyids)))

        group = Group()
        getstory = lambda storyid: firebase.get('/v0/item', storyid)

        stories = group.map(getstory, self.storyids)

        return stories
Example #7
def group_parallelism():
    group = Group()

    def hello_from(n):
        print("Size of group %s" % len(group))
        print("Hello from Greenlet %s" % id(getcurrent()))

    group.map(hello_from, range(3))

    def intensive(n):
        gevent.sleep(3 - n)
        return "task", n

    print("Ordered")

    ogroup = Group()
    for i in ogroup.imap(intensive, range(3)):
        print(i)

    print("Unordered")

    igroup = Group()
    for i in igroup.imap_unordered(intensive, range(3)):
        print(i)
Example #8
    def test_parallel_queue_declare(self):

        conn = Connection(self.amqp_url)
        conn.connect()

        channel = conn.allocate_channel()

        def declare(name):
            return channel.queue_declare(queue=name)

        g = Group()
        res = g.map(declare, queues)

        assert len(res) == len(queues)
        assert all(isinstance(r, FrameQueueDeclareOk) for r in res)
Example #9
def get_page_info(queue, page_url_infos):
    try:
        product_urls = []
        page_url = page_url_infos.values()[0]
        category = page_url_infos.keys()[0]
        res = s.get(page_url, headers=header_get)
        if res.status_code == 200:
            page_source = res.text.encode('utf-8')
        else:
            logger.error('page_url:{} get status error:{}'.format(page_url, res.status_code))
            raise FlipkartException('get status code:{} error'.format(res.status_code))
        pattern = re.compile(r'window.__INITIAL_STATE__ [\s\S]+?</script>', re.S)
        init_state = pattern.findall(page_source)
        source_info = init_state[0]
        # if source_info.find('"serverErrorMessage":"Please try again later"') != -1:
        #     raise Flipkart500Exception
        source_info = source_info[26:-11]
        source_info = json.loads(source_info)
        data_infos = source_info['pageDataV4']['page']
        if data_infos['asyncStatus'] != 'SUCCESS':
            # logger.error('page_url:{} asyucstatus:{} '.format(page_url, data_infos['asyncStatus']))
            raise Flipkart500Exception
        data_infos = data_infos['data']['10003'][1:-1]
        for data_info in data_infos:
            products = data_info['widget']['data']['products']
            for product in products:
                product_infos = {}
                base_url = product['productInfo']['value']['baseUrl']
                product_url = urljoin(flipkart_url, base_url)
                product_infos[category] = product_url
                product_urls.append(product_infos)
        group = Group()
        page_detail_infos = group.map(get_detail_info, product_urls)
        page_detail_infos = filter(check_null, page_detail_infos)
        if page_detail_infos:
            queue.put(page_detail_infos)
    except Exception, e:
        # if not isinstance(e, Flipkart500Exception):
        # logger.error('get_page_info page_url:{} error:{}'.format(page_url, e))
        #     logger.error("get_page_info page_source:{}".format(page_source))
        raise
Example #10
File: app.py Project: kyeah/gspot
def transfer_playlist(g, s, playlist):
    """ Synchronize Google Music playlist to Spotify """
    
    # Retrieve or create associated Spotify playlist
    name = playlist['name']
    spotlist = s.playlists.get(name, None) \
               or s.user_playlist_create(s.username, name)

    action = "Updating" if name in s.playlists else "Creating"
    log.info("%s playlist '%s'" % (action, name))

    # Find Spotify track IDs for each new song
    tasks = [(g, s, track) for track in playlist['tracks']
             if float(track['creationTimestamp']) > float(config.since)]

    pool = Group()
    results = pool.map(lambda args: find_track_id(*args), tasks)

    track_ids, not_found = [], []
    for (ok, track_info) in results:
        (track_ids if ok else not_found).append(track_info)

    for nf in not_found:
        log.warning("Track not found for '%s': '%s'" % (name, nf))

    # Filter for songs not yet synchronized to Spotify
    spotlist_info = s.user_playlist(s.username, playlist_id=spotlist['id'], fields="tracks,next")

    tracks = spotlist_info['tracks']
    spotlist_tracks = [x['track']['id'] for x in tracks['items']]
    while tracks['next']:
        tracks = s.next(tracks)
        spotlist_tracks += [x['track']['id'] for x in tracks['items']]

    new_ids = [x for x in track_ids if x not in spotlist_tracks]

    # Add new songs!!!
    log.info("Adding %d new tracks to '%s'!!!!!!" % (len(new_ids), name))
    for group in chunker(new_ids, 100):
        s.user_playlist_add_tracks(s.username, spotlist['id'], group)
Example #11
def main(url):
    page = fetch(url)
    if not url.startswith('http'):
        url = 'http://' + url

    print url

    if not page:
        return

    soup = BeautifulSoup.BeautifulSoup(page)
    #get all hosts
    groupview = get(soup, 'ul', {'class':'groupview'})
    group = groupview[0]
    host = get(group, 'span', {'class': 'host'})
    host_url = []
    for h in host:
        host_url.append(h.contents[0].get('href'))

    print "There are {0} hosts.".format(len(host_url))

    group = Group()
    bad_img = []
    for hu in host_url:
        print "Checking on {0}".format(hu)
        host_page = fetch(os.path.join(url, hu))
        host_page_soup = BeautifulSoup.BeautifulSoup(host_page)
        imgs = get(host_page_soup, 'img', {'class':'i'})
        print "There are {0} urls.".format(len(imgs))

        img_url = [ os.path.join(url, img.get('src').replace('../','')) for img in imgs ]
        print "There are {0} urls will be checked.".format(len(img_url))
        fetch_result = group.map(fetch_gevent, img_url)
        fetch_result = filter(lambda x: x, fetch_result)
        bad_img.extend(fetch_result)

    if bad_img:
        print "The following images do not exits:"
    for bi in bad_img:
        print bi
Example #12
def get_data(conn_objs, providers):
    """Refresh node data using existing connection-objects."""
    cld_svc_map = {
        "aws": nodes_aws,
        "azure": nodes_az,
        "gcp": nodes_gcp,
        "alicloud": nodes_ali
    }
    sys.stdout.write("\rCollecting Info:  ")
    sys.stdout.flush()
    busy_obj = busy_disp_on()
    collec_fn = [[cld_svc_map[x.rstrip('1234567890')], conn_objs[x]]
                 for x in providers]
    ngroup = Group()
    node_list = []
    node_list = ngroup.map(get_nodes, collec_fn)
    ngroup.join()
    busy_disp_off(dobj=busy_obj)
    sys.stdout.write("\r                                                 \r")
    sys.stdout.write("\033[?25h")  # cursor back on
    sys.stdout.flush()
    return node_list
Example #13
"""
    using group.map to get results.
    results from imap come back in submission (add) order; imap_unordered yields them in completion order
    Author: xiangtian.hu
    Date: 2017-8-4
"""
from gevent import getcurrent
from gevent.pool import Group
group = Group()


def hello_from(n):
    print('Size of group %s' % len(group))
    print('Hello from Greenlet %s' % id(getcurrent()))
    return n

# Could use "imap" in place of map; imap returns an iterable
x = group.map(hello_from, range(3))
print(type(x))
print(x)
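The docstring above distinguishes the three variants: map returns a completed list in submission order, imap yields results lazily but still in submission order, and imap_unordered yields them as they finish. A minimal sketch contrasting the two iterator forms (sleep times chosen so the unordered variant visibly reverses the order):

import gevent
from gevent.pool import Group

def work(n):
    gevent.sleep(3 - n)  # later items finish sooner
    return n

print(list(Group().imap(work, range(3))))            # [0, 1, 2] -- submission order
print(list(Group().imap_unordered(work, range(3))))  # [2, 1, 0] -- completion order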
Example #14
class ProxyPool(object):
    """Proxy Pool.
    """
    def __init__(self, configfile='settings.yaml'):
        self.configs     = self._get_configs(configfile)
        self.rdb         = redis.StrictRedis(db=self.configs['RDB'])
        self.db_zproxy   = self.configs['DB_ZPROXY']
        self.pool        = Pool(self.configs['MAX_CONCURRENCY'])
        self.group       = Group()
        self.wrong_value = self.configs['WRONG_VALUE']
        self.init_value  = self.configs['INIT_VALUE']

    def _get_configs(self, configfile):
        """Return the configuration dict"""
        # XXX: getting the path of the configuration file needs improving
        configpath = os.path.join('.', configfile)
        with open(configpath, 'rb') as fp:
            configs = yaml.load(fp)

        return configs

    def get_many(self, num=3, minscore=0, maxscore=None):
        """
        Return a list of at most 'num' proxies
        whose scores are between 'minscore' and 'maxscore'.
        If no proxies match, return an empty list.
        """
        minscore = minscore
        maxscore = maxscore or self.init_value
        res = self.rdb.zrange(self.db_zproxy, minscore, maxscore)
        if res:
            random.shuffle(res) # for getting random results
            if len(res) < num:
                logging.warning("The number of proxies you want is less than %d"
                                % (num,))
            return [proxy for proxy in res[:num]]
        else:
            logging.warning("There're no proxies which scores are between %d and %d"
                            % (minscore, maxscore))
            return []

    def get_one(self, minscore=0, maxscore=None):
        """
        Return one proxy whose score is between 'minscore'
        and 'maxscore'.
        If no proxy matches, return an empty string.
        """
        minscore = minscore
        maxscore = maxscore or self.init_value
        res = self.get_many(num=1, minscore=minscore, maxscore=maxscore)
        if res:
            return res[0]
        else:
            return ''

    def crawl_proxies(self):
        """Get proxies from vairous methods.
        """
        statics = []
        self._crawl_proxies_sites(statics=statics)
        logging.info('Having add %d proxies' % (len(statics),))

    def _crawl_proxies_sites(self, statics=[]):
        """Get proxies from web pages."""
        args = ((url, val['rules'], val['proxies'], statics)
                for url, val in self.configs['PROXY_SITES'].iteritems())
        self.pool.map(self._crawl_proxies_one_site, args)

    def _crawl_proxies_one_site(self, args):
        """Get proxies (ip:port) from url and then write them into redis."""
        url             = args[0]
        rules           = args[1]
        proxies         = args[2]
        headers         = self.configs['HEADERS']
        headers['Host'] = url.split('/', 3)[2]
        logging.info('Begin crawl page %s' % (url,))
        res             = requests.get(url, headers=headers, proxies=proxies)
        encoding        = res.encoding
        html            = etree.HTML(res.content)
        proxies         = []
        len_rules       = len(rules)

        nodes = html.xpath(rules[0])
        if nodes:
            if len_rules == 1:
                for node in nodes:
                    text = str(node.text).encode(encoding).strip()
                    if text:
                        proxies.append('http://%s' % (text,))
            elif len_rules == 2:
                rule_1 = rules[1].split(',')
                for node in nodes:
                    node = node.xpath(rule_1[0])
                    ip = str(node[1].text).encode(encoding).strip()
                    port = str(node[2].text).encode(encoding).strip() or '80'
                    if ip:
                        proxies.append('http://%s:%s' % (ip, port))

        for proxy in proxies:
            logging.info('Get proxy %s from %s' % (proxy, url))
            args[3].append(proxy)
            self.rdb.zadd(self.db_zproxy, self.init_value, proxy)

    def validate_proxies(self):
        """Validate whether the proxies are alive."""
        maxscore       = self.init_value + self.wrong_value
        proxies        = self.rdb.zrange(self.db_zproxy, 0, maxscore)
        statics_errors = []
        args           = [(proxy, statics_errors) for proxy in proxies]
        self.group.map(self._validate_one_proxy, args)

        logging.info('Have validated %d proxies, %d errors happened.'
                     % (len(proxies), len(statics_errors)))

    def _validate_one_proxy(self, args):
        """Validate whether the proxy is still alive."""
        test_sites = self.configs['TEST_SITES']
        proxy      = args[0]
        time_res   = []
        args       = [(site, proxy, time_res, args[1]) for site in test_sites]
        self.group.map(self._test_one_site, args)

        mean_time = sum(time_res) / len(test_sites)
        logging.info('Validating %s, score is %d' % (proxy, mean_time))
        self.rdb.zadd(self.db_zproxy, mean_time, proxy)

    def _test_one_site(self, args):
        url             = args[0]        
        headers         = self.configs['HEADERS']
        headers['Host'] = url.split('/', 3)[2]
        proxy           = args[1]
        proxies         = {'http': proxy,}
        start_time      = time.time()
        timeout         = self.configs['TEST_TIMEOUT']
        error           = False
        with Timeout(timeout, False):
            try:
                res = requests.get(url, headers=headers, proxies=proxies)
            except Exception as e:
                error = True
                logging.error('%s: Error: %s' % (proxy, e.message))

        if error:
            args[2].append(self.configs['WRONG_VALUE'])
            args[3].append(proxy)
        else:
            args[2].append(time.time() - start_time)
Example #15
import gevent
from gevent import getcurrent
from gevent.pool import Group
from random import randint

group = Group()
limit = 10

def hello_from(n):
    print('Size of group %s' % len(group))
    print('Hello from Greenlet %s' % id(getcurrent()))

group.map(hello_from, xrange(limit))

def intensive(n):
    gevent.sleep(randint(1, limit - n))
    return 'task', n

print('Ordered')

ogroup = Group()
for i in ogroup.imap(intensive, xrange(limit)):
    print(i)

print('Unordered')

igroup = Group()
for i in igroup.imap_unordered(intensive, xrange(limit)):
    print(i)
Example #16
class Crawler(object):
    def __init__(self, seed, port='30000', **kwargs):
        self.result = {}
        self.probed = set()
        self.task_lst = Queue()
        self.pool = Group()
        self.conf = kwargs

        addr = seed.split(':')  ### Invalid the seed if conf NOT set
        if len(addr) == 1:  ### if the seed string has no port, use conf['port'] or the default
            addr += [port]
        elif len(addr) > 2:
            ### TODO: Usage
            sys.stderr.write('Invalid seed %s\n' % seed)
            exit(22)

        #self.task_lst.put_nowait(dict(Host=':'.join(addr)))
        sys.stderr.write('Crawler start from %s\n' % (':'.join(addr)))

    def req(self,
            ip,
            port=30003,
            apiId='1',
            apiVer='3.0',
            method='getchordringinfo',
            params={},
            timeout=20,
            **kwargs):
        r = ''
        ret = {}
        d = dict(id=apiId, jsonrpc=apiVer, method=method, params=params)
        try:
            r = requests.post('http://%s:%s' % (ip, port),
                              json=d,
                              headers={'Content-Type': 'application/json'},
                              timeout=timeout)
            ret = json.loads(r.text)
        except Exception as e:
            sys.stderr.write(
                '%s: met Error [%s] when request [%s] from %s:%s resp [%s]\n' %
                (time.strftime('%F %T'), e.message, method, ip, port,
                 r.text if r else ''))
            raise e
        return ret

    def parse(self, resp):
        succ_lst = []
        for vn in resp.get('result', {}).get('Vnodes', []):
            ### new discovery
            succ_lst += [n for n in vn.pop('Successors', []) if n]
            succ_lst += [n for n in vn.pop('Finger', []) if n]
            succ_lst += [vn.pop('Predecessor') or {}]
        return succ_lst

    def info_from_task(self, task):
        ip, port = task.get('Host', '').split(':')
        return task.get('Id', ''), ip, int(port) + 3

    def task_to_node(self, task):
        return task

    def worker(self, timeout=20):
        while True:
            try:
                t = self.task_lst.get(timeout=timeout)
                Id, ip, port = self.info_from_task(t)

                if ip and port and Id not in self.probed:  ### a valid task that has not been crawled yet
                    self.probed.add(Id)  ### mark it as crawled whether it succeeds or fails
                    new_nodes = self.parse(self.req(ip, port, **self.conf))
                    self.result[Id] = self.task_to_node(t)  ### add to the crawl result
                    for n in new_nodes:  ### add new_nodes to task_lst
                        self.task_lst.put_nowait(n)
            except Empty as e:
                sys.stderr.write('%s: worker exit due to err %s\n' %
                                 (time.strftime('%F %T'), type(e)))
                break
            except Exception as e:
                sys.stderr.write('%s: worker req %s met err %s\n' %
                                 (time.strftime('%F %T'), str(t), type(e)))
                # print traceback.format_exc(e) ### stay for debug
                continue

    def debug(self, interval=5):
        while True:
            sys.stderr.write('Crawl results %d\n' % len(self.result))
            gevent.sleep(interval)

    def run(self, timeout=20, thread=1, **kwargs):
        gevent.spawn(self.debug, 5)
        self.pool.map(self.worker, [timeout] * int(thread))
        self.pool.join()
        sys.stderr.write('Total: %d Nodes\n' % len(self.result))
Example #17
import gevent
from gevent.pool import Group
from gevent import getcurrent

group = Group()


def hello_from(n):
    print('size of group:%s' % len(group))
    print('%s' % id(getcurrent()))


group.map(hello_from, range(0, 2))


def intensive(n):
    gevent.sleep(3 - n)
    return 'task', n


print('Ordered')

ogroup = Group()
for i in ogroup.imap(intensive, range(0, 3)):
    print(i)

print('Unordered')

igroup = Group()
for i in igroup.imap_unordered(intensive, range(0, 3)):
    print(i)
Example #18
def async_map(f, iterable):
    group = Group()
    return group.map(f, iterable)
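Since Group.map waits for every greenlet, the async_map wrapper above behaves like a blocking map whose calls run concurrently. A small usage sketch under that assumption (slow_square is a made-up stand-in for real I/O-bound work):

import gevent
from gevent.pool import Group

def async_map(f, iterable):  # same wrapper as the example above
    group = Group()
    return group.map(f, iterable)

def slow_square(n):
    gevent.sleep(0.1)  # simulate I/O so the greenlets overlap
    return n * n

print(async_map(slow_square, range(5)))  # [0, 1, 4, 9, 16], in input order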
Example #19
File: PaaS.py Project: Tefx/meerkat
class PaaS(Platform):
    class CleanAction:
        Null = "none"
        Stop = "stop"
        Terminate = "terminate"

    def __init__(self, requests, clean_action=CleanAction.Stop, **options):
        super().__init__(**options)
        self.VMs = []
        self.requests = requests
        self.clean_action = clean_action
        self.pending_lets = Group()

    def VMs_on_platform(self):
        raise NotImplementedError

    def launch_VMs(self, vm_type, vm_num):
        raise NotImplementedError

    def VM_is_ready(self, vm):
        raise NotImplementedError

    def service_on_VM(self, vm):
        raise NotImplementedError

    def clean_VM(self, vm):
        if self.clean_action != self.CleanAction.Null:
            raise NotImplementedError

    def prepare_VMs(self):
        provisioning_requests = copy(self.requests)
        logger.info("[%s.prepare_VMs]: Requesting %s", self.__class__.__name__,
                    provisioning_requests)
        for vm in self.VMs:
            if provisioning_requests.get(vm.instance_type) > 0:
                provisioning_requests[vm.instance_type] -= 1
            else:
                self.clean_VM(vm)
        logger.info("[%s.prepare_VMs]: Not connected %s",
                    self.__class__.__name__, provisioning_requests)
        for vm in self.VMs_on_platform():
            if vm not in self.VMs and provisioning_requests.get(
                    vm.instance_type) > 0:
                provisioning_requests[vm.instance_type] -= 1
                self.VMs.append(vm)
                if vm.state["Name"] == "stopped":
                    vm.start()
        logger.info("[%s.prepare_VMs]: New launch %s", self.__class__.__name__,
                    provisioning_requests)
        for vm_type, vm_num in provisioning_requests.items():
            if vm_num:
                self.VMs.extend(self.launch_VMs(vm_type, vm_num))
        return self.VMs

    def create_service(self, vm, options):
        while not self.VM_is_ready(vm):
            gevent.sleep(PLATFORM_PAAS_VM_WAIT_INTERVAL)
        service = self.service_on_VM(vm)
        service.set_options(
            dict(retry_ssh=PLATFORM_PAAS_SSH_RETRIES,
                 retry_ssh_interval=PLATFORM_PAAS_SSH_RETRY_INTERVAL), options,
            self.options)
        service.prepare_workers()
        self.services.append(service)

    def prepare_services(self, options):
        for vm in self.prepare_VMs():
            self.pending_lets.spawn(self.create_service, vm, options)

    def clean(self):
        self.pending_lets.join()
        call_on_each(self.services, "clean", join=True)
        self.pending_lets.map(self.clean_VM, self.VMs)
        self.pending_lets.join()
Example #20
 def remove_fileno_resources(self, fileno):
     m = [(getattr(self, n), fileno) for n in store]
     group = Group()
     group.map(clean_dict, m)
     del m[:]
     del m
Example #21
# Groups and pools
# test gevent7  gevent.pool.Group()  01
# ############################################################
import gevent
from gevent import getcurrent
from gevent.pool import Group

group = Group()


def hello_from(n):
    print "Size of group %s" % len(group)
    print "hello from greenlet %s" % id(getcurrent())


group.map(hello_from, xrange(3))


def intensive(n):
    gevent.sleep(3 - n)
    return "task", n


print "Ordered"

ogroup = Group()
for i in ogroup.imap(intensive, xrange(3)):
    print i

print "Unordered"
Example #22
# coding: utf-8
# Last modified: 2014 Jun 25 01:31:21 PM
# xh

import gevent
from gevent import getcurrent
from gevent.pool import Group

group = Group()

def hello_from(n):
    print('Size of group %s' % len(group))
    print('Hello from Greenlet %s' % id(getcurrent()))

group.map(hello_from, xrange(3))

def intensive(n):
    gevent.sleep(3 - n)
    return 'task', n

print('Ordered')

ogroup = Group()
for i in ogroup.imap(intensive, xrange(3)):
    print(i)

print('Unordered')
igroup = Group()
for i in igroup.imap_unordered(intensive, xrange(3)):
    print(i)
Example #23
# encoding=utf8

import gevent
from gevent import getcurrent
from gevent.pool import Group

group = Group()

def hello_from(n):
    print 'Size of group: %s' % len(group)
    print 'Hello from Greenlet %s' % id(getcurrent())

print type(group.map(hello_from, xrange(3)))

def intensive(n):
    gevent.sleep(3 - n)
    return 'task', n

print 'Ordered'

ogroup = Group()
for i in ogroup.imap(intensive, xrange(3)):
    print i

print 'Unordered'

igroup = Group()
for i in igroup.imap_unordered(intensive, xrange(3)):
    print i

print dir(igroup)
Example #24
import gevent
from gevent import getcurrent
from gevent.pool import Group


group = Group()



def hello_from(n):
    print('Size of group', len(group))
    print('Hello from Greenlet %s' % id(getcurrent()))  # get the id of the current greenlet instance



group.map(hello_from, xrange(3))  # map takes a function and an iterable of its arguments



def intensive(n):
    gevent.sleep(3 - n)
    return 'task', n



print('Ordered')



ogroup = Group()
for i in ogroup.imap(intensive, xrange(3)):  # like itertools.imap: returns an iterator that applies the function to values from the input iterable and yields the results; similar to map(), except that it evaluates lazily
    print(i)
Example #27
 def filter_available_proxies(self, proxies):
     _p = Group()
     results = _p.map(self._crawler.test_proxy, proxies)
     return [proxy for result, proxy in zip(results, proxies) if result]
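The filter above relies on Group.map returning results in the same order as its input, which is what lets zip(results, proxies) pair each result with its proxy. A standalone sketch of the same pattern with a dummy check in place of the real test_proxy call (the proxy strings are hypothetical):

from gevent.pool import Group

proxies = ['10.0.0.1:8080', '10.0.0.2:8080', '10.0.0.3:8080']  # made-up values

def check(proxy):
    # stand-in for a real availability test; returns True or False
    return proxy.endswith('.2:8080')

group = Group()
results = group.map(check, proxies)  # results come back in input order
available = [p for ok, p in zip(results, proxies) if ok]
print(available)  # ['10.0.0.2:8080']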