Example #1
    def test_rate_limit(self):
        from tweetf0rm.proxies import proxy_checker

        proxy_list = proxy_checker(self.proxies['proxies'])

        ps = []
        for i, twitter_user in enumerate(self.config['apikeys']):
            apikeys = self.config['apikeys'][twitter_user]

            # hard-coded proxy for this test; swap in proxy_list[i]['proxy_dict']
            # to route each process through one of the checked live proxies
            client_args = {
                "timeout": 300,
                "proxies": {'http': '203.156.207.249:8080'}
            }
            logger.info(client_args)

            # one worker process per API key
            p = mp.Process(target=call_user_api, args=(apikeys, client_args))
            ps.append(p)
            p.start()

        for p in ps:
            p.join()
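
call_user_api is not shown in this fragment. A minimal sketch of what such a worker might look like, assuming tweetf0rm's Twython backend and assuming the apikeys dict carries the four OAuth1 credentials under these names (both are assumptions, not the project's actual code):

def call_user_api(apikeys, client_args):
    # hypothetical worker: build a Twython client with the per-process
    # client_args (timeout, proxies) and poll the rate-limit endpoint
    from twython import Twython

    client = Twython(apikeys['app_key'], apikeys['app_secret'],
                     apikeys['oauth_token'], apikeys['oauth_token_secret'],
                     client_args=client_args)
    print(client.get_application_rate_limit_status())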
Example #2
    def __init__(self, node_id, config={}, proxies=[]):
        self.node_id = node_id
        self.config = config
        if proxies:
            self.proxy_list = proxy_checker(proxies)

            logger.info("number of live proxies: %d" % (len(self.proxy_list)))

            # each process gets one apikey; if there are more proxies than
            # apikeys, a process can receive several proxies and rotate to the
            # next one when the current proxy fails
            number_of_processes = min(len(self.config['apikeys']),
                                      len(self.proxy_list))

            # split the live proxies into one group per process; on a proxy
            # failure the process restarts itself and tries the next proxy in
            # its group
            self.proxy_generator = self.split(self.proxy_list,
                                              number_of_processes)

        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        # list() so the keys can be indexed (dict.keys() is a view in Python 3)
        apikey_list = list(self.config['apikeys'].keys())

        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.node_id,
                                 self.config['apikeys'][apikey_list[idx]],
                                 config)
            except Exception as exc:
                logger.error(exc)

        self.node_coordinator = NodeCoordinator(config['redis_config'])
        self.node_coordinator.add_node(node_id)

        logger.info("number of crawlers created: %d" % (number_of_processes))
Example #3
	def test_proxy(self):
		proxies = proxy_checker(self.proxies['proxies'])
		logger.info('%d good proxies left' % len(proxies))
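
None of the fragments show proxy_checker's data shapes directly, but they can be inferred from how it is called: the input is a list of single-entry {ip_port: type} dicts (built in Example #4 below), and each returned record carries at least a 'proxy' key holding the original entry and a 'proxy_dict' key holding a requests-style proxies mapping (per the comment in Example #1). A sketch of that inferred contract, with an illustrative proxy-type string:

from tweetf0rm.proxies import proxy_checker

candidates = [{'203.156.207.249:8080': 'HTTP'}]
live = proxy_checker(candidates)
for p in live:
    print(p['proxy'])       # the original {ip_port: type} entry
    print(p['proxy_dict'])  # e.g. {'http': 'http://203.156.207.249:8080'}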
Example #4
    urls = re.findall(r'<a href=\'(/en/http-proxy-list/\d+/.*?)\'>', html)

    urls = set(urls)

    urls.add('/en/http-proxy-list/')

    proxies = []
    for url in urls:
        proxies.extend(crawl_spys_ru(url))

    # if a proxies.json already exists locally, merge its entries into the
    # check results rather than overwriting them
    if os.path.exists(os.path.abspath(args.output)):
        with open(os.path.abspath(args.output), 'r') as proxy_f:
            proxies.extend(json.load(proxy_f)['proxies'])

    # de-duplicate by IP; each proxy is a single-entry {ip: type} dict
    ips = set()
    proxy_list = []
    for proxy in proxies:
        ip, proxy_type = next(iter(proxy.items()))

        if ip not in ips:
            ips.add(ip)
            proxy_list.append({ip: proxy_type})

    proxies = [p['proxy'] for p in proxy_checker(proxy_list)]

    logger.info("number of proxies that are still alive: %d" % len(proxies))
    with open(os.path.abspath(args.output), 'w') as proxy_f:
        json.dump({'proxies': proxies}, proxy_f)
Example #5
	for i in range(5):
		proxies.extend(crawl_spys_ru(i))

	# if a proxies.json already exists locally, merge its entries into the
	# check results rather than overwriting them
	if os.path.exists(os.path.abspath(args.output)):
		with open(os.path.abspath(args.output), 'r') as proxy_f:
			proxies.extend(json.load(proxy_f)['proxies'])

	# de-duplicate by IP; each proxy is a single-entry {ip: type} dict
	ips = set()
	proxy_list = []
	for proxy in proxies:
		ip, proxy_type = next(iter(proxy.items()))

		if ip not in ips:
			ips.add(ip)
			proxy_list.append({ip: proxy_type})

	proxies = [p['proxy'] for p in proxy_checker(proxy_list)]

	logger.info("number of proxies that are still alive: %d" % len(proxies))
	with open(os.path.abspath(args.output), 'w') as proxy_f:
		json.dump({'proxies': proxies}, proxy_f)
Example #6
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s-[%(asctime)s][%(module)s][%(funcName)s][%(lineno)d]: %(message)s'
)
logger = logging.getLogger(__name__)  # used below; not defined in the original fragment
requests_log = logging.getLogger("requests")
requests_log.setLevel(logging.WARNING)

import argparse, os, json, sys
sys.path.append("..")

from tweetf0rm.proxies import proxy_checker

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--proxies',
                        help="location of the proxies.json file (read, re-checked, and rewritten in place)",
                        default="proxies.json")
    args = parser.parse_args()

    # re-check the saved proxies and keep only the ones that still respond
    with open(os.path.abspath(args.proxies), 'r') as proxy_f:
        proxies = json.load(proxy_f)['proxies']

    proxies = [proxy['proxy'] for proxy in proxy_checker(proxies)]

    logger.info('%d live proxies left' % (len(proxies)))

    with open(os.path.abspath(args.proxies), 'w') as proxy_f:
        json.dump({'proxies': proxies}, proxy_f)
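
Putting Examples #4 and #6 together, the proxies.json file these scripts read and rewrite would look roughly like this; the structure is inferred from the code, and the proxy-type string is illustrative:

{
    "proxies": [
        {"203.156.207.249:8080": "HTTP"}
    ]
}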