import sys
import time

import requests
from bs4 import BeautifulSoup
from user_agent import generate_user_agent


def get_valid_proxy(target_url, ip_set, referer='https://www.google.com'):
    """extract a valid proxy for target_url from redis

    Args:
        target_url (str): url that needs to be visited through a proxy
        ip_set (str): name of the set in redis that stores proxies
        referer (str, optional): referer used to construct headers when
            testing whether a proxy is valid

    Returns:
        curr_proxy (str): a valid proxy in the format of ip:port
    """
    try:
        conn = get_connection()
        proxies = conn.srandmember(ip_set, 5)
        curr_proxy = proxies.pop()
        # if the proxy is not valid, delete it from redis and try the next one
        while not is_valid(target_url, curr_proxy, referer):
            conn.srem(ip_set, curr_proxy)
            if len(proxies) == 0:
                proxies = conn.srandmember(ip_set, 5)
            curr_proxy = proxies.pop()
        return curr_proxy
    except Exception as e:
        print 'Error while getting proxy from redis\n%s' % e
        sys.exit(1)
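# get_valid_proxy above and get_proxies below both rely on two helpers,
# get_connection and is_valid, that are not shown in this excerpt. Below is
# a minimal sketch of what they might look like: the signatures match the
# call sites, but the bodies (local redis server, simple GET-based liveness
# check) are assumptions, not the repo's actual implementation.
import redis


def get_connection(DB=0):
    """sketch: return a redis connection (assumes a server on localhost)"""
    return redis.StrictRedis(host='127.0.0.1', port=6379, db=DB)


def is_valid(target_url, proxy, referer='https://www.google.com'):
    """sketch: treat a proxy as valid if target_url answers 200 through it"""
    headers = {'user-agent': generate_user_agent(), 'referer': referer}
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        r = requests.get(target_url, headers=headers, proxies=proxies,
                         timeout=10)
        return r.status_code == 200
    except requests.RequestException:
        return False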
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis

    Args:
        proxy_type (str): base url for the proxy type, like the global
            variables CHINA and OTHER
        ip_set (str): name of the set in redis the ips should be stored in
        start_page (int): page to start crawling at
        end_page (int): page to stop crawling at
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    for page in xrange(start_page, end_page + 1):
        # throttle the crawl so the listing site is not hammered
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {
            'user-agent': generate_user_agent(),
            'referer': 'http://www.xicidaili.com/'
        }
        text = requests.get(proxy_type + str(page), headers=headers).text
        # extract ips from source code, skipping the table header row
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # optional filter, currently disabled (u'美国' means "United States"):
            # if u'美国' in tds[3].text:
            proxy = tds[1].text + ':' + tds[2].text
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' % (proxy, ip_set)
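# The docstring above refers to global base urls CHINA and OTHER. Only the
# xicidaili.com host appears in this excerpt, so the exact listing paths
# below are assumptions for illustration.
CHINA = 'http://www.xicidaili.com/nn/'  # assumed path, not shown in excerpt
OTHER = 'http://www.xicidaili.com/wt/'  # assumed path, not shown in excerpt

# example usage:
#   get_proxies(CHINA, 'china_ips', 1, 3)  # crawl pages 1-3, keep live proxies
#   proxy = get_valid_proxy('https://www.amazon.com/', 'china_ips')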
def get_user_names():
    """read the 'names' file and store each name, title-cased, in redis"""
    r = get_connection(DB=1)
    with open('names') as f:
        for line in f:
            r.sadd('user_name', line.strip().title())
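# example usage (assumes 'names' is a plain-text file with one name per
# line, as implied by the line-by-line read above):
#   get_user_names()
#   r = get_connection(DB=1)
#   print r.srandmember('user_name')  # one random title-cased name

# The __main__ block at the bottom of the file calls a get_address helper
# that is not shown in this excerpt. The sketch below matches the call site
# (return an address string, or None on failure); the request plumbing is
# straightforward, but the parsing step is site-specific and purely an
# assumption.
def get_address(proxy):
    """sketch: fetch one random address from fakena.me through the proxy"""
    headers = {'user-agent': generate_user_agent(),
               'referer': 'https://fakena.me'}
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        text = requests.get('https://fakena.me/random-real-address/',
                            headers=headers, proxies=proxies, timeout=10).text
    except requests.RequestException:
        return None
    # the real selector depends on the page markup; this one is an assumption
    soup = BeautifulSoup(text, 'lxml')
    strong = soup.find('strong')
    return strong.text.strip() if strong else None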
    # tail of the function that assembles name/phone and name/visa records
    # from the parsed info list (the enclosing def is not shown in this excerpt)
    name_phone = info[0]['value'] + '#' + info[9]['value']
    name_visa = (info[0]['value'] + '#' + info[11]['value'] + '#' +
                 info[13]['value'])
    print name_phone, name_visa
    return name_phone, name_visa


if __name__ == '__main__':
    r = get_connection(DB=3)
    crawl_address, crawl_phone_visa = True, False
    if crawl_address:
        count = 0
        while True:
            # rotate to a fresh proxy every 10 requests
            if count % 10 == 0:
                proxy = get_valid_proxy('https://fakena.me/random-real-address/',
                                        'china_ips', referer='https://fakena.me')
                print 'current proxy: %s' % proxy
            addr = get_address(proxy)
            if addr:
                r.sadd('address', addr)
                print 'successfully add address %s to redis' % addr
            count += 1
            time.sleep(5)
    elif crawl_phone_visa:
        while True:
            pass  # body truncated in this excerpt