def process_request_origin(self, request, spider):
    # Take a proxy IP from the Redis queue, or fetch a fresh one if the queue is empty.
    redis = RedisQueue('proxy_ip')
    if not redis.empty():
        proxy_ip = redis.get()
    else:
        proxy_ip = get_ip()
    proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    # Only attach Proxy-Authorization when credentials are actually set
    # (the original tested `is not None`, which is always true for '').
    if proxy_para['user_pass']:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
    # Return the proxy to the queue so later requests can reuse it.
    redis.put(proxy_ip)

def process_exception(self, request, exception, spider):
    # The proxy that raised the exception is treated as invalid: move it from
    # the live queue to the invalid-IP queue, then retry through a fresh proxy.
    request_ip = request.meta['proxy']
    invalid_ip = request_ip.split('//')[1]
    redis = RedisQueue('proxy_ip')
    redis_invalid_ip = RedisQueue('invalid_ip')
    if not redis.empty():
        redis.remove(invalid_ip)
        redis_invalid_ip.put(invalid_ip)
    print '+++++++++++++++++++++++%s' % exception
    print '-----------------------removing ip from redis: %s' % invalid_ip
    new_ip = get_ip()
    proxy_para = {'ip_port': new_ip, 'user_pass': ''}
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    if proxy_para['user_pass']:  # only attach auth when credentials are set
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip, proxy_para['ip_port'])
    redis.put(new_ip)

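# The two methods above are written as Scrapy downloader-middleware hooks.
# A hypothetical settings.py entry to enable such a middleware in a Scrapy
# project could look like the following; the module path and priority value
# are placeholders, not taken from the source:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RedisProxyMiddleware': 543,
}
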
        })
    except Exception as e:
        print_err(e)
        print_err(uname)
        print_err("Something went wrong while fetching the user's followed topics")
        time.sleep(random.uniform(0, 5))
    return user_questions


if __name__ == '__main__':
    q = RedisQueue('follow_question_queue')
    sleep_time = 0
    db = MongoClient().zhihu.zhihu_follow_questions
    while 1:
        if q.empty():
            print('Finished at %s' % str(datetime.datetime.now()))
            print('Waiting ...')
        uname = q.get()
        uname = uname.decode()
        # Skip users that are already stored in MongoDB.
        if db.find({'_id': uname}).count() > 0:
            continue
        try:
            with timeout(seconds=40):
                all_questions = get_user_questions(uname)
            if all_questions == {}:
                continue
            elif all_questions is None:
                sleep_time += random.uniform(1, 5)
                print_err('Sleeping for %0.2f seconds' % sleep_time)

import base64
import json
import threading
import time

import urllib3
from RedisQueue import RedisQueue

q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)


def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)
    # Server-side failure: push the value back onto the queue.
    if r.status != 200:
        q.put(value)
    # IP whitelist check failed: push the value back onto the queue.
    # (assumes the response body is JSON with a 'status' field; the original
    # indexed r.data directly, which fails on a raw byte string)
    if json.loads(r.data)['status'] == 10002:
        q.put(value)
    print r.data


while 1:
    # time.sleep(1)
    if q.empty():
        print 'empty queue'
        break
    s = q.qsize()
    for i in range(0, s):
        value = q.get()
        t = threading.Thread(target=worker, args=(value,))
        t.start()
        # Throttle thread creation once 500 threads are alive.
        if threading.active_count() >= 500:
            time.sleep(1)

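# The loop above caps concurrency by sleeping whenever 500 threads are alive.
# Below is a sketch of a tighter alternative using the stdlib thread pool from
# multiprocessing.dummy (a swapped-in technique, not the original author's
# approach); `worker` is the function defined above:
from multiprocessing.dummy import Pool


def drain(queue, pool_size=50):
    pool = Pool(pool_size)
    items = []
    while not queue.empty():
        items.append(queue.get())
    pool.map(worker, items)  # blocks until every item has been processed
    pool.close()
    pool.join()
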
import urllib2

from RedisQueue import RedisQueue

redis = RedisQueue('0', 'jandan')


def user_agent(url):
    # Route requests through a local HTTP proxy.
    proxy_handler = urllib2.ProxyHandler({'http': '127.0.0.1:8080'})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    return urllib2.urlopen(req, None, req_timeout)


while True:
    while not redis.empty():
        down_url = redis.get()
        print(down_url)
        try:
            data = user_agent(down_url).read()
            # The last 11 characters of the URL serve as the file name.
            with open('./' + down_url[-11:], 'wb') as code:
                code.write(data)
            redis.pop()
        except:
            pass

#!/usr/bin/env python
# UniPi Python Control Panel
# clean_redis_queue_2.py
# uses Python 3.5 up
# Author: Johannes Untiedt
# Version 10.0 of 26.03.2018

import time
from RedisQueue import RedisQueue

if __name__ == '__main__':
    print("clean_redis_queue_2 started")
    q = RedisQueue('ws_2')
    # Drain the queue, printing each message as it is removed.
    while not q.empty():
        message = q.get()
        print(message)
    print("redis_queue_2 cleaned")

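# Every script in this collection imports a RedisQueue class that is not shown
# here. The sketch below is an assumed minimal implementation over redis-py,
# reconstructed from the call sites (put/get/empty/qsize); the namespace
# convention and constructor signature are guesses, and snippets that call
# pop(), remove() or push() would need matching extra methods.
import redis


class RedisQueue(object):
    """Simple FIFO queue backed by a Redis list (assumed interface)."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.db = redis.Redis(**redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def qsize(self):
        return self.db.llen(self.key)   # number of queued items

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        self.db.lpush(self.key, item)   # enqueue at the head

    def get(self):
        return self.db.rpop(self.key)   # dequeue from the tail (FIFO)
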
import urllib2
from RedisQueue import RedisQueue

redis = RedisQueue('jandan3')


def user_agent(url):
    req_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    return urllib2.urlopen(req, None, req_timeout)


while not redis.empty():
    down_url = redis.get()
    data = user_agent(down_url).read()
    with open('D:/Python/picture/' + down_url[-11:], 'wb') as code:
        code.write(data)
    print down_url

class machine(multiprocessing.Process):
    def __init__(self, df):
        multiprocessing.Process.__init__(self)
        self.df = df

    def run(self):
        sleep(10)
        self.last_mean = .015
        self.q = RedisQueue('test')
        print('start')
        self.conn = sqlite3.connect("data.db")
        while not self.q.empty():
            # Each queue entry is the string repr of a bytes object holding a
            # feature list, e.g. b"['foo', 'bar']"; strip the wrapper and split.
            features = str(self.q.get())[3:-2].replace("'", "").split(', ')
            self.features = list(features)
            for self.hold_time in ['_10']:
                df = self.df[self.features + ['stock_perc_change' + self.hold_time,
                                              'abnormal_perc_change' + self.hold_time]]
                targets = [self.df['stock_perc_change' + self.hold_time],
                           self.df['abnormal_perc_change' + self.hold_time]]
                positive_dfs = []
                negative_dfs = []
                for i in range(8):
                    a_train, a_test, b_train, b_test = train_test_split(
                        df.ix[:, :-2], df.ix[:, -2:], test_size=.4)
                    self.train(a_train, b_train)
                    test_result, negative_df, positive_df = self.test(a_test, b_test)
                    if test_result:
                        positive_dfs.append(positive_df)
                        negative_dfs.append(negative_df)
                    else:
                        break
                if test_result:
                    self.get_result(pd.concat(positive_dfs), pd.concat(negative_dfs))

    def train(self, a_train, b_train):
        self.clf = SVR(C=1.0, epsilon=0.2)
        self.clf.fit(a_train, b_train['abnormal_perc_change' + self.hold_time])

    def test(self, a_test, b_test):
        a_test['Predicted'] = self.clf.predict(a_test)
        a_test['Actual_stock_perc_change' + self.hold_time] = b_test['stock_perc_change' + self.hold_time]
        a_test['Actual_abnormal_perc_change' + self.hold_time] = b_test['abnormal_perc_change' + self.hold_time]
        if len(a_test['Predicted'].unique()) < 40:
            return False, None, None
        a_test = a_test.sort_values(by='Predicted')
        # Lowest 20 predictions form the negative side, highest 20 the positive side.
        return True, a_test.ix[:, -3:].head(20), a_test.ix[:, -3:].tail(20)

    def get_result(self, df_p, df_n):
        p_result = df_p.describe()
        n_result = df_n.describe()
        # Reject the model if mean or median abnormal returns have the wrong sign.
        if p_result.ix['mean', 'Actual_abnormal_perc_change_10'] < 0 or n_result.ix['mean', 'Actual_abnormal_perc_change_10'] > 0:
            return
        if p_result.ix['50%', 'Actual_abnormal_perc_change_10'] < 0 or n_result.ix['50%', 'Actual_abnormal_perc_change_10'] > 0:
            return
        store_me = False
        if p_result.ix['mean', 'Actual_abnormal_perc_change_10'] > self.last_mean:
            self.last_mean = p_result.ix['mean', 'Actual_abnormal_perc_change_10']
            store_me = True
        p_result.index = p_result.index + '_pos'
        n_result.index = n_result.index + '_neg'
        p_result = p_result.stack().reset_index()
        p_result.index = p_result['level_1'] + '-' + p_result['level_0']
        p_result = p_result[0]
        n_result = n_result.stack().reset_index()
        n_result.index = n_result['level_1'] + '-' + n_result['level_0']
        n_result = n_result[0]
        result = p_result.append(n_result)
        result = pd.DataFrame(result).T
        self.model_name = str(self.features)[1:-1] + '__' + self.hold_time[1:]
        result['features'] = self.model_name
        if store_me:
            result.to_sql('results', self.conn, index=False, if_exists='append')
            self.store_machine()

    def store_machine(self):
        # Refit on the full data set and persist the model to disk.
        df = self.df[self.features]
        target = self.df['abnormal_perc_change' + self.hold_time]
        self.clf = SVR(C=1.0, epsilon=0.2)
        self.clf.fit(df, target)
        from sklearn.externals import joblib
        joblib.dump(self.clf, 'machines/' + self.model_name)

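# Hypothetical counterpart to store_machine() above: reload a persisted model
# and score new rows. The joblib import mirrors the one used in store_machine;
# model_name, df and features are assumed to be known to the caller.
from sklearn.externals import joblib

clf = joblib.load('machines/' + model_name)
predictions = clf.predict(df[features])
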
def check(host):
    # Verify a SOCKS5 proxy by fetching an IP-echo page through it; working
    # proxies are pushed to the rWrite queue. (The function header is restored
    # from the threadpool.makeRequests(check, ...) call sites below.)
    try:
        ip, port = host.strip().split(":")
        proxies = dict(http='socks5://' + host, https='socks5://' + host)
        timeout = 1.0
        resp = requests.get('http://2017.ip138.com/ic.asp', proxies=proxies, timeout=timeout)
        r = resp.text.encode('UTF-8')
        ip = re.findall(r'\[(\d*?\.\d*?\.\d*?\.\d*?)\]', r)[0]
        if ip:
            rWrite.put(host)
            print "############" + host + "############"
    except:
        pass


while True:
    while not rRead.empty():
        sProxy = rRead.pop()
        lProxy.append(sProxy)
        nCounter = nCounter + 1
        # Dispatch checks to the thread pool in batches of 100.
        if nCounter % 100 == 0:
            lRequests = threadpool.makeRequests(check, lProxy)
            [pool.putRequest(req) for req in lRequests]
            pool.wait()
            lProxy = []
    # Flush any remaining proxies in a final, smaller batch.
    if len(lProxy) != 0:
        lRequests = threadpool.makeRequests(check, lProxy)
        [pool.putRequest(req) for req in lRequests]
        pool.wait()
        lProxy = []

class Spider(object):
    def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
        self.url = url
        self.key_word = key_word
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = MySQL()

    def start(self):
        '''Initial setup: seed the queue with the first search-results request.'''
        self.session.headers.update(self.headers)
        start_url = self.url + '?' + urlencode({'query': self.key_word, 'type': 2})
        weixin_request = WeiXinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.push(weixin_request)

    def parse_index(self, response):
        '''Parse a search-results page.'''
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeiXinRequest(url=url, callback=self.parse_detail, headers=self.headers)
            yield weixin_request
        # Link to the next page of results.
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.url + str(next)
            weixin_request = WeiXinRequest(url=url, callback=self.parse_index, headers=self.headers)
            yield weixin_request

    def parse_detail(self, response):
        '''Parse an article detail page.'''
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }
        yield data

    def error(self, weixin_request: WeiXinRequest):
        weixin_request.fail_times += 1
        print('Request failed {} times, URL is {}'.format(weixin_request.fail_times, weixin_request.url))
        if weixin_request.fail_times < MAX_FAIL_TIME:
            self.queue.push(weixin_request)

    def get_proxy(self, url='http://localhost:5000/random'):
        '''Fetch a usable proxy IP from our own proxy pool.'''
        try:
            resp = requests.get(url)
            if resp.status_code == 200:
                return resp.text
            return None
        except Exception as e:
            return None

    def schedule(self):
        '''Scheduling strategy.'''
        while not self.queue.empty():  # queue is not empty
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule: {}'.format(weixin_request.url))
            response = self.request(weixin_request)
            if response and response.status_code == 200:
                results = list(callback(response))  # generator
                if results:
                    for result in results:
                        if isinstance(result, WeiXinRequest):
                            self.queue.push(result)
                        if isinstance(result, dict):
                            self.mysql.insert(result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request: WeiXinRequest):
        '''Execute the request, through a proxy if one is required.'''
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': proxy,
                        'https': proxy,
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        self.start()
        self.schedule()

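# Hypothetical entry point for the Spider above; the key word is an example
# value, not taken from the source.
if __name__ == '__main__':
    spider = Spider(key_word='python')
    spider.run()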