# Connect to the MtGox streaming API over a raw socket, optionally tunnelling
# through an HTTP CONNECT or SOCKS4a proxy.  Python 2 code; relies on the
# module's Proxies helpers and recv_line().  The handshake is the legacy
# pre-RFC 6455 WebSocket upgrade (no Sec-WebSocket-Key).
def mtgox_websocket_connect(s, proxytype=None, proxy=None):
    host = 'websocket.mtgox.com'
    port = 80
    if proxytype == 'http':
        s.connect(proxy)
        Proxies.do_http_connect(s, host, port)
    elif proxytype == 'socks4a':
        s.connect(proxy)
        Proxies.do_socks4a(s, host, port)
    else:
        s.connect((host, port))
    querylines = [
        'GET /mtgox HTTP/1.1',
        'Upgrade: WebSocket',
        'Connection: Upgrade',
        'Host: websocket.mtgox.com',
        'Origin: null',
    ]
    s.send('\r\n'.join(querylines) + '\r\n\r\n')
    # Read response headers until the blank line that ends them.
    lines = []
    while True:
        line = recv_line(s)
        if line == '':
            break
        lines.append(line)
    if lines[0] != 'HTTP/1.1 101 Web Socket Protocol Handshake':
        raise Exception('server did not say "101" (said %s)' % repr(lines[0]))
    return s
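# A minimal usage sketch for the function above.  The socket is created by the
# caller and handed in; the SOCKS4a address below is a hypothetical local Tor
# port, not something taken from the original code.
import socket

sock = socket.socket()
ws = mtgox_websocket_connect(sock, proxytype='socks4a', proxy=('127.0.0.1', 9050))
# After the 101 response, WebSocket frames can be read from ws.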
import signal
import socket
import urlparse

import OpenSSL.SSL


# Python 2 code; Proxies and do_http_query are helpers from this module.
def do_https_query(url, postdata=None, reqheaders=None, digest=None,
                   digesttype='sha256', proxy=None, proxytype=None,
                   timeout=None):
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    if scheme != 'https':
        raise Exception('not an HTTPS query')
    if '@' in netloc:
        raise Exception('username/password not supported')
    hp = netloc.split(':')
    if len(hp) > 2 or len(hp) < 1:
        raise Exception('netloc must be hostname:port')
    host = hp[0]
    if len(hp) >= 2:
        port = int(hp[1])
    else:
        port = 443
    # SIGALRM serves as a coarse connect/handshake timeout; a SIGALRM handler
    # must be installed elsewhere, or the default action terminates the
    # process instead of raising an exception.
    if timeout is not None:
        signal.alarm(timeout)
    s = socket.socket()
    if proxytype is not None:
        s.connect(proxy)
        Proxies.do_proxy_connect(s, proxytype, host, port)
    else:
        s.connect((host, port))
    ctx = OpenSSL.SSL.Context(OpenSSL.SSL.TLSv1_METHOD)
    c = OpenSSL.SSL.Connection(ctx, s)
    c.set_connect_state()
    c.do_handshake()
    if timeout is not None:
        signal.alarm(0)
    # Optional certificate pinning: compare the peer certificate's
    # fingerprint against the caller-supplied digest.
    if digest is not None:
        cert = c.get_peer_certificate()
        digestgot = cert.digest(digesttype)
        if digestgot != digest:
            raise Exception('server certificate mismatch (%s)' % digestgot)
    # Rebuild the request target; urlsplit strips the '?' from the query.
    target = path + ('?' + query if query else '')
    result = do_http_query(c, host, target, postdata, headers=reqheaders)
    c.close()
    return result
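# A usage sketch for the pinned-HTTPS helper; the URL and fingerprint are
# placeholders, and the return value depends on what do_http_query (not shown
# in this snippet) produces.
PINNED = 'AA:BB:CC:...'  # expected sha256 fingerprint in OpenSSL's colon format
body = do_https_query('https://example.com/api/ticker',
                      digest=PINNED, digesttype='sha256', timeout=30)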
def crawl_job(self):
    count = 0
    now = datetime.now()
    current_time = now.strftime("%H:%M %d-%m-%Y")
    print('\n====== Starting Crawling at : ' + str(current_time) + ' ==============================\n')
    count_prox = 1
    proxy = None  # no proxy unless self.num_prox > 0
    if self.num_prox > 0:
        proxy = prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]
    for url in self.stations_url:
        # Derive the station name from the URL
        name = url.split('stations')[-1].replace('/', '')
        # Rotate the proxy ({ip: xxxx, port: xxxxx}) every num_prox requests
        if self.num_prox > 0 and count_prox % self.num_prox == 0:
            print("Searching for new proxy")
            proxy = prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]
        count_prox += 1
        # Crawl one station
        cs = cS.Crawl_Station(url, proxy=proxy)
        data = cs.getInfo()
        # Only keep rows with all expected fields present
        if len(data) >= 6:
            count += 1
            data['TimeCrawled'] = now.strftime("%H:%M")
            self.writeCSV(name, data)
    now = datetime.now()
    current_time = now.strftime("%H:%M %d-%m-%Y")
    print('\n====== Crawling Ended at ' + current_time + ' ==============================\n')
    self.writeLogFile(current_time, count)
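# A hypothetical driver for the method above, assuming the enclosing class is
# named StationCrawler (its real name is not shown in this snippet) and takes
# the station URLs plus the proxy-rotation interval as constructor arguments.
crawler = StationCrawler(
    stations_url=['https://example.com/stations/alpha/'],
    num_prox=10,  # fetch a fresh proxy every 10 stations
)
crawler.crawl_job()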
"Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644" .format(timeStamp=timeStamp, time=time1), "Referer": "https://m.lagou.com/search.html", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", "X-Requested-With": "XMLHttpRequest", } city = "广州" positionName = "python" # pageNo = "1" pageSize = "15" proxies = Proxies() def get_detail_url(pageNo, proxies): base_url = "https://m.lagou.com/search.json?city={city}&positionName={positionName}&pageNo={pageNo}&" \ "pageSize={pageSize}".format(city=city,positionName=positionName,pageNo=pageNo,pageSize=pageSize) res = requests.get(base_url, headers=headers, proxies=proxies) content = res.content.decode() dict1 = json.loads(content) # print(dict1) list1 = dict1['content']['data']['page']['result'] for i in list1: yield "https://m.lagou.com/jobs/{}.html".format(i['positionId']) # 职位名称 薪资 工作地点 工作年限 学历要求 企业名字 职位描述