def check_out(proxies, check_header):
    # Keep drawing random proxies until one passes the liveness check;
    # proxies that fail are flagged dead (datastatus=2) in the pool.
    while 1:
        proxie = random.choice(proxies)
        if check_out_base(proxie, check_header):
            result = proxie
            break
        else:
            session.query(Ip_Pool).filter(Ip_Pool.ip == proxie).update(
                {Ip_Pool.datastatus: 2})
            session.commit()
    return result
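
# check_out_base() is not defined in this file; below is a minimal sketch of
# the proxy liveness check it presumably performs, assuming it issues a short
# test request through the candidate proxy. The test URL is a placeholder,
# not taken from the source.
def check_out_base(proxie, check_header):
    scheme = "https" if "https://" in proxie else "http"
    try:
        r = requests.get("http://www.sse.com.cn",  # hypothetical test URL
                         headers=check_header,
                         proxies={scheme: proxie},
                         timeout=5)
        return r.status_code < 300
    except requests.exceptions.RequestException:
        return False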
def download_data(url, referer_header, stock, proxies, check_header):
    # Fetch one page of announcements through a verified proxy, strip the
    # JSONP wrapper, and bulk-insert the bulletins that are not yet stored.
    while 1:
        proxies_down = check_out(proxies, check_header)
        if "https://" in proxies_down:
            proxie = {"https": proxies_down}
        elif "http://" in proxies_down:
            proxie = {"http": proxies_down}
        else:
            proxie = None
        try:
            response = requests.get(url, headers=referer_header,
                                    proxies=proxie, timeout=5)
            response.raise_for_status()
            status_code = response.status_code
        except requests.exceptions.RequestException as e:
            print(e)
            status_code = 400
            response = None
        if status_code < 300 and response is not None:
            strresult = str(response.text.encode('utf-8'))
            try:
                # The response is JSONP: JSON wrapped in a callback call.
                strJsonData = strresult[strresult.find('(') + 1:strresult.rfind(')')]
                dict_data = dict(json.loads(strJsonData))
            except Exception as e:
                print(e)
                continue
            else:
                db = MySqlCon()
                data = {}
                bulletinid_list = []
                sql = """INSERT INTO sh_a_share(bulletinid,stockcode,stockname,
                    title,category,url,bulletinyear,bulletindate,uploadtime,datastatus)
                    VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
                for da in dict_data["pageHelp"]["data"]:
                    pdfurl = 'http://static.sse.com.cn' + da["URL"]
                    # The MD5 of the PDF URL doubles as a stable primary key.
                    bulletinid = hashlib.md5(pdfurl).hexdigest()
                    bulletinid_list.append(bulletinid)
                    data[bulletinid] = (bulletinid,
                                        da["security_Code"].encode('utf-8'),
                                        stock["stockname"],
                                        da["title"].encode('utf-8'),
                                        da["bulletin_Type"].encode('utf-8'),
                                        pdfurl.encode('utf-8'),
                                        da["bulletin_Year"].encode('utf-8'),
                                        da["SSEDate"].encode('utf-8'),
                                        str(datetime.now()), 1)
                # Skip bulletins that are already in the table.
                repeat = session.query(Sh_A_Share.bulletinid).filter(
                    Sh_A_Share.bulletinid.in_(bulletinid_list)).all()
                for kk in repeat:
                    data.pop(kk.bulletinid)
                if data:
                    try:
                        db.cursor.executemany(sql, data.values())
                        db.conn.commit()
                    except Exception as e:
                        print(e)
                        db.conn.rollback()
                db.conn.close()
                break
        else:
            continue
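
# The SSE endpoint answers with JSONP (JSON wrapped in a callback call),
# which is why download_data() slices between the first '(' and the last
# ')' before parsing. A self-contained illustration of the same trick on a
# made-up payload:
def _strip_jsonp_example():
    raw = 'jsonpCallback98765({"pageHelp": {"data": []}})'  # fake payload
    body = raw[raw.find('(') + 1:raw.rfind(')')]
    return json.loads(body)  # -> {"pageHelp": {"data": []}}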
def get_proxies():
    proxies = session.query(Ip_Pool).filter(Ip_Pool.datastatus == 1).all()
    return [proxie.ip for proxie in proxies]
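
# The SQLAlchemy models are defined elsewhere; for reference, a minimal
# sketch of Ip_Pool consistent with the columns used above (everything
# beyond ip/datastatus is an assumption):
#
#   class Ip_Pool(Base):
#       __tablename__ = 'ip_pool'               # assumed table name
#       id = Column(Integer, primary_key=True)  # assumed surrogate key
#       ip = Column(String(64))                 # e.g. 'http://1.2.3.4:8080'
#       datastatus = Column(Integer)            # 1 = usable, 2 = dead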
stock = i
print(stock)
k = MyReptile(stock)
start = time.time()
threads = []  # one worker thread per batch of page URLs
for dd in k.page_urls:
    thread = myThread(urls=dd, proxies=k.proxies,
                      check_header=k.check_header,
                      referer_header=k.referer_header,
                      stock=k.stock)
    thread.start()
    threads.append(thread)
for t in threads:
    t.join()
print('down_success')
end = time.time()
msg = 'stock code: {}, stock name: {}, elapsed: {}s, date: {}'.format(
    stock["stockcode"], stock["stockname"], end - start, datetime.now())
logging.info(msg)
# Mark the stock as crawled so it is not picked up again.
session.query(Sh_Share).filter(
    Sh_Share.stockcode == stock["stockcode"]).update(
    {Sh_Share.datastatus: 2})
session.commit()
print('{}:end'.format(datetime.now()))
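
# myThread is not defined in this excerpt; a minimal sketch consistent with
# the constructor call above, assuming each `urls` argument is an iterable
# of page URLs handed to download_data() one by one.
import threading

class myThread(threading.Thread):
    def __init__(self, urls, proxies, check_header, referer_header, stock):
        threading.Thread.__init__(self)
        self.urls = urls
        self.proxies = proxies
        self.check_header = check_header
        self.referer_header = referer_header
        self.stock = stock

    def run(self):
        for url in self.urls:
            download_data(url, self.referer_header, self.stock,
                          self.proxies, self.check_header)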