def RUN(systemstyle):
    """Collect all Oracle point data for *systemstyle*, poll each point via
    the DTXY web checker in a thread pool, pickle the results and return them.

    NOTE(review): relies on module globals SQL, dcpoints, revisepoints,
    SQLPWD, multi, log, OracleConn, WebCheck and PoolGetDat -- confirm they
    are defined at import time.

    :param systemstyle: system tag used as the key into the config mappings
    :return: list of per-point results produced by the PoolGetDat callable
    """
    # Per-system overrides; stay None when the mapping is falsy or has no
    # entry for this system (idiomatic .get replaces the nested-if chain).
    SQL_poc = SQL.get(systemstyle) if SQL else None
    dcpoints_poc = dcpoints.get(systemstyle) if dcpoints else None
    revisepoints_poc = revisepoints.get(systemstyle) if revisepoints else None

    con = OracleConn.Oracle_Connect(*SQLPWD[systemstyle],
                                    systemtag=systemstyle, log=log)
    data = con.PointAll(dcpoints_poc, revisepoints_poc, SQL_poc)

    dtxy_out = WebCheck.DTXY(systemstyle, log=log)
    dtxy_out.CheckWebStatus()

    pooloracle = ThreadPool(multi)
    poolgetdat = PoolGetDat(dtxy_out)
    oracle_res = pooloracle.map(poolgetdat, data)
    # Only a genuine multiprocessing.pool.ThreadPool needs close()/join();
    # the string probe is kept so drop-in pool replacements are left alone.
    if str(type(pooloracle)).find('multiprocessing.pool.ThreadPool') > -1:
        pooloracle.close()
        pooloracle.join()

    # Persist results beside this module when that directory exists
    # (os.path.dirname(__file__) can be '' when run as a script), else CWD.
    filename = '{}.pickle'.format(systemstyle)
    if os.path.isdir(os.path.dirname(__file__)):
        thefilename = os.path.join(os.path.dirname(__file__), filename)
    else:
        thefilename = os.path.join(os.getcwd(), filename)
    with open(thefilename, 'wb') as f:
        pickle.dump(oracle_res, f)
    return oracle_res
def getgome(cat):
    """Fetch every product ID in Gome category *cat*.

    Reads page 1 to learn the total page count, then pulls every listing
    page through a 30-worker pool. Returns the IDs newline-joined (with a
    trailing newline); retries the whole category up to 3 times.

    NOTE(review): relies on module globals jishu, zongshu and Pool
    (presumably a thread pool, since a closure is mapped) -- confirm.
    """
    global jishu
    for attempt in range(3):
        try:
            # BUG FIX: '&paramJson=' had been mojibake'd into the pilcrow
            # form ('&para;' + 'mJson'), breaking the query string.
            url = ''.join((
                'http://www.gome.com.cn/p/json?module=async_search&paramJson={%22pageNumber%22%3A',
                '1',
                '%2C%22envReq%22%3A{%22catId%22%3A%22', str(cat),
                '%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22%2C%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}'))
            r = requests.get(url)
            totalpage = int(r.json()['num']['totalPage'])
            # BUG FIX: the comprehension below originally reused loop
            # variable `i`; Python 2 list comprehensions leak their
            # variable, so it clobbered the 3-attempt retry counter.
            urls = [''.join((
                'http://www.gome.com.cn/p/json?module=async_search&paramJson={%22pageNumber%22%3A',
                str(pagenum),
                '%2C%22envReq%22%3A{%22catId%22%3A%22', str(cat),
                '%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22%2C%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}'))
                for pagenum in xrange(1, totalpage + 1)]

            def ff(url):
                # Retry a single page forever until it downloads and parses.
                while 1:
                    try:
                        r = requests.get(url, timeout=3)
                        return '\n'.join([i['pId'] for i in r.json()['products']])
                    except Exception:
                        continue

            pp = Pool(30)
            ss = pp.map(ff, urls)
            try:
                pp.close()
                pp.join()
            except Exception:
                # Pool shutdown failures are non-fatal; results are in hand.
                pass
            jishu += 1
            sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r')
            return '\n'.join(ss) + '\n'
        except Exception:
            continue
def RUN(systemstyle):
    """Pull point data for *systemstyle* from Oracle, check every point
    through the DTXY web interface concurrently, pickle and return the
    results.
    """
    SQL_poc = None
    dcpoints_poc = None
    revisepoints_poc = None
    if SQL and systemstyle in SQL:
        SQL_poc = SQL[systemstyle]
    if dcpoints and systemstyle in dcpoints:
        dcpoints_poc = dcpoints[systemstyle]
    if revisepoints and systemstyle in revisepoints:
        revisepoints_poc = revisepoints[systemstyle]

    connection = OracleConn.Oracle_Connect(*SQLPWD[systemstyle],
                                           systemtag=systemstyle, log=log)
    points = connection.PointAll(dcpoints_poc, revisepoints_poc, SQL_poc)

    checker = WebCheck.DTXY(systemstyle, log=log)
    checker.CheckWebStatus()

    worker_pool = ThreadPool(multi)
    fetcher = PoolGetDat(checker)
    oracle_res = worker_pool.map(fetcher, points)
    # Close/join only genuine multiprocessing thread pools.
    if 'multiprocessing.pool.ThreadPool' in str(type(worker_pool)):
        worker_pool.close()
        worker_pool.join()

    # Store the pickle next to this module when that directory exists,
    # otherwise fall back to the current working directory.
    pickle_name = '{}.pickle'.format(systemstyle)
    module_dir = os.path.dirname(__file__)
    if os.path.isdir(module_dir):
        target = os.path.join(module_dir, pickle_name)
    else:
        target = os.path.join(os.getcwd(), pickle_name)
    with open(target, 'wb') as fh:
        pickle.dump(oracle_res, fh)
    return oracle_res
def get_ids_by_cat(catid):
    """Download every Taobao item-ID page for category *catid*.

    Writes the newline-joined IDs to ./aa/<catid>.txt, prints progress
    derived from the number of category files already written, and
    retries the whole category on any error. Returns None.
    """
    # NOTE(review): relies on module globals headers, proxies, zongshu and
    # the sibling get_taobao_ids(); Pool is presumably a thread pool (a
    # lambda is mapped, which multiprocessing cannot pickle) -- confirm.
    while 1:
        try:
            url = 'http://list.taobao.com/itemlist/default.htm?_input_charset=utf-8&json=on&cat={0}&sort=biz30day&msp=1&as=1&viewIndex=1&atype=b&style=list&same_info=1&tid=0&isnew=2&pSize=96&data-key=s&data-value=1&data-action&module=page&s=0'.format(
                catid)
            r = requests.get(url, headers=headers, proxies=proxies, timeout=5)
            ss = r.text
            # print ss
            if '"itemList":null' in ss:
                # Category has no items at all: nothing to fetch or write.
                return
            totalPage = int(re.findall('totalPage":"(\d+)"', ss)[0])
            print 'start', catid, '=' * 50, '\ntotalPage', totalPage
            pagenums = range(1, totalPage + 1)
            pp = Pool(5)
            # Fetch all listing pages concurrently.
            ss = pp.map(lambda x: get_taobao_ids(catid, x), pagenums)
            try:
                pp.close()
                pp.join()
            except:
                # Pool shutdown failures are non-fatal.
                pass
            ss = '\n'.join(ss) + '\n'
            with open('./aa/' + str(catid) + '.txt', 'w') as f:
                f.write(ss)
            # Progress = number of per-category files written so far.
            jishu = len(glob.glob('./aa/*.*'))
            print jishu, '/', zongshu, 'completed'
            return
        except Exception as e:
            print('retry zong', catid, e)
            continue
def getid(url1):
    """Collect every commodity ID under category *url1* into all_id.txt.

    Scrapes the first listing page, follows the pagination counter for
    the remaining pages, appends the newline-joined IDs to all_id.txt,
    bumps the global progress counter, and returns. Retries forever on
    any error.
    """
    global jishu
    while True:
        try:
            page = requests.get(url1 + 'all/----1--1---------.html',
                                headers=headers).text
            ids = re.findall('commid="(\d+)"', page)
            if not ids:
                # Empty category: count it as done and stop.
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Pagination counter is rendered as "/N"; drop the slash.
            pagecount = fromstring(page).xpath(
                '//div[@class="sort_page_num"]/span/text()')[0].replace('/', '')
            if pagecount != '1':
                pool = Pool(30)
                ids += pool.map(lambda n: getbypn(url1, n),
                                range(2, int(pagecount) + 1))
                try:
                    pool.close()
                    pool.join()
                except:
                    pass
            with open('all_id.txt', 'a') as fh:
                fh.write('\n'.join(ids) + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except:
            pass
def getid(url1):
    """Collect every commodity ID under category *url1* into all_id.txt.

    Fetches the first listing page, follows the pagination counter, and
    appends all IDs (newline-joined) to all_id.txt. Bumps the global
    progress counter and returns on success; retries forever on errors.
    """
    # NOTE(review): depends on module globals headers, zongshu, jishu and
    # the sibling getbypn(); Pool(30) is presumably a thread pool since a
    # lambda is mapped -- confirm.
    while 1:
        try:
            url = url1 + 'all/----1--1---------.html'
            r = requests.get(url, headers=headers).text
            ss = re.findall('commid="(\d+)"', r)
            if not ss:
                # Empty category: count it as done and stop.
                global jishu
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Pagination counter is rendered as "/N"; strip the slash.
            pn = fromstring(r).xpath(
                '//div[@class="sort_page_num"]/span/text()')[0].replace(
                    '/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages 2..N concurrently.
                pns = range(2, int(pn) + 1)
                pp = Pool(30)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except:
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            global jishu
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except:
            # Swallow everything and retry the whole category.
            pass
def PoolWebCon(SQLPWD, SQL, dcpoints, revisepoints, log=None):
    """Run the per-system Oracle/web collection for every system in
    *SQLPWD* concurrently and return the per-system results.

    :param SQLPWD: mapping of system name -> Oracle credential args
    :param SQL: per-system SQL overrides (or falsy)
    :param dcpoints: per-system DC point overrides (or falsy)
    :param revisepoints: per-system revise-point overrides (or falsy)
    :param log: optional logger passed through to PoolOracleCon
    :return: list of results, one per system name
    """
    systemstyles = list(SQLPWD.keys())
    len_multi = len(systemstyles)
    # BUG FIX: the original divided by (len_multi - 1) unconditionally and
    # raised ZeroDivisionError whenever only one system was configured.
    if len_multi > 1:
        len_submulti = int((psutil.cpu_count() - 2) / (len_multi - 1))
    else:
        len_submulti = psutil.cpu_count() - 2
    # Always give each system at least 4 worker threads.
    if len_submulti < 4:
        len_submulti = 4
    poolweb = ThreadPool(len_submulti)
    pooloraclecon = PoolOracleCon(SQLPWD, SQL, dcpoints, revisepoints,
                                  len_submulti, log=log)
    web_res = poolweb.map(pooloraclecon, systemstyles)
    # Only a genuine multiprocessing.pool.ThreadPool needs close()/join().
    if str(type(poolweb)).find('multiprocessing.pool.ThreadPool') > -1:
        poolweb.close()
        poolweb.join()
    return web_res
def get_jd_rate_all(pid):
    """Fetch every review page for JD product *pid* through a 100-worker
    pool.

    :param pid: JD product id passed through to get_jd_rate()
    :return: list of per-page results, or None when the product has no
        review pages (total page count of -1)
    """
    maxpn = get_jd_rate_totalpagenum(pid)
    if maxpn == -1:
        # No review data for this product.
        return
    pp = Pool(100)
    result = pp.map(
        lambda x: get_jd_rate(x[0], x[1]),
        list(zip([pid] * (maxpn + 1), range(maxpn + 1))))
    try:
        pp.close()
        pp.join()
    # BUG FIX: `except Exception, e` is Python-2-only syntax (a
    # SyntaxError on py3) and `e` was unused.
    except Exception:
        # Pool shutdown failures are non-fatal; results are already in hand.
        pass
    # CONSISTENCY FIX: the sibling implementation returns the mapped
    # results; this version silently dropped them.
    return result
def get_jd_rate_all(pid):
    """Collect all review pages for JD product *pid* concurrently and
    return the list of per-page results (None when there are no pages).
    """
    last_page = get_jd_rate_totalpagenum(pid)
    if last_page == -1:
        # Sentinel: product has no review pages.
        return
    pool = Pool(100)
    jobs = [(pid, pn) for pn in range(last_page + 1)]
    result = pool.map(lambda job: get_jd_rate(job[0], job[1]), jobs)
    try:
        pool.close()
        pool.join()
    except:
        pass
    return result
def get_ids_by_cat(catid):
    """Fetch every Taobao item ID in category *catid*.

    Reads the first listing page to learn the page count, pulls all
    pages through a 5-worker pool, and returns the per-page ID strings
    joined by newlines.
    """
    listing = 'http://list.taobao.com/itemlist/default.htm?_input_charset=utf-8&json=on&cat={0}&sort=biz30day&msp=1&as=1&viewIndex=1&atype=b&style=list&same_info=1&tid=0&isnew=2&pSize=96&data-key=s&data-value=1&data-action&module=page&s=0'.format(
        catid)
    body = requests.get(listing).text
    total_pages = int(re.findall('totalPage":"(\d+)"', body)[0])
    pool = Pool(5)
    pages = pool.map(lambda pn: get_taobao_ids(catid, pn),
                     range(1, total_pages + 1))
    try:
        pool.close()
        pool.join()
    except:
        pass
    return '\n'.join(pages)
def PoolWebCon(SQLPWD, SQL, dcpoints, revisepoints, log=None):
    """Map a PoolOracleCon worker over every system name in *SQLPWD*
    using a thread pool and return the per-system results.

    :param SQLPWD: mapping of system name -> Oracle credential args
    :param SQL: per-system SQL overrides (or falsy)
    :param dcpoints: per-system DC point overrides (or falsy)
    :param revisepoints: per-system revise-point overrides (or falsy)
    :param log: optional logger passed through to PoolOracleCon
    """
    systemstyles = list(SQLPWD.keys())
    len_multi = len(systemstyles)
    # Budget worker threads per system from the CPU count.
    # NOTE(review): this raises ZeroDivisionError when SQLPWD holds a
    # single system (len_multi == 1) -- confirm callers always pass >= 2.
    len_submulti = int((psutil.cpu_count() - 2) / (len_multi - 1))
    if len_submulti < 4:
        len_submulti = 4  # floor of 4 worker threads per system
    poolweb = ThreadPool(len_submulti)
    pooloraclecon = PoolOracleCon(SQLPWD, SQL, dcpoints, revisepoints,
                                  len_submulti, log=log)
    web_res = poolweb.map(pooloraclecon, systemstyles)
    # Only a genuine multiprocessing.pool.ThreadPool needs close()/join().
    if str(type(poolweb)).find('multiprocessing.pool.ThreadPool') > -1:
        poolweb.close()
        poolweb.join()
    return web_res
def getid(url1):
    """Scrape Dangdang product IDs from category page *url1*.

    Appends the discovered IDs to all_id.txt, records the category URL
    in finished.txt, and advances the global progress counter. Loops
    until the category succeeds or proves empty.
    """
    global jishu
    while True:
        try:
            html = requests.get(url1, timeout=5).text
            hrefs = fromstring(html).xpath(
                '//div[@class="inner"]/p[@class="name"]/a/@href')
            ids = re.findall('dangdang\.com/(\d+)\.html', ''.join(hrefs))
            if ids:
                # Pagination counter is rendered as "/N"; drop the slash.
                last = fromstring(html).xpath(
                    '//div[@name="Fy"]/span[3]/text()|//div[@class="page"]/span[3]/text()'
                )[0].replace('/', '')
                if last != '1':
                    pool = Pool(50)
                    ids += pool.map(lambda n: getbypn(url1, n),
                                    range(2, int(last) + 1))
                    try:
                        pool.close()
                        pool.join()
                    except:
                        pass
                with open('all_id.txt', 'a') as fh:
                    fh.write('\n'.join(ids) + '\n')
            # Mark the category done whether or not it had products.
            with open('finished.txt', 'a') as fh:
                fh.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            print(url1, e)
def getid(url1):
    """Scrape Dangdang product IDs from listing page *url1*.

    Appends the IDs to all_id.txt, logs the URL in finished.txt, bumps
    the global progress counter, and retries forever on any error.
    """
    # NOTE(review): depends on module globals zongshu and the sibling
    # getbypn(); Pool(50) mapped over a lambda implies a thread pool --
    # confirm.
    global jishu
    while 1:
        try:
            url = url1
            r = requests.get(url, timeout=5).text
            ss = fromstring(r).xpath(
                '//div[@class="inner"]/p[@class="name"]/a/@href')
            ss = re.findall('dangdang\.com/(\d+)\.html', ''.join(ss))
            if not ss:
                # No products: mark the category finished and stop.
                with open('finished.txt', 'a') as f:
                    f.write(url1 + '\n')
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Pagination counter is rendered as "/N"; strip the slash.
            pn = fromstring(r).xpath(
                '//div[@name="Fy"]/span[3]/text()|//div[@class="page"]/span[3]/text()')[0].replace('/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages 2..N concurrently.
                pns = range(2, int(pn) + 1)
                pp = Pool(50)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except:
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            with open('finished.txt', 'a') as f:
                f.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            # Log and retry the whole page.
            print(url1, e)
            pass
"http://facebook.com", "http://zhaduixueshe.com", "http://google.com", "http://hao123.com", "http://duoshuo.com", "http://v2ex.com", ] url_list = [] for i in range(1, 500): url_list.append("http://tieba.baidu.com/p/2781190586?pn=" + str(i)) pool = Pool(2) start = time.time() pool.map_async(create_mission, url_list) pool.close() pool.join() print "multiprocessing used ", str(time.time() - start) start = time.time() jobs = [] for url in url_list: jobs.append(gevent.spawn(create_mission, url)) gevent.joinall(jobs) print "use gevent used ", time.time() - start start = time.time() gevent_pool.map(create_mission, url_list) # pool.join()
except: pass ss += dd result = '\n'.join(ss) + '\n' with open('all_id.txt', 'a') as f: f.write(result) with open('finished.txt', 'a') as f: f.write(url1 + '\n') jishu += 1 print('=' * 20, jishu, '/', zongshu, '=' * 20) return except Exception as e: print(url1, e) pass with open('all_cat.txt') as f: all_cat = set([i.strip() for i in f.readlines()]) try: with open('finished.txt') as f: finish = set([i.strip() for i in f.readlines()]) except: finish = set() all_cat = all_cat - finish zongshu = len(all_cat) pp = Pool(200) pp.map(getid, all_cat) try: pp.close() pp.join() except: pass
print("%s开始执行,进程号为%d" % (msg, os.getpid())) # random.random() 随机生成0~1之间的浮点数 time.sleep(random.random() * 2) t_stop = time.time() print(msg, "执行完毕,耗时%0.2f" % (t_stop-t_start)) po = Pool(3) # 定义一个进程池,最大进程数3 for i in range(0, 10): # Pool().apply_async(要调用的目标, (传递给目标的参数元组,)) # 每次循环将会用空闲出来的子进程去调用目标 po.apply_async(worker, (i,)) print("--------start--------") po.close() # 关闭进程池,关闭后po不再接收新的请求 po.join() # 等待po中的所有子进程执行完成,必须放在close()语句之后 print("---------end----------") def downloader(img_name, img_url): req = urllib.request.urlopen(img_url) img_content = req.read() with open(img_name, "wb") as f: f.write(img_content) def main():
# Benchmark: fetch the same 499 Tieba pages three ways and time each --
# a 2-worker pool, one greenlet per URL, and a bounded gevent pool of 20.
gevent_pool = Pool(20)
# NOTE(review): this `urls` list is never referenced below -- presumably
# leftover from an earlier experiment; the benchmark uses url_list.
urls = [
    'http://zhihu.com',
    'http://facebook.com',
    'http://zhaduixueshe.com',
    'http://google.com',
    'http://hao123.com',
    'http://duoshuo.com',
    'http://v2ex.com'
]
url_list = []
for i in range(1, 500):
    url_list.append("http://tieba.baidu.com/p/2781190586?pn=" + str(i))

# --- variant 1: 2-worker pool driven via map_async + close/join ---
pool = Pool(2)
start = time.time()
pool.map_async(create_mission, url_list)
pool.close()
pool.join()
print "multiprocessing used ", str(time.time() - start)

# --- variant 2: one gevent greenlet per URL, joined in bulk ---
start = time.time()
jobs = []
for url in url_list:
    jobs.append(gevent.spawn(create_mission, url))
gevent.joinall(jobs)
print "use gevent used ", time.time() - start

# --- variant 3: bounded gevent pool (20 concurrent fetches) ---
start = time.time()
gevent_pool.map(create_mission, url_list)
#pool.join()
print "use gevent_pool used ", str(time.time() - start)
ss += dd result = '\n'.join(ss) + '\n' with open('all_id.txt', 'a') as f: f.write(result) with open('finished.txt', 'a') as f: f.write(url1 + '\n') jishu += 1 print('=' * 20, jishu, '/', zongshu, '=' * 20) return except Exception as e: print(url1, e) pass with open('all_cat.txt') as f: all_cat = set([i.strip() for i in f.readlines()]) try: with open('finished.txt') as f: finish = set([i.strip() for i in f.readlines()]) except: finish = set() all_cat = all_cat - finish zongshu = len(all_cat) pp = Pool(200) pp.map(getid, all_cat) try: pp.close() pp.join() except: pass