コード例 #1
0
 def RUN(systemstyle):
     """Collect all point data for one system, run the pool workers over it,
     and pickle the results next to this file.

     Args:
         systemstyle: key into the module-level SQLPWD / SQL / dcpoints /
             revisepoints mappings identifying the target system.
     Returns:
         The list of results produced by the thread pool.
     """
     # Per-system overrides; each optional map may be missing this system.
     SQL_poc = None
     dcpoints_poc = None
     revisepoints_poc = None
     if SQL and systemstyle in SQL:
         SQL_poc = SQL[systemstyle]
     if dcpoints and systemstyle in dcpoints:
         dcpoints_poc = dcpoints[systemstyle]
     if revisepoints and systemstyle in revisepoints:
         revisepoints_poc = revisepoints[systemstyle]
     # Connect with the credentials configured for this system.
     con = OracleConn.Oracle_Connect(*SQLPWD[systemstyle],
                                     systemtag=systemstyle, log=log)
     points = con.PointAll(dcpoints_poc, revisepoints_poc, SQL_poc)
     # Verify the web side is reachable before fanning out workers.
     checker = WebCheck.DTXY(systemstyle, log=log)
     checker.CheckWebStatus()
     pool = ThreadPool(multi)
     worker = PoolGetDat(checker)
     oracle_res = pool.map(worker, points)
     # Only a real multiprocessing ThreadPool needs explicit shutdown.
     if 'multiprocessing.pool.ThreadPool' in str(type(pool)):
         pool.close()
         pool.join()
     # Persist results beside this module when possible, else in the CWD.
     pickle_name = '{}.pickle'.format(systemstyle)
     module_dir = os.path.dirname(__file__)
     if os.path.isdir(module_dir):
         target = os.path.join(module_dir, pickle_name)
     else:
         target = os.path.join(os.getcwd(), pickle_name)
     with open(target, 'wb') as f:
         pickle.dump(oracle_res, f)
     return oracle_res
def getgome(cat):
    """Collect all product ids for one Gome category across every result page.

    Retries the whole category up to 3 times. Returns a newline-joined string
    of ids (with a trailing newline), or None if every attempt failed.

    Fixes vs. original: the percent-encoded page URL was built twice with
    near-identical literals (now one helper), and bare ``except:`` clauses
    are narrowed to ``Exception`` so KeyboardInterrupt can still abort.
    """
    def _page_url(pagenumber):
        # Percent-encoded JSON query understood by Gome's async_search module.
        return ''.join((
            'http://www.gome.com.cn/p/json?module=async_search&paramJson='
            '{%22pageNumber%22%3A', str(pagenumber),
            '%2C%22envReq%22%3A{%22catId%22%3A%22', str(cat),
            '%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22'
            '%2C%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}'))

    for _attempt in range(3):
        try:
            r = requests.get(_page_url(1))
            totalpage = int(r.json()['num']['totalPage'])
            urls = [_page_url(pn) for pn in xrange(1, totalpage + 1)]

            def ff(url):
                # Retry a single page until it downloads and parses.
                while 1:
                    try:
                        page = requests.get(url, timeout=3)
                        return '\n'.join([p['pId'] for p in page.json()['products']])
                    except Exception:
                        continue
            pp = Pool(30)
            ss = pp.map(ff, urls)
            try:
                pp.close()
                pp.join()
            except Exception:
                # Best-effort pool teardown.
                pass
            global jishu  # shared progress counter
            jishu += 1
            sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r')
            return '\n'.join(ss) + '\n'
        except Exception:
            # Transient failure: retry the whole category.
            continue
コード例 #3
0
ファイル: PoolRun.py プロジェクト: binai/AutoCheckMHSystem
 def RUN(systemstyle):
     """Collect point data for one system, map it through a thread pool, and
     pickle the results next to this file.

     systemstyle: key into the module-level SQLPWD / SQL / dcpoints /
     revisepoints mappings. Returns the pool's result list.
     """
     # Per-system overrides default to None when the optional maps lack them.
     dcpoints_poc,revisepoints_poc,SQL_poc=None,None,None
     if SQL:
         if systemstyle in  SQL:
             SQL_poc=SQL[systemstyle]
     if dcpoints:
         if systemstyle in dcpoints:
             dcpoints_poc=dcpoints[systemstyle]
     if revisepoints:
         if systemstyle in revisepoints:
             revisepoints_poc=revisepoints[systemstyle]
     # Connect with the credentials configured for this system.
     con=OracleConn.Oracle_Connect(*SQLPWD[systemstyle],systemtag=systemstyle,log=log)
     data=con.PointAll(dcpoints_poc,revisepoints_poc,SQL_poc)
     # Probe the web side before fanning out pool workers.
     dtxy_out=WebCheck.DTXY(systemstyle,log=log)
     dtxy_out.CheckWebStatus()
     pooloracle=ThreadPool(multi)
     poolgetdat=PoolGetDat(dtxy_out)
     oracle_res=pooloracle.map(poolgetdat,data)
     # Only shut down when ThreadPool really is multiprocessing's ThreadPool.
     if str(type(pooloracle)).find('multiprocessing.pool.ThreadPool')>-1:
         pooloracle.close()
         pooloracle.join()
     # Persist results beside this module when possible, else in the CWD.
     filename='{}.pickle'.format(systemstyle)
     if os.path.isdir(os.path.dirname(__file__)):
         thefilename=os.path.join(os.path.dirname(__file__),filename)
     else:
         thefilename=os.path.join(os.getcwd(),filename)
     with open(thefilename,'wb') as f:
         pickle.dump(oracle_res,f)
     return oracle_res
def get_ids_by_cat(catid):
    """Fetch every result page of one Taobao category and write the ids to
    ./aa/<catid>.txt; retries forever on failure. (Python 2 syntax.)
    """
    while 1:
        try:
            url = 'http://list.taobao.com/itemlist/default.htm?_input_charset=utf-8&json=on&cat={0}&sort=biz30day&msp=1&as=1&viewIndex=1&atype=b&style=list&same_info=1&tid=0&isnew=2&pSize=96&data-key=s&data-value=1&data-action&module=page&s=0'.format(
                catid)
            r = requests.get(url, headers=headers, proxies=proxies, timeout=5)
            ss = r.text
            # print ss
            # Empty category marker in the JSON payload: nothing to fetch.
            if '"itemList":null' in ss:
                return
            totalPage = int(re.findall('totalPage":"(\d+)"', ss)[0])
            print 'start', catid, '=' * 50, '\ntotalPage', totalPage
            pagenums = range(1, totalPage + 1)
            pp = Pool(5)
            # get_taobao_ids(catid, page) is defined elsewhere in this file.
            ss = pp.map(lambda x: get_taobao_ids(catid, x), pagenums)
            try:
                pp.close()
                pp.join()
            except:
                pass
            ss = '\n'.join(ss) + '\n'
            with open('./aa/' + str(catid) + '.txt', 'w') as f:
                f.write(ss)
            # Progress = number of category files written so far.
            jishu = len(glob.glob('./aa/*.*'))
            print jishu, '/', zongshu, 'completed'
            return
        except Exception as e:
            print('retry zong', catid, e)
            continue
コード例 #5
0
def getid(url1):
    """Scrape every commodity id reachable from list root *url1* and append
    them to all_id.txt; retries forever until the category succeeds.

    Fixes vs. original: regexes use raw strings ('\\d' in a plain literal is
    an invalid escape), ``global jishu`` is declared once at the top instead
    of twice, and bare ``except:`` clauses are narrowed to ``Exception``.
    """
    global jishu  # shared progress counter, bumped once per finished url
    while 1:
        try:
            url = url1 + 'all/----1--1---------.html'
            r = requests.get(url, headers=headers).text
            ss = re.findall(r'commid="(\d+)"', r)
            if not ss:
                # Empty category: just bump the progress counter.
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Page indicator from the pager div; drop the '/' separator.
            pn = fromstring(r).xpath(
                '//div[@class="sort_page_num"]/span/text()')[0].replace('/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages in parallel via getbypn.
                pns = range(2, int(pn) + 1)
                pp = Pool(30)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except Exception:
                    # Best-effort pool teardown.
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception:
            # Transient network/parse failure: retry the whole category.
            pass
コード例 #6
0
def getid(url1):
    """Scrape every commodity id reachable from list root *url1* and append
    them to all_id.txt; retries forever until the category succeeds.
    """
    while 1:
        try:
            url = url1 + 'all/----1--1---------.html'
            r = requests.get(url, headers=headers).text
            ss = re.findall('commid="(\d+)"', r)
            if not ss:
                # Empty category: just bump the shared progress counter.
                global jishu
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Page indicator from the pager div; drop the '/' separator.
            pn = fromstring(r).xpath(
                '//div[@class="sort_page_num"]/span/text()')[0].replace(
                    '/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages in parallel via getbypn.
                pns = range(2, int(pn) + 1)
                pp = Pool(30)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except:
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            global jishu
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except:
            # NOTE(review): bare except retries forever on any error.
            pass
コード例 #7
0
ファイル: PoolRun.py プロジェクト: binai/AutoCheckMHSystem
def PoolWebCon(SQLPWD, SQL, dcpoints, revisepoints, log=None):
    """Run the per-system collection for every configured system in parallel.

    Args:
        SQLPWD: map of system key -> DB credential tuple (drives the worklist).
        SQL, dcpoints, revisepoints: optional per-system configuration maps.
        log: logger handed through to the workers.
    Returns:
        List of per-system results from the thread pool.
    """
    systemstyles = list(SQLPWD.keys())
    len_multi = len(systemstyles)
    # Spread the spare CPUs over the other systems; max(..., 1) guards the
    # ZeroDivisionError the original hit when only one system is configured.
    len_submulti = int((psutil.cpu_count() - 2) / max(len_multi - 1, 1))
    if len_submulti < 4:
        len_submulti = 4
    poolweb = ThreadPool(len_submulti)
    pooloraclecon = PoolOracleCon(SQLPWD, SQL, dcpoints, revisepoints,
                                  len_submulti, log=log)
    web_res = poolweb.map(pooloraclecon, systemstyles)
    # Only shut down when ThreadPool really is multiprocessing's ThreadPool.
    if str(type(poolweb)).find('multiprocessing.pool.ThreadPool') > -1:
        poolweb.close()
        poolweb.join()
    return web_res
コード例 #8
0
ファイル: get_jd.py プロジェクト: JayveeHe/OpinionRankProject
def get_jd_rate_all(pid):
    """Fetch every review page for JD product *pid* in parallel.
    (Python 2 syntax: 'except Exception, e'.)
    """
    maxpn = get_jd_rate_totalpagenum(pid)
    # -1 is the sentinel for "no pages / lookup failed upstream".
    if maxpn == -1:
        # print('null')
        return
    pp = Pool(100)
    # One (pid, page) task per review page, pages 0..maxpn inclusive.
    result = pp.map(
        lambda x: get_jd_rate(x[0], x[1]), list(zip([pid] * (maxpn + 1), range(maxpn + 1))))
    try:
        pp.close()
        pp.join()
    except Exception, e:
        # NOTE(review): 'result' is computed but never returned here.
        pass
コード例 #9
0
def get_jd_rate_all(pid):
    """Fetch every review page for JD product *pid* in parallel.

    Returns the list of per-page results, or None when the page-count lookup
    returned the -1 sentinel (no pages / upstream failure).

    Fix vs. original: the bare ``except:`` around pool teardown is narrowed
    to ``Exception`` so KeyboardInterrupt/SystemExit are not swallowed.
    """
    maxpn = get_jd_rate_totalpagenum(pid)
    if maxpn == -1:
        # No pages / lookup failed upstream.
        return
    pp = Pool(100)
    # One (pid, page) task per review page, pages 0..maxpn inclusive.
    result = pp.map(lambda x: get_jd_rate(x[0], x[1]),
                    list(zip([pid] * (maxpn + 1), range(maxpn + 1))))
    try:
        pp.close()
        pp.join()
    except Exception:
        # Best-effort teardown; don't let it mask the results.
        pass
    return result
コード例 #10
0
def get_ids_by_cat(catid):
    """Collect the item ids of every result page for Taobao category *catid*.

    Returns one newline-joined string of per-page id blocks.

    Fixes vs. original: the regex uses a raw string ('\\d' in a plain literal
    is an invalid escape), and the bare ``except:`` is narrowed to
    ``Exception``.
    """
    url = 'http://list.taobao.com/itemlist/default.htm?_input_charset=utf-8&json=on&cat={0}&sort=biz30day&msp=1&as=1&viewIndex=1&atype=b&style=list&same_info=1&tid=0&isnew=2&pSize=96&data-key=s&data-value=1&data-action&module=page&s=0'.format(
        catid)
    r = requests.get(url)
    ss = r.text
    totalPage = int(re.findall(r'totalPage":"(\d+)"', ss)[0])
    pagenums = range(1, totalPage + 1)
    pp = Pool(5)
    # get_taobao_ids(catid, page) is defined elsewhere in this file.
    ss = pp.map(lambda x: get_taobao_ids(catid, x), pagenums)
    try:
        pp.close()
        pp.join()
    except Exception:
        # Best-effort pool teardown.
        pass
    return '\n'.join(ss)
コード例 #11
0
def get_ids_by_cat(catid):
    """Collect the item ids of every result page for Taobao category *catid*;
    returns one newline-joined string of per-page id blocks.
    """
    url = 'http://list.taobao.com/itemlist/default.htm?_input_charset=utf-8&json=on&cat={0}&sort=biz30day&msp=1&as=1&viewIndex=1&atype=b&style=list&same_info=1&tid=0&isnew=2&pSize=96&data-key=s&data-value=1&data-action&module=page&s=0'.format(
        catid)
    r = requests.get(url)
    ss = r.text
    # Total page count is embedded in the JSON-ish response body.
    totalPage = int(re.findall('totalPage":"(\d+)"', ss)[0])
    pagenums = range(1, totalPage + 1)
    pp = Pool(5)
    # get_taobao_ids(catid, page) is defined elsewhere in this file.
    ss = pp.map(lambda x: get_taobao_ids(catid, x), pagenums)
    try:
        pp.close()
        pp.join()
    except:
        pass
    return '\n'.join(ss)
コード例 #12
0
def PoolWebCon(SQLPWD, SQL, dcpoints, revisepoints, log=None):
    """Run the per-system collection for every configured system in parallel.

    SQLPWD drives the worklist (its keys are the systems); SQL / dcpoints /
    revisepoints are optional per-system maps; log is handed to the workers.
    Returns the list of per-system results from the thread pool.
    """
    systemstyles = list(SQLPWD.keys())
    len_multi = len(systemstyles)
    # NOTE(review): len_multi == 1 makes this divide by zero.
    len_submulti = int((psutil.cpu_count() - 2) / (len_multi - 1))
    if len_submulti < 4: len_submulti = 4
    poolweb = ThreadPool(len_submulti)
    pooloraclecon = PoolOracleCon(SQLPWD,
                                  SQL,
                                  dcpoints,
                                  revisepoints,
                                  len_submulti,
                                  log=log)
    web_res = poolweb.map(pooloraclecon, systemstyles)
    # Only shut down when ThreadPool really is multiprocessing's ThreadPool.
    if str(type(poolweb)).find('multiprocessing.pool.ThreadPool') > -1:
        poolweb.close()
        poolweb.join()
    return web_res
コード例 #13
0
def getid(url1):
    """Scrape dangdang product ids reachable from list page *url1*, append
    them to all_id.txt, and record the url in finished.txt so a restart can
    skip it. Retries forever on transient errors.

    Fixes vs. original: regex uses a raw string ('\\d'/'\\.' in plain
    literals are invalid escapes) and the bare teardown ``except:`` is
    narrowed to ``Exception``.
    """
    global jishu  # shared progress counter, bumped once per finished url
    while 1:
        try:
            url = url1
            r = requests.get(url, timeout=5).text
            ss = fromstring(r).xpath(
                '//div[@class="inner"]/p[@class="name"]/a/@href')
            ss = re.findall(r'dangdang\.com/(\d+)\.html', ''.join(ss))
            if not ss:
                # Nothing on page one: mark the url done and move on.
                with open('finished.txt', 'a') as f:
                    f.write(url1 + '\n')
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Page indicator from the pager div; drop the '/' separator.
            pn = fromstring(r).xpath(
                '//div[@name="Fy"]/span[3]/text()|//div[@class="page"]/span[3]/text()'
            )[0].replace('/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages in parallel via getbypn.
                pns = range(2, int(pn) + 1)
                pp = Pool(50)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except Exception:
                    # Best-effort pool teardown.
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            with open('finished.txt', 'a') as f:
                f.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            # Log and retry the whole url.
            print(url1, e)
            pass
コード例 #14
0
def getid(url1):
    """Scrape dangdang product ids reachable from list page *url1*, append
    them to all_id.txt, and record the url in finished.txt so a restart can
    skip it. Retries forever on transient errors.
    """
    global jishu
    while 1:
        try:
            url = url1
            r = requests.get(url, timeout=5).text
            ss = fromstring(r).xpath(
                '//div[@class="inner"]/p[@class="name"]/a/@href')
            ss = re.findall('dangdang\.com/(\d+)\.html', ''.join(ss))
            if not ss:
                # Nothing on page one: mark the url done and move on.
                with open('finished.txt', 'a') as f:
                    f.write(url1 + '\n')
                jishu += 1
                print('=' * 20, jishu, '/', zongshu, '=' * 20)
                return
            # Page indicator from the pager div; drop the '/' separator.
            pn = fromstring(r).xpath(
                '//div[@name="Fy"]/span[3]/text()|//div[@class="page"]/span[3]/text()')[0].replace('/', '')
            if pn == '1':
                result = '\n'.join(ss) + '\n'
            else:
                # Fetch the remaining pages in parallel via getbypn.
                pns = range(2, int(pn) + 1)
                pp = Pool(50)
                dd = pp.map(lambda x: getbypn(url1, x), pns)
                try:
                    pp.close()
                    pp.join()
                except:
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            with open('finished.txt', 'a') as f:
                f.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            # Log and retry the whole url.
            print(url1, e)
            pass
コード例 #15
0
ファイル: test.py プロジェクト: RanchoCooper/Zhihu_Crawler
    "http://facebook.com",
    "http://zhaduixueshe.com",
    "http://google.com",
    "http://hao123.com",
    "http://duoshuo.com",
    "http://v2ex.com",
]

# Benchmark: same workload driven three ways (process pool, gevent.spawn,
# gevent pool). Python 2 syntax (print statements).
url_list = []
for i in range(1, 500):
    url_list.append("http://tieba.baidu.com/p/2781190586?pn=" + str(i))

# 1) multiprocessing pool of 2 workers.
pool = Pool(2)
start = time.time()
pool.map_async(create_mission, url_list)
pool.close()
pool.join()
print "multiprocessing used ", str(time.time() - start)


# 2) one gevent greenlet per url.
start = time.time()
jobs = []
for url in url_list:
    jobs.append(gevent.spawn(create_mission, url))
gevent.joinall(jobs)
print "use gevent used ", time.time() - start

# 3) bounded gevent pool (defined elsewhere in this file).
start = time.time()
gevent_pool.map(create_mission, url_list)

# pool.join()
コード例 #16
0
                except:
                    pass
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            with open('finished.txt', 'a') as f:
                f.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            print(url1, e)
            pass
# Driver: run getid over every category not yet recorded in finished.txt.
with open('all_cat.txt') as f:
    all_cat = set([i.strip() for i in f.readlines()])
try:
    with open('finished.txt') as f:
        finish = set([i.strip() for i in f.readlines()])
except:
    # First run: no finished.txt yet.
    finish = set()
# Skip categories already completed on a previous run.
all_cat = all_cat - finish
zongshu = len(all_cat)
pp = Pool(200)
pp.map(getid, all_cat)
try:
    pp.close()
    pp.join()
except:
    pass
コード例 #17
0
    print("%s开始执行,进程号为%d" % (msg, os.getpid()))
    # random.random() 随机生成0~1之间的浮点数
    time.sleep(random.random() * 2)
    t_stop = time.time()
    print(msg, "执行完毕,耗时%0.2f" % (t_stop-t_start))


po = Pool(3)  # Create a process pool with at most 3 worker processes.
for i in range(0, 10):
    # Pool().apply_async(target, (args_tuple,))
    # Each iteration hands the task to whichever child process is free.
    po.apply_async(worker, (i,))


print("--------start--------")
po.close()  # Close the pool; po accepts no new tasks after this.
po.join()  # Wait for all children to finish; must come after close().
print("---------end----------")





def downloader(img_name, img_url):
    """Download *img_url* and write the raw bytes to file *img_name*.

    Fix vs. original: the urlopen response was never closed, leaking the
    connection; the context manager guarantees closure even on error.
    """
    with urllib.request.urlopen(img_url) as resp:
        img_content = resp.read()
    with open(img_name, "wb") as f:
        f.write(img_content)


def main():
コード例 #18
0
# Benchmark: same workload driven three ways (process pool, gevent.spawn,
# bounded gevent pool). Python 2 syntax (print statements).
gevent_pool = Pool(20)
urls = [
    'http://zhihu.com', 'http://facebook.com', 'http://zhaduixueshe.com',
    'http://google.com', 'http://hao123.com', 'http://duoshuo.com',
    'http://v2ex.com'
]

url_list = []
for i in range(1, 500):
    url_list.append("http://tieba.baidu.com/p/2781190586?pn=" + str(i))

# 1) multiprocessing pool of 2 workers.
pool = Pool(2)
start = time.time()
pool.map_async(create_mission, url_list)
pool.close()
pool.join()
print "multiprocessing used ", str(time.time() - start)

# 2) one gevent greenlet per url.
start = time.time()
jobs = []
for url in url_list:
    jobs.append(gevent.spawn(create_mission, url))
gevent.joinall(jobs)
print "use gevent used ", time.time() - start

# 3) bounded gevent pool defined above.
start = time.time()
gevent_pool.map(create_mission, url_list)

#pool.join()
print "use gevent_pool used ", str(time.time() - start)
コード例 #19
0
                ss += dd
                result = '\n'.join(ss) + '\n'
            with open('all_id.txt', 'a') as f:
                f.write(result)
            with open('finished.txt', 'a') as f:
                f.write(url1 + '\n')
            jishu += 1
            print('=' * 20, jishu, '/', zongshu, '=' * 20)
            return
        except Exception as e:
            print(url1, e)
            pass


# Driver: run getid over every category not yet recorded in finished.txt.
with open('all_cat.txt') as f:
    all_cat = set([i.strip() for i in f.readlines()])
try:
    with open('finished.txt') as f:
        finish = set([i.strip() for i in f.readlines()])
except:
    # First run: no finished.txt yet.
    finish = set()
# Skip categories already completed on a previous run.
all_cat = all_cat - finish
zongshu = len(all_cat)
pp = Pool(200)
pp.map(getid, all_cat)
try:
    pp.close()
    pp.join()
except:
    pass