def getUrlList(sUrl, jUrl):
    # Grab the 10 nsrc records from this sUrl
    html = urllib2.urlopen(sUrl).read()
    nsrcList = regex_nsrc.findall(html)
    # Build a jump URL for each of the 10 nsrc records retrieved
    jumpUrl = [jUrl + nsrc for nsrc in nsrcList]
    pool = threadPool(5)
    urlList = pool.map(getBaseUrl, jumpUrl)
    pool.close()
    pool.join()

    # for nsrc in nsrcList:
    # 	print ('.'),
    # 	# Test whether (jUrl+nsrc) is reachable;
    # 	# if it redirects too many times or the request fails,
    # 	# skip this (jUrl+nsrc) entry
    # 	html2 = getHtml(jUrl+nsrc)
    # 	if html2 == 'Error':
    # 		print html2
    # 		continue
    # 	# Successfully fetched the page that redirects to Baidu's transcoded version (html2)
    # 	u = regex_url.search(html2)
    # 	if u:
    # 		urlList.append(u.group(1).replace('amp;',''))

    # 	# Jumping straight to the real page is disabled here, since
    # 	# network conditions vary by site and the request could take too long
    # 	# else:
    # 	# 	u = urllib2.urlopen(jUrl+nsrc).geturl()
    # 	# 	urlList.append(u)

    # Return the real URLs that were retrieved
    return urlList
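# getBaseUrl is used by getUrlList above but not shown in this listing. A
# minimal sketch of what it presumably does, assuming the getHtml helper and
# regex_url pattern from the commented-out loop: fetch the jump page and
# extract the real URL, returning None on failure (so urlList may contain
# None entries).
def getBaseUrl(jumpUrl):
    html = getHtml(jumpUrl)
    if html == 'Error':
        return None
    u = regex_url.search(html)
    if u:
        return u.group(1).replace('amp;', '')
    return None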
Example #2
 def start(self,
           keywords,
           url_constructor,
           thread_num=10,
           search_handler=lambda k, t: [{
               'result': t
           }],
           result_handler=lambda k, t: (k, t)):
     self.start_time = time.time()
     self.search_handler = search_handler
     self.result_handler = result_handler
     keyword_urls = []
     results = []
     logging.info('Parser start')
     for keyword in keywords:
         keyword_urls.extend(url_constructor(keyword))
         logging.info('Construct url with {}'.format(keyword))
     logging.info('Start {} thread'.format(thread_num))
     pool = threadPool(thread_num)
     # multi thread debug
     # [results.extend(r) for r in map(self.request, keyword_urls) if r]
     [results.extend(r) for r in pool.map(self.request, keyword_urls) if r]
     logging.info('End all thread')
     pool.close()
     pool.join()
     self.end_time = time.time()
     logging.info('Spend time {}'.format(self.time()))
     return filter(lambda t: bool(t), results)
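# Hypothetical usage of the start() method above (illustrative only): the
# Parser class name, its request method, and make_urls are assumptions, not
# part of the original listing.
def make_urls(keyword):
    return ['http://example.com/search?q={}&page={}'.format(keyword, p)
            for p in range(1, 4)]

parser = Parser()
results = parser.start(['python', 'threading'],
                       url_constructor=make_urls,
                       thread_num=5)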
Example #3
def thControler(targetList):
    print 'starting checking threads.........'
    # construct the pool outside the try block so th is always defined,
    # and clean it up in finally so close/join run even after an error
    th = threadPool(8)
    try:
        th.map(check, targetList)
    except Exception as e:
        print e
    finally:
        th.close()
        th.join()
Example #4
def run_pool(target):
    '''
    :param target: target function for each thread to execute
    '''

    pool = threadPool(50)
    pool.map_async(target, urls)
    pool.close()
    pool.join() 
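# run_pool above uses map_async, which returns an AsyncResult immediately;
# the close()/join() pair is what actually waits for all workers to finish.
# It also relies on a module-level urls list. Hypothetical usage, with fetch
# as an illustrative target function:
def fetch(url):
    print(url)

run_pool(fetch)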
def main():
    wd = 'inurl:php?id='
    wd = urllib.quote(wd)
    ksearchUrl = searchUrl.replace('keyWord', wd)
    for n in range(6, 7):
        n = n * 10
        currentSearchUrl = ksearchUrl.replace('pageNum', str(n))
        print currentSearchUrl
        urlList = getUrlList(currentSearchUrl, jumpUrl)
        autoChk = autoSqli('http://127.0.0.1:8775')
        chkPool = threadPool(3)
        rsts = chkPool.map(autoChk.run, urlList)
        chkPool.close()
        chkPool.join()
    return (0)
def main():
    wd = 'inurl:php?id='
    wd = urllib.quote(wd)
    ksearchUrl = searchUrl.replace('keyWord', wd)
    for n in range(11, 12):
        n = n * 10
        currentSearchUrl = ksearchUrl.replace('pageNum', str(n))
        print currentSearchUrl
        urlList = getUrlList(currentSearchUrl, jumpUrl)
        chkPool = threadPool(processes=3)
        for u in urlList:
            t = autoSqli('http://127.0.0.1:8775')
            chkPool.apply_async(t.run, (u, ))
        chkPool.close()
        chkPool.join()
    return (0)
    def _map(self, func, vals):
        """parallel mapping function

        Args:
            func (Function): to apply
            vals ([object]): list of values to apply to function

        Returns:
            ([object]) list of return values
        """
        cpuc = multiprocessing.cpu_count()
        pool = threadPool(cpuc if self.n_jobs <= -1 or self.n_jobs >= cpuc else self.n_jobs)

        vals = pool.map(func, vals)

        pool.close()
        pool.join()
        return vals
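# The pool-sizing rule from _map above as a standalone helper: use one
# thread per core when n_jobs is -1 (or asks for more threads than there
# are cores), otherwise exactly n_jobs threads. An illustrative sketch, not
# taken from the original code.
import multiprocessing
from multiprocessing.dummy import Pool as threadPool

def parallel_map(func, vals, n_jobs=-1):
    cpuc = multiprocessing.cpu_count()
    pool = threadPool(cpuc if n_jobs <= -1 or n_jobs >= cpuc else n_jobs)
    try:
        return pool.map(func, vals)
    finally:
        pool.close()
        pool.join()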
Example #11
import time
import requests as re  # note: 're' here is the requests library, not the regex module
from multiprocessing import Pool
from multiprocessing.dummy import Pool as threadPool

urls = [
    'http://www.python.org', 'http://www.python.org/about/',
    'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
    'http://www.python.org/doc/', 'http://www.python.org/download/',
    'http://www.python.org/getit/', 'http://www.python.org/community/',
    'https://wiki.python.org/moin/', 'http://planet.python.org/',
    'https://wiki.python.org/moin/LocalUserGroups',
    'http://www.python.org/psf/', 'http://docs.python.org/devguide/',
    'http://www.python.org/community/awards/'
]

start_time = time.time()
pool = threadPool(5)

results = pool.map(re.get, urls)
pool.close()
pool.join()

print('Done! Time taken: {}'.format(time.time() - start_time))
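# For comparison, the same fetch done sequentially (illustrative addition,
# not part of the original example); the pooled version above usually
# finishes much faster because the requests overlap in time.
start_time = time.time()
results = [re.get(url) for url in urls]
print('Sequential done! Time taken: {}'.format(time.time() - start_time))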
Example #12

# Multithreading
import os
import threading
from multiprocessing.dummy import Pool as threadPool


def f(x):
    pid = os.getpid()
    tid = threading.get_ident()
    print(pid, tid, x)


if __name__ == '__main__':
    tpool = threadPool(4)
    for i in range(1, 11):
        tpool.apply_async(f, args=[i])
    tpool.close()
    tpool.join()

# Coroutines
import os
import threading
import asyncio


async def f(x):
    pid = os.getpid()
    tid = threading.get_ident()
    print(pid, tid, x)


async def main():
    # run f for 1..10 concurrently, mirroring the thread-pool version above
    await asyncio.gather(*(f(i) for i in range(1, 11)))


if __name__ == '__main__':
    asyncio.run(main())
Example #13
def main():
    pool = threadPool(20)
    proxies = Proxy().get_proxies()
    with open('proxy.txt', 'w') as f:
        for proxy in filter(bool, pool.map(detect_alive_proxy, proxies)):
            f.write(proxy + '\n')
    pool.close()
    pool.join()
Example #14
def multiThread(func, argList):
    connPool = threadPool(20)
    rtProcList = connPool.map(func, argList)
    connPool.close()
    connPool.join()
    return rtProcList
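# Hypothetical usage of multiThread above; square is illustrative. The
# return value is the list of per-item results collected by pool.map.
def square(x):
    return x * x

print(multiThread(square, [1, 2, 3, 4, 5]))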
Example #15
    for movie in movie_from_url(url):
        write_to_file(movie.json())


if __name__ == '__main__':
    # # Multiprocessing
    # start = clock()
    # pool = Pool(1)
    # pool.map(main, [i * 10 for i in range(10)])
    # pool.close()
    # pool.join()
    # end = clock()
    # print(start, end)
    # print((end - start))
    # #
    # # Single process
    # start = clock()
    # for i in range(10):
    #     main(i * 10)
    # end = clock()
    # print((end - start))
    # Multithreading
    start = clock()
    pool = threadPool(1)
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    end = clock()
    print(start, end)
    print((end - start))