def port_test(self, port):
    # start spec port testing
    sub_record = []
    # this should use map to reduce time usage
    pool = ThreadPool(len(self.urls))
    results = pool.map(self.url_test, self.urls)
    return sum(results)
async def scrawl(self, threads=5):
    logger.log('Scrawling Trackemon..', 'green')
    await self.client.wait_until_ready()
    # collect the channel ids we need to post to
    shout_out_channels = []
    for server in self.client.servers:
        for channel in server.channels:
            if channel.name in self.config.get('scrawl_channels', []):
                shout_out_channels.append(discord.Object(channel.id))
    if len(shout_out_channels) == 0:
        raise Exception("No channel to shout out!")
    while not self.client.is_closed:
        logger.log('Scrawling Trackemon..', 'green')
        self._retrieve_session_id()
        # use multiprocessing
        if 'pokemons' in self.config.get('scrawl_trackemon'):
            pokemon_names = self.config.get('scrawl_trackemon')['pokemons']
            pool = ThreadPool(threads)
            messages = pool.starmap(self.scrawl_trackemon,
                                    zip(pokemon_names, itertools.repeat(self.session_id)))
            for message in messages:
                if len(message):
                    for channel in shout_out_channels:
                        await self.client.send_message(channel, message)
        # increase delay to finish task
        await asyncio.sleep(self.config.get('delay_scrawl', 300))
def __init__(self, records, outfile):
    # cat_xml = "../itma.cat.soutron_20160216.xml"
    print 'Parsing XML data...'
    # records = etree.parse(cat_xml)
    self.counter = 0
    self.roles = ['performer']
    self.locations = []
    cfields = []
    refnos = []
    self.records = records
    [[cfields.append(y) for y in x.xpath('*[self::Creator or self::Contributors]')] for x in records]
    [[self.locations.append(y) for y in x.xpath('GeographicalLocation/text()')] for x in records]
    map(self.parse_roles, cfields)
    self.roles = list(set([x.replace('\n', '').strip()
                           for x in filter(lambda x: len(x) > 1, self.roles)]))
    self.locations = list(set(self.locations))
    self.role_list = etree.Element("NamedRoles")
    self.cache = {}
    print 'extracting roles...'
    pool = Pool(processes=4)
    pool.map(self.process_recordlist, records)
    etree.ElementTree(self.role_list).write(outfile, pretty_print=True)
def assync_users_proceed(self, users_pool, threads):
    pool = ThreadPool(threads)
    try:
        full_users = pool.map(self.get_user_info, users_pool)
    except Exception, e:
        print e
        full_users = []
def download_urls_to_zip(zf, urls):
    urls = set(urls)
    pool = ThreadPool(10)
    download_to_zip_func = lambda url: download_url_to_zip(zf, url)
    pool.map(download_to_zip_func, urls)
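# Hedged usage sketch for download_urls_to_zip above: it assumes a helper
# download_url_to_zip(zf, url) defined elsewhere that fetches one URL and
# writes it into the shared zip handle. The archive name and URLs below are
# illustrative only.
import zipfile

if __name__ == '__main__':
    with zipfile.ZipFile('downloads.zip', 'w') as zf:
        download_urls_to_zip(zf, [
            'https://example.com/a.pdf',
            'https://example.com/b.pdf',
        ])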
def generate_k_clusters(self, folder, size):
    pool = ThreadPool(cpu_count())
    self.size = size
    result = pool.map(self.read_image, folder)
    self.cluster = [r[0] for r in result if r[0]]
    self.data = [r[1] for r in result if r[1]]
    self.end = [r[2] for r in result if r[2]]
def test_multi_threading():
    import time
    import random
    from multiprocessing.dummy import Pool

    def op_a(a, b):
        time.sleep(random.random() * .02)
        return a + b

    def op_b(c, b):
        time.sleep(random.random() * .02)
        return c + b

    def op_c(a, b):
        time.sleep(random.random() * .02)
        return a * b

    pipeline = compose(name="pipeline", merge=True)(
        operation(name="op_a", needs=['a', 'b'], provides='c')(op_a),
        operation(name="op_b", needs=['c', 'b'], provides='d')(op_b),
        operation(name="op_c", needs=['a', 'b'], provides='e')(op_c),
    )

    def infer(i):
        # data = open("616039-bradpitt.jpg").read()
        outputs = ["c", "d", "e"]
        results = pipeline({"a": 1, "b": 2}, outputs)
        assert tuple(sorted(results.keys())) == tuple(sorted(outputs)), (outputs, results)
        return results

    N = 100
    for i in range(20, 200):
        pool = Pool(i)
        pool.map(infer, range(N))
        pool.close()
def put_from_manifest(
    s3_bucket,
    s3_connection_host,
    s3_ssenc,
    s3_base_path,
    aws_access_key_id,
    aws_secret_access_key,
    manifest,
    bufsize,
    concurrency=None,
    incremental_backups=False,
):
    """
    Uploads files listed in a manifest to amazon S3.
    To support larger than 5GB files, multipart upload is used (chunks of 60MB).
    Files are uploaded compressed with lzop, the .lzo suffix is appended.
    """
    bucket = get_bucket(s3_bucket, aws_access_key_id, aws_secret_access_key, s3_connection_host)
    manifest_fp = open(manifest, "r")
    buffer_size = int(bufsize * MBFACTOR)
    files = manifest_fp.read().splitlines()
    pool = Pool(concurrency)
    for _ in pool.imap(
            upload_file,
            ((bucket, f, destination_path(s3_base_path, f), s3_ssenc, buffer_size) for f in files)):
        pass
    pool.terminate()
    if incremental_backups:
        for f in files:
            os.remove(f)
def main():
    pool = ThreadPool(4)
    terms_to_articles = {}
    t0 = time()
    for term in search_terms:
        print("Getting articles for {}...".format(term))
        article_urls = get_articles_urls_for(term)
        articles = pool.map(get_article, article_urls)
        terms_to_articles[term] = articles
    print("Fetching articles took {:.1f} seconds".format(time() - t0))
    for term in search_terms:
        articles = terms_to_articles[term]
        print("Articles for {} ({}):".format(term, len(articles)))
        for article in articles:
            print(u"  == {} ==".format(article.title))
            print(u"  {}...".format(article.text[:70]))
            print(u"  - {}".format(article.url))
        print
    with open('articles.pickle', 'wb') as f:
        pickle.dump(terms_to_articles, f)
def test_upload_chunk__expired_url():
    upload_parts = [{'uploadPresignedUrl': 'https://www.fake.url/fake/news', 'partNumber': 420},
                    {'uploadPresignedUrl': 'https://www.google.com', 'partNumber': 421},
                    {'uploadPresignedUrl': 'https://rito.pls/', 'partNumber': 422},
                    {'uploadPresignedUrl': 'https://never.lucky.gg', 'partNumber': 423}]
    value_doesnt_matter = None
    expired = Value(c_bool, False)
    mocked_get_chunk_function = MagicMock(side_effect=[1, 2, 3, 4])

    with patch.object(multipart_upload, "_put_chunk",
                      side_effect=SynapseHTTPError("useless message",
                                                   response=MagicMock(status_code=403))) as mocked_put_chunk, \
            patch.object(warnings, "warn") as mocked_warn:

        def chunk_upload(part):
            return _upload_chunk(part, completed=value_doesnt_matter, status=value_doesnt_matter,
                                 syn=syn, filename=value_doesnt_matter,
                                 get_chunk_function=mocked_get_chunk_function,
                                 fileSize=value_doesnt_matter, partSize=value_doesnt_matter,
                                 t0=value_doesnt_matter, expired=expired,
                                 bytes_already_uploaded=value_doesnt_matter)

        # 2 threads both with urls that have expired
        mp = Pool(4)
        mp.map(chunk_upload, upload_parts)
        assert_true(expired.value)

        # assert warnings.warn was only called once
        mocked_warn.assert_called_once_with("The pre-signed upload URL has expired. Restarting upload...\n")

        # assert _put_chunk was called at least once
        assert_greater_equal(len(mocked_put_chunk.call_args_list), 1)
def BurstDz(host, path, user, passfile):
    hostuser = host.split('.')
    hostuser = hostuser[len(hostuser) - 2]
    hostdir = [hostuser, hostuser + hostuser, 'admin' + hostuser, hostuser + '123',
               'manage' + hostuser, hostuser + '123456', hostuser + 'admin', '123' + hostuser]
    opts_list = []
    f = open(passfile, 'r')
    password = f.read().split()
    dic = password + hostdir
    pool = ThreadPool(10)
    host1 = host + path
    for x in range(len(dic)):
        mima = dic[x]
        opts = {
            'host': host1,
            'user': user,
            'password': mima
        }
        opts_list.append(opts)
    # print hostr
    # print result
    pool.map(LoginDisCuz, opts_list)
    # pool.join()
    print 'All PassWord Run Over'
def format_data(dealer_data):
    start_time = time.time()
    pool = Pool(1)
    dealers = []
    today = datetime.now()
    for data in dealer_data:
        temp = {}
        temp['id'] = data[0]
        temp['service_advisor_id'] = data[1]
        temp['name'] = data[2]
        temp['phone_number'] = data[3]
        temp['order'] = data[4]
        temp['password'] = data[1] + '@123'
        temp['last_login'] = today
        temp['is_superuser'] = 0
        temp['username'] = data[1]
        temp['first_name'] = ' '
        temp['last_name'] = ' '
        temp['email'] = ''
        temp['is_staff'] = 0
        temp['is_active'] = 1
        temp['date_joined'] = today
        dealers.append(temp)
    pool.map(process_query, dealers)
    end_time = time.time()
    print "..........Total TIME TAKEN.........", end_time - start_time
def _get_item_data(self, itemids, threads=-1):
    """
    Get metadata for many items.

    :param itemids: item numbers
    :param threads: number of cpu threads to use
    :type itemids: list
    :type threads: int
    """
    self.info('getting data')
    self.info('threads = %d', threads)

    # threads make it faster but I've seen it freeze so disabling this for now
    if threads > 1:
        threads = 1
        self.error('multiprocessing seems fishy')
        self.error('setting threads=1')

    # get data from itemids
    if threads > 1:
        from multiprocessing.dummy import Pool as ThreadPool
        import itertools
        params = zip(itemids, range(len(itemids)), itertools.repeat(len(itemids)))
        pool = ThreadPool(threads)
        data = pool.map(self._get_item_data_for_itemid_map, params)
        data = {d['itemid']: d for d in data}
    else:
        data = {}
        for i, itemid in enumerate(itemids):
            data[itemid] = self._get_item_data_for_itemid(itemid, index=i, total=len(itemids))
    return data
def getAllProducts(self):
    """
    multithreaded
    returns a dictionary of information {skus}
    skus is a dictionary with many keys and values
    refer to output.txt to see what information it holds
    """
    skus = {}
    page = 1
    num_pages = 8
    r = None
    found_empty = False
    pool = ThreadPool(num_pages)
    while not found_empty:
        pages = range(page, page + num_pages)
        results = pool.map(lambda x: self._listProducts(page=x), pages)
        # print(results)
        for r in results:
            if str(r.status_code) == "204":
                found_empty = True
                break
            if str(r.status_code).startswith("4"):
                raise Exception("Error {}: {}.".format(
                    r.status_code, BigCommerce.error_codes[int(r.status_code)]))
            temp_data = r.json()
            for item in temp_data:
                sku = item["sku"]
                skus[sku] = item
        page += 1
    return {"skus": skus}
def update_proxy_pool(test_url, timeout, proxy_pool, ready_flag, interval):
    """
    Task run by the daemon process: refresh the proxy pool on a schedule.
    Note that each refresh itself takes ten-odd seconds, so "scheduled"
    here means a fixed interval between refreshes.
    """
    while 1:
        proxy_list = get_proxies(test_url, timeout)  # fetch the new proxy list
        # collect the old proxies that are not in the new list
        pre_test_list = proxy_pool.keys()
        pre_test_list.remove(None)
        for proxy in proxy_list:
            if proxy in proxy_pool:
                # the old proxy is also in the new list, so skip re-testing it
                pre_test_list.remove(proxy)
        # re-test the old proxies and drop the ones that respond too slowly
        if len(pre_test_list) > 0:
            pool = Pool(16)  # create the thread pool
            kwargs = [{'test_url': test_url, 'proxy': proxy, 'timeout': timeout}
                      for proxy in pre_test_list]  # pack the arguments
            response_time_list = pool.map(multi_test_wrapper, kwargs)  # test in parallel
            for i in xrange(len(pre_test_list)):
                # drop old proxies that respond too slowly
                if response_time_list[i] > timeout:
                    del proxy_pool[pre_test_list[i]]
        # merge the new proxy list into the pool
        for proxy in proxy_list:
            if proxy not in proxy_pool:
                # initialise new proxies that are not in the pool yet
                proxy_pool[proxy] = 0
        ready_flag.value = True
        print('Proxy pool updated, the pool now holds', len(proxy_pool), 'proxies')
        sleep(interval)  # wait before the next scheduled refresh
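# Hedged launch sketch for update_proxy_pool above (not part of the original
# snippet): per its docstring it is meant to run in a daemon process with
# proxy_pool and ready_flag shared between processes. get_proxies and
# multi_test_wrapper are assumed to exist elsewhere; the test URL, timeout and
# interval values below are illustrative. The placeholder None key mirrors the
# pre_test_list.remove(None) call in the function.
from multiprocessing import Process, Manager, Value
from ctypes import c_bool

if __name__ == '__main__':
    manager = Manager()
    proxy_pool = manager.dict({None: 0})
    ready_flag = Value(c_bool, False)
    updater = Process(target=update_proxy_pool,
                      args=('http://httpbin.org/ip', 5, proxy_pool, ready_flag, 600))
    updater.daemon = True
    updater.start()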
def main():
    mht_list = get_mht_list()
    if not mht_list:
        print u'Please make sure there are mht files in this directory\n'
        return
    print u'Found %s mht files with images to back up\n' % len(mht_list)
    print u'Please enter your QQ number (6-10 digits):'
    qq = raw_input()
    print u'Searching the mht files for images to back up, please wait....'
    get_mht_pic(mht_list)
    if not mht_pic_md5:
        print u'The mht files contain no images to back up\n'
        return
    print u'Found %s images to back up' % len(mht_pic_md5)
    # QQ image folder
    documents_path = os.getenv('USERPROFILE') + os.sep + '\Documents'
    img_path = documents_path + os.sep + 'Tencent Files/' + qq + '/Image'
    print u'Collecting QQ chat-log images, please wait....'
    pic_list = get_pic_list(img_path)
    if not pic_list:
        print u'QQ chat-log image folder not found, please make sure the QQ number is correct\n'
        main()
    pool = ThreadPool(thread_num)
    print u'Backing up....'
    pool.map(backup, pic_list)
    print u'Backup finished\nImages are saved in the bak folder under the current path\n'
def runLocalCommands(args, outputDir, commands):
    # NOTE: this is going to BREAK meff optimisation if we re-cycle histograms.
    # Needs to be updated to run in successive order if we implement that.
    N = len(commands)

    if N > 50:
        print("")
        print("Are you sure you want to run %d commands locally?" % N)
        if args.dry_run:
            print("[NB: this is a dry run]")
        var = input("Press enter to continue")
        print("")

    cmds = []
    for i, x in enumerate(commands):
        (cuts, name, cmd) = x
        cmd = "cd %s && echo '%d/%d\t%s' && %s 2>&1 >/dev/null" % (outputDir, i + 1, N, cmd, cmd)
        cmds.append(cmd)

    if args.dry_run:
        print("Would run following commands:")
        for cmd in cmds:
            print("  %s" % cmd)
        return

    pool = Pool(10)  # concurrent commands at a time
    for i, returncode in enumerate(pool.imap(partial(subprocess.call, shell=True), cmds)):
        if returncode != 0:
            print(("%d command failed: %d" % (i, returncode)))
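# Minimal sketch of the imap + functools.partial pattern used above, taken out
# of the histogram-making context: partial pins shell=True so each command
# string becomes the single positional argument of subprocess.call. The echo
# commands are illustrative.
import subprocess
from functools import partial
from multiprocessing.dummy import Pool

if __name__ == '__main__':
    cmds = ['echo one', 'echo two', 'echo three']
    pool = Pool(2)
    for i, returncode in enumerate(pool.imap(partial(subprocess.call, shell=True), cmds)):
        if returncode != 0:
            print('%d command failed: %d' % (i, returncode))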
def test_generate_in_progress_resizer_option_true(
    redis_cache,
    resizetarget_opts,
    image1_data,
    image1_name,
    tmpdir
):
    config = Config(
        root=str(tmpdir),
        url='/',
        redis_host=redis_cache.redis,
        raise_on_generate_in_progress=True
    )
    resizer = flask_resize.make_resizer(config)

    # Save original file
    resizer.storage_backend.save(image1_name, image1_data)

    def run(x):
        return resizer(image1_name)

    pool = Pool(2)
    with pytest.raises(flask_resize.exc.GenerateInProgress):
        pool.map(run, [None] * 2)
def test_multithread(self):
    logger = HitLogger(300)

    def single_thread_process(logger):
        time.time = mock.Mock(return_value=0)
        logger.log_hit()
        time.sleep(1)
        time.time = mock.Mock(return_value=10)
        logger.get_hits()
        logger.log_hit()
        logger.get_hits()
        logger.log_hit()
        time.sleep(1)
        time.time = mock.Mock(return_value=11)
        logger.get_hits()
        time.sleep(1)
        time.time = mock.Mock(return_value=100)
        logger.log_hit()
        time.sleep(1)
        time.time = mock.Mock(return_value=200)
        logger.log_hit()
        logger.get_hits()
        time.sleep(1)
        time.time = mock.Mock(return_value=300)
        logger.log_hit()
        logger.get_hits()
        time.sleep(1)
        time.time = mock.Mock(return_value=310)
        logger.get_hits()

    pool = Pool(5)
    pool.map(single_thread_process, [logger] * 5)
    input('Press any key to exit...')
def run(node):
    """
    Primary entry-point for running this module.

    :param node: dict { "url": "https://some-site.com" }
    :return: { document_url: metadata, ... }
    :rtype: dict
    """
    mapper = lambda x: redis_load(x, r)
    url = node.get('url', 'http://www.cic.gc.ca')
    pool = ThreadPool(32)
    docs = redis_docs(url, r)
    metadata = pool.map(mapper, docs)
    return {url2pathname(k): v for k, v in metadata if v}
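# Hedged usage sketch for run() above: per its docstring it only needs a dict
# with a "url" key and returns {document_path: metadata}. It assumes a module
# level redis connection `r` plus the redis_docs/redis_load helpers defined
# elsewhere; the URL below is illustrative.
if __name__ == '__main__':
    metadata_by_path = run({"url": "http://www.cic.gc.ca"})
    for path, meta in metadata_by_path.items():
        print(path, meta)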
def test_threading(self):
    pool = ThreadPool(4)
    results = pool.map(self.parser.parse, Dictionary().version.terms)
    self.assertSetEqual({str(t) for t in results},
                        {'[{0}]'.format(str(t)) for t in Dictionary().version.terms})
    results = pool.map(script, Dictionary().version.terms)
    self.assertSetEqual({str(t) for t in results}, set(Dictionary().version.terms))
def multithread(function, items, extra_variable, threads=2):
    """
    Takes the main function to run in parallel, inputs the variable(s) and returns the results.

    :param function: The main function to process in parallel.
    :param items: A list of strings that are passed into the function for each thread.
    :param extra_variable: One additional variable that can be passed into the function.
    :param threads: The number of threads to use. The default is 2, but the threads are not CPU core bound.
    :return: The results of the function passed into this function.
    """
    if __name__ == '__main__':
        # """ A CPU core dependent multiprocessing technique.
        # The synchronized variant, which locks the main program until a process is finished. Order is retained. """
        # pool = Pool(threads)
        # results = [pool.apply(function, args=(item, extra_variable)) for item in items]
        # pool.close()
        # pool.join()

        # """ A thread dependent multiprocessing technique. Theoretically, an unlimited number of threads can be used.
        # The synchronized variant, which locks the main program until a process is finished. Order is retained. """
        # pool = ThreadPool(threads)
        # results = [pool.apply(function, args=(item, extra_variable)) for item in items]
        # pool.close()
        # pool.join()

        """ A thread dependent multiprocessing technique. Theoretically, an unlimited number of threads can be used.
        The async variant, which submits all processes at once and retrieve the results as soon as finished. """
        pool = ThreadPool(threads)
        output = [pool.apply_async(function, args=(item, extra_variable)) for item in items]
        results = [p.get() for p in output]
        return results
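# Hedged usage sketch for multithread() above: apply_async is called with
# args=(item, extra_variable), so the target must accept exactly two
# positional arguments. scale() and the numbers below are illustrative.
import time

def scale(item, factor):
    time.sleep(0.1)  # stand-in for I/O-bound work
    return item * factor

if __name__ == '__main__':
    print(multithread(scale, [1, 2, 3, 4], 10, threads=4))  # -> [10, 20, 30, 40]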
def test_thread(data_array, word_list):
    def test_update_line(line):
        if len(line) == 1:
            return line
        else:
            for i in range(len(word_list)):
                for j in range(len(line) - 1):
                    if line[j] == word_list[i][0] and line[j + 1] == word_list[i][1]:
                        line[j] = line[j] + line[j + 1]
                        line[j + 1] = ''
            return line

    print data_array
    IS_MUTI_THREAD = True
    MUTI_THREAD_NUM = 3
    if IS_MUTI_THREAD:
        from multiprocessing.dummy import Pool as ThreadPool
    if IS_MUTI_THREAD:
        pool = ThreadPool(MUTI_THREAD_NUM)
        pool.map(test_update_line, data_array)
        data_array = [filter(lambda x: x != '', line) for line in data_array]
    else:
        # for i in range(len(data_array)):
        #     data_array[i] = filter(lambda x:x!='', test_update_line(data_array[i]))
        data_array = [filter(lambda x: x != '', test_update_line(line)) for line in data_array]
    print data_array
def main():
    mht_list = get_mht_list()
    if not mht_list:
        print u'Please make sure there are mht files in this directory\n'
        return
    print u'Found %s mht files with images to back up\n' % len(mht_list)
    print u'Please enter your QQ number (6-10 digits):'
    qq = raw_input()
    print u'Searching the mht files for images to back up, please wait....'
    get_mht_pic(mht_list)
    if not mht_pic_md5:
        print u'The mht files contain no images to back up\n'
        return
    print u'Found %s images to back up' % len(mht_pic_md5)
    # QQ image folder
    key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                          r'Software\Microsoft\Windows\CurrentVersion\Explorer\User Shell Folders')
    documents_path = _winreg.QueryValueEx(key, 'Personal')[0]
    img_path = documents_path + os.sep + 'Tencent Files/' + qq + '/Image'
    print u'Collecting QQ chat-log images, please wait....'
    pic_list = get_pic_list(img_path)
    if not pic_list:
        print u'QQ chat-log image folder not found, please make sure the QQ number is correct\n'
        main()
    pool = ThreadPool(thread_num)
    print u'Backing up....'
    pool.map(backup, pic_list)
    print u'Backup finished\nImages are saved in the bak folder under the current path\n'
def audio_convert(filename):
    # This combines the cutting and the conversions
    cut_files = {}
    text = {}
    error_file = open('error.txt', 'w')
    error_file.write(filename)
    for speed in ['slow', 'fast']:
        if speed == 'slow':
            cut_files[speed] = cut_wave(filename, 0.70)
        else:
            cut_files[speed] = cut_wave(filename, 0.85)
        # assert(False)
        pool = ThreadPool(processes=len(cut_files[speed]))
        text[speed] = pool.map(chunk_convert, cut_files[speed])
        pool.close()
        # text[speed] = [chunk_convert(x) for x in cut_files[speed]]
        print "Closed a pool"
        # Clear out the temporary files created
        for x in cut_files[speed]:
            os.remove(x)
    text = text['slow'] + text['fast']
    text = [x for x in text if len(x) > 0]
    return (text)
def find_process_files(root_dir):
    lock = Lock()
    pool = Pool()

    hash_db = load_hashes(HASH_FILE)
    # Keep changed .pxi hashes in a separate dict until the end
    # because if we update hash_db and multiple files include the same
    # .pxi file the changes won't be detected.
    pxi_hashes = {}

    jobs = []

    for cur_dir, dirs, files in os.walk(root_dir):
        for filename in files:
            in_file = os.path.join(cur_dir, filename + ".in")
            if filename.endswith('.pyx') and os.path.isfile(in_file):
                continue
            for fromext, function in rules.items():
                if filename.endswith(fromext):
                    toext = ".c"
                    with open(os.path.join(cur_dir, filename), 'rb') as f:
                        data = f.read()
                        m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I | re.M)
                        if m:
                            toext = ".cxx"
                    fromfile = filename
                    tofile = filename[:-len(fromext)] + toext
                    jobs.append((cur_dir, fromfile, tofile, function, hash_db, pxi_hashes, lock))

    for result in pool.imap(lambda args: process(*args), jobs):
        pass

    hash_db.update(pxi_hashes)
    save_hashes(hash_db, HASH_FILE)
def abortable_func(func, *args, **kwargs):
    """
    The abortable_func is the wrapper function, which wraps around function type "func", calls it
    in a background thread (multiprocessing.dummy.Thread), and terminates it after "timeout" seconds.
    This function is inspired by
    http://stackoverflow.com/questions/29494001/how-can-i-abort-a-task-in-a-multiprocessing-pool-after-a-timeout
    but is an improvement over the original solution, since the original solution is only applicable
    to a function that takes positional arguments.

    Parameters of the function:
        func - the function that will be called and terminated if it does not return within "timeout" seconds
        *args - positional arguments of "func"
        **kwargs - named arguments of "func" + "timeout" value
    """
    # - Get "timeout" value and create a ThreadPool (multiprocessing.dummy.Pool)
    #   with only 1 worker.
    # - Use functools.partial (https://docs.python.org/3/library/functools.html)
    #   to fit all the arguments of the func into the interface of
    #   Pool.apply_async function
    timeout = kwargs.pop('timeout', None)
    p = ThreadPool(1)
    partial_func = partial(func, **kwargs)
    res = p.apply_async(partial_func, args)
    # - Terminate the thread if it does not return after "timeout" seconds,
    #   otherwise return the returned value of func
    try:
        out = res.get(timeout)
        return out
    except TimeoutError:
        p.terminate()
        return "{}:Timeout exceeded. Process terminated.\r\n".format(args[0])
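# Hedged usage sketch for abortable_func() above: the timeout is passed as a
# keyword argument and popped before the wrapped call, and args[0] is used in
# the timeout message, so the wrapped function needs at least one positional
# argument. slow_fetch and its parameters are illustrative only.
import time

def slow_fetch(host, retries=1):
    time.sleep(5)  # stand-in for a blocking network call
    return 'data from {} ({} retries)'.format(host, retries)

if __name__ == '__main__':
    # returns the result if slow_fetch finishes within 2 seconds,
    # otherwise the "<host>:Timeout exceeded..." string
    print(abortable_func(slow_fetch, 'example.com', retries=3, timeout=2))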
def start():
    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found == None:
        try:
            print('Page', page)
            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select('//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []
            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])
            pool = ThreadPool(THREADS)
            pool.map(pitchfork, albums_on_page)
            page += 1
            # if page > 1:
            #     page_not_found = True
        except IndexError as error:
            print(error)
            page_not_found = True
def main():
    n = 100000
    m = 10
    m2 = 1000
    create_db()
    pool = Pool(processes=5)

    start = time.time()
    fill(n)
    fill_time = time.time() - start
    print('{} inserts in {}s'.format(n, fill_time))

    db = get_db()
    print(db.directories.find().count(), 'directories')

    start = time.time()
    results = []
    for _ in range(m):
        results.append(pool.apply_async(read, ()))
        # results.append(pool.apply_async(read_dataset, ()))
    for i in range(m2):
        results.append(pool.apply_async(read_one, ()))
        # if i%10 == 0:
        #     results.append(pool.apply_async(fill, (1,)))
    for r in results:
        r.get(timeout=1000000)
    read_time = time.time() - start
    pool.terminate()
    print('{}.{} reads in {}s'.format(m, m2, read_time))
def spider():
    # initialize the count
    global COUNT
    global TOTAL
    COUNT = 0
    # connect the local mongodb
    conn = pymongo.Connection()
    db = conn.adsdata
    logfile = open("./runtime.log", "a")
    enzyme_content = KEGG.getList('enzyme')
    enzyme_lines = enzyme_content.split('\n')
    TOTAL = len(enzyme_lines)
    print('TOTAL: ' + str(TOTAL))
    enzyme_ids = map(lambda line: line.split('\t')[0], enzyme_lines)
    ## multithread inserting
    pool = ThreadPool(10)
    try:
        pool.map(lambda id: insertEnzymeTreeWith_safe(id, db), enzyme_ids)
    except Exception, e:
        print("Error: " + e.message)
# Main
start = timeit.default_timer()
session1 = requests.Session()
response1 = session1.get('http://www.supremenewyork.com/shop/all')
soup1 = bs(response1.text, 'html.parser')
links1 = soup1.find_all('a', href=True)
links_by_keyword1 = []

for link in links1:
    for keyword in keywords_category:
        if keyword in link['href']:
            links_by_keyword1.append(link['href'])

pool1 = ThreadPool(len(links_by_keyword1))
nosession = True
while nosession:
    print('Finding matching products...')
    result1 = pool1.map(product_page, links_by_keyword1)
    for session in result1:
        if not session is None:
            nosession = False
            checkout(session)
            break

stop = timeit.default_timer()
print(stop - start)  # Get the runtime
class QiushiSpider(): def __init__(self): self.url = 'https://www.qiushibaike.com/8hr/page/{}/' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36' } self.url_q = Queue() self.pool = Pool(5) self.total_response_nums = 0 self.total_request_nums = 0 self.is_running = True def makeUrlList(self): """构造所有的url放入队列""" for i in range(1, 14): self.url_q.put(self.url.format(i)) self.total_request_nums += 1 # 请求数 + 1 (url数 + 1) def getHtml(self, url): """对一个url发送请求获取响应并返回""" resp = requests.get(url, headers=self.headers) return resp.text def parseItem(self, html_str): """提取一个响应中的数据,并返回多条数据构成的list""" html = etree.HTML(html_str) # 先分组,再提取 div_list = html.xpath('//div[@id="content-left"]/div') result_list = [] # 构造最终返回的结果列表 for div in div_list: item = {} item['name'] = div.xpath('.//h2/text()')[0].strip() # 用户昵称 item['text'] = div.xpath( './/div[@class="content"]/span/text()') # 主要内容 # print(item) result_list.append(item) # print(result_list) return result_list def saveResultList(self, result_list): """保存一个响应中的多条数据组成的列表""" # print(result_list) for item in result_list: print(item) def excute_requests_item_save(self): """从队列中拿出一个url,直到处理完成""" url = self.url_q.get() html_str = self.getHtml(url) result_list = self.parseItem(html_str) self.saveResultList(result_list) self.total_response_nums += 1 # 总响应数 + 1 def _callback(self, xxx): # callback指定的函数必须接收一个参数,哪怕用不上! print(xxx) # xxx就是excute_requests_item_save这个函数的返回值 """apply_async异步执行的函数执行完毕后,会把该函数返回的结果作为参数传入callback指定的函数中""" if self.is_running: self.pool.apply_async(self.excute_requests_item_save, callback=self._callback) def run(self): """爬虫运行逻辑""" # 构造url队列 self.makeUrlList() # 利用线程池中的线程异步的不断的去执行:处理一个url直到处理完毕 for i in range(5): # 这才是控制并发的规模! self.pool.apply_async(self.excute_requests_item_save, callback=self._callback) while 1: time.sleep(1) # 一定也要睡一会儿!不然太快导致醒不过来! # 程序退出的逻辑: 总响应数 == url总数 --> 程序退出 if self.total_response_nums >= self.total_request_nums: print('=') print(self.total_request_nums) print(self.total_response_nums) print(self.url_q.qsize()) print('=') self.is_running = False break print('程序结束了!')
def crawlToCSV(URLrecord):
    OpenSomeSiteURL = urllib2.urlopen(URLrecord)
    Soup_SomeSite = BeautifulSoup(OpenSomeSiteURL, "lxml")
    OpenSomeSiteURL.close()

    tbodyTags = Soup_SomeSite.find("tbody")
    trTags = tbodyTags.find_all("tr", class_="result-item ")

    placeHolder = []
    for trTag in trTags:
        tdTags = trTag.find("td", class_="result-value")
        tdTags_string = tdTags.string
        placeHolder.append(tdTags_string)
    return placeHolder


if __name__ == "__main__":
    fileName = "SomeSiteValidURLs.csv"
    pool = Pool(cpu_count() * 2)  # Creates a Pool with cpu_count * 2 threads.
    with open(fileName, "rb") as f:
        # results is a list of all the placeHolder lists returned from each call to crawlToCSV
        results = pool.map(crawlToCSV, f)
    with open("Output.csv", "ab") as f:
        writeFile = csv.writer(f)
        for result in results:
            writeFile.writerow(result)
def parallel_apply(func, iterable, workers, max_queue_size, callback=None, dummy=False):
    """Apply func to every element of iterable using multiple processes or threads.
    Note that this apply is asynchronous and unordered: the inputs may be a, b, c in
    that order, but the outputs may come back as func(c), func(a), func(b).

    Arguments:
        dummy: False for multiprocessing, True for multithreading;
        callback: callback applied to each single output.
    """
    if dummy:
        from multiprocessing.dummy import Pool, Queue
    else:
        from multiprocessing import Pool, Queue

    in_queue, out_queue = Queue(max_queue_size), Queue()

    def worker_step(in_queue, out_queue):
        # wrap the single-step function into an endless loop
        while True:
            d = in_queue.get()
            r = func(d)
            out_queue.put(r)

    # start the processes/threads
    pool = Pool(workers, worker_step, (in_queue, out_queue))

    if callback is None:
        results = []

    # post-processing function
    def process_out_queue():
        out_count = 0
        for _ in range(out_queue.qsize()):
            d = out_queue.get()
            out_count += 1
            if callback is None:
                results.append(d)
            else:
                callback(d)
        return out_count

    # feed in data, collect results
    in_count, out_count = 0, 0
    for d in iterable:
        in_count += 1
        while True:
            try:
                in_queue.put(d, block=False)
                break
            except six.moves.queue.Full:
                out_count += process_out_queue()
        if in_count % max_queue_size == 0:
            out_count += process_out_queue()

    while out_count != in_count:
        out_count += process_out_queue()

    pool.terminate()

    if callback is None:
        return results
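# Hedged usage sketch for parallel_apply() above: with dummy=True it uses
# multiprocessing.dummy (threads), which suits I/O-bound or lightweight work.
# The tokenize() stand-in and the sample corpus are illustrative; when
# dummy=False the function must be picklable.
def tokenize(text):
    return text.lower().split()

if __name__ == '__main__':
    corpus = ['First sentence here', 'Second sentence there', 'Third one']
    tokens = parallel_apply(tokenize, corpus, workers=4, max_queue_size=8, dummy=True)
    print(tokens)  # order is not guaranteed to match the input order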
def downloader(game_log_id):
    url = f'http://game.thronemaster.net/?game={game_log_id}&show=log'
    try:
        file_name = 'game_logs/' + str(url[url.find('game=') + 5:].split('&')[0])
        r = requests.get(url)
        with open(file_name, 'wb') as file:
            file.write(r.content)
    except Exception as e:
        if game_log_id % 100 == 0:
            print('FAILED')
            print(e)
    time.sleep(0.33)
    if game_log_id % 100 == 0:
        print('finished')
    return


def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield f


if __name__ == '__main__':
    pool = ThreadPool(4)
    game_ids = range(80000, 140000)
    downloaded_logs = listdir_nohidden(os.getcwd() + '/game_logs/')
    downloaded_logs = [int(log) for log in downloaded_logs]
    game_ids = [x for x in game_ids if x not in downloaded_logs]
    print(len(game_ids))
    results = pool.map(downloader, game_ids)
print('-' * 60)
print('Please wait, scanning remote host ', remote_server_ip)
print('-' * 60)

socket.setdefaulttimeout(0.5)


def scan_port(port):
    try:
        s = socket.socket(2, 1)
        res = s.connect_ex((remote_server_ip, port))
        if res == 0:
            # the port is open; a hello could be sent here to grab the banner
            print('Port {}: OPEN'.format(port))
        s.close()
    except Exception as e:
        print(str(e))


ports = [i for i in range(1, 1025)]

# Check what time the scan started
t1 = datetime.now()

pool = ThreadPool(processes=16)
results = pool.map(scan_port, ports)
pool.close()
pool.join()

print('Multiprocess Scanning Completed in ', datetime.now() - t1)
def fetch_data(term):
    try:
        response = requests.get(
            BASE_URL + term,
            params={'key': API_KEY},
        )  # <6>
    except requests.HTTPError as err:
        print(err)
        return []
    else:
        data = response.json()  # <7>
        parts_of_speech = []
        for entry in data:  # <5>
            if isinstance(entry, dict):
                meta = entry.get("meta")
                if meta:
                    part_of_speech = entry.get("fl")
                    if part_of_speech:
                        parts_of_speech.append(part_of_speech)
        return sorted(set(parts_of_speech))  # <8>


p = Pool(POOL_SIZE)  # <9>
results = p.map(fetch_data, search_terms)  # <10>

for search_term, result in zip(search_terms, results):  # <11>
    print("{}:".format(search_term.upper()))
    if result:
        print(result)
    else:
        print("** no results **")
def run_speedtest(args, conf): """Initializes all the data and threads needed to measure the relays. It launches or connect to Tor in a thread. It initializes the list of relays seen in the Tor network. It starts a thread to read the previous measurements and wait for new measurements to write them to the disk. It initializes a class that will be used to order the relays depending on their measurements age. It initializes the list of destinations that will be used for the measurements. It initializes the thread pool that will launch the measurement threads. The pool starts 3 other threads that are not the measurement (worker) threads. Finally, it calls the function that will manage the measurement threads. """ global rd, pool, controller controller, _ = stem_utils.init_controller( path=conf.getpath('tor', 'control_socket')) if not controller: controller = stem_utils.launch_tor(conf) else: log.warning( 'Is sbws already running? ' 'We found an existing Tor process at %s. We are not going to ' 'launch Tor, nor are we going to try to configure it to behave ' 'like we expect. This might work okay, but it also might not. ' 'If you experience problems, you should try letting sbws launch ' 'Tor for itself. The ability to use an already running Tor only ' 'exists for sbws developers. It is expected to be broken and may ' 'even lead to messed up results.', conf.getpath('tor', 'control_socket')) time.sleep(15) # When there will be a refactor where conf is global, this can be removed # from here. state = State(conf.getpath('paths', 'state_fname')) # XXX: tech-debt: create new function to obtain the controller and to # write the state, so that a unit test to check the state tor version can # be created # Store tor version whenever the scanner starts. state['tor_version'] = str(controller.get_version()) # Call only once to initialize http_headers settings.init_http_headers(conf.get('scanner', 'nickname'), state['uuid'], state['tor_version']) # To do not have to pass args and conf to RelayList, pass an extra # argument with the data_period measurements_period = conf.getint('general', 'data_period') rl = RelayList(args, conf, controller, measurements_period, state) cb = CB(args, conf, controller, rl) rd = ResultDump(args, conf) rp = RelayPrioritizer(args, conf, rl, rd) destinations, error_msg = DestinationList.from_config( conf, cb, rl, controller) if not destinations: fail_hard(error_msg) max_pending_results = conf.getint('scanner', 'measurement_threads') pool = Pool(max_pending_results) try: main_loop(args, conf, controller, rl, cb, rd, rp, destinations, pool) except KeyboardInterrupt: log.info("Interrupted by the user.") stop_threads(signal.SIGINT, None) # Any exception not catched at this point would make the scanner stall. # Log it and exit gracefully. except Exception as e: log.critical(FILLUP_TICKET_MSG) log.exception(e) stop_threads(signal.SIGTERM, None, 1)
def main(artist_list, output_dir): start = time.time() global num_images print('gathering links to images...') # Create a threadpool with one less than the total number of CPUs of the machine # this allows fast multiprocessing w/out overloading the machine threadpool = Pool(multiprocessing.cpu_count() - 1) # Use imap to run the get_painting_list_by_artist function # different threads get different first inputs (numbers) # but typep and searchword are repeated wikiart_pages = threadpool.imap(get_painting_list_by_artist, zip(artist_list)) # Close and join the threadpool as per recommended usage threadpool.close() threadpool.join() # Convert the wikiart iterator into pages and then items list pages = [page for page in wikiart_pages if page] items = [item for sublist in pages for item in sublist] num_images = len(items) # Create the output_dir if not os.path.isdir('%s/' % (output_dir)): os.mkdir('%s/' % (output_dir)) threadpool = Pool(multiprocessing.cpu_count() - 1) # Download images failed_urls = [] print('attempting to download %d images' % num_images) threadpool.starmap( downloader, zip(enumerate(items), itertools.repeat(output_dir), itertools.repeat(failed_urls))) threadpool.close() threadpool.join() print("Took %d seconds to complete the first run." % (time.time() - start)) print("Attempting to gather the failed URLs") # Fix failed urls corrected_urls = [] for i, url in enumerate(failed_urls): #Show progress if i % 100 == 0: print(str(i) + " out of " + str(len(failed_urls))) # Extract the correct part of the url url = url[1] extracted = re.findall(r'/images/(.*?).jpg', url) full_url = "https://www.wikiart.org/en/" + extracted[0] # Get the soup from the page soup = BeautifulSoup(urllib.request.urlopen(full_url), "lxml") # Try finding the correct link with .jpg regex = r'<meta content="https://uploads[0-9].wikiart.org(.*?).jpg' corrected_link = re.search(regex, str(soup.html())) success = False if corrected_link: # Get the url part of the corrected link (drop <meta content... etc.) full_correct_url = corrected_link.group(0)[15:] corrected_urls.append(full_correct_url) success = True # If jpg doesn't work else: # Try other file formats reg_list = [ r'<meta content="https://uploads[0-9].wikiart.org(.*?).png', r'<meta content="https://uploads[0-9].wikiart.org(.*?).jpeg', r'<meta content="https://uploads[0-9].wikiart.org(.*?).Jpeg', r'<meta content="https://uploads[0-9].wikiart.org(.*?).JPG', r'<meta content="https://uploads[0-9].wikiart.org(.*?).PNG', ] for regex in reg_list: if success == False: corrected_link = re.search(regex, str(soup.html())) if corrected_link: full_correct_url = corrected_link.group(0)[15:] corrected_urls.append(full_correct_url) success = True # If it fails again, move on if success == False: print("fail: " + full_url) print("Downloading corrected URLs") threadpool = Pool(multiprocessing.cpu_count() - 1) failed_urls_v2 = [] threadpool.starmap( downloader, zip(enumerate(corrected_urls), itertools.repeat(output_dir), itertools.repeat(failed_urls_v2))) # close and join the threadpool as per recommended usage threadpool.close() threadpool.join() print("Took %d seconds to complete." % (time.time() - start))
            try:
                conn = MySQLdb.connect(host='localhost', user='******', passwd='', port=3306, charset='utf8')
                cur = conn.cursor()
                conn.select_db('bili')
                cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                            [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites,
                             duration, mid, name, article, fans, tag1, tag2, tag3, str(common),
                             honor_click, honor_coins, honor_favourites])
                print "Succeed: av" + str(av)
            except MySQLdb.Error, e:
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])
        else:
            print "Error_Json: " + url
    else:
        print "Error_noCid:" + url
else:
    print "Error_404: " + url


pool = ThreadPool(10)
# results = pool.map(spider, urls)
try:
    results = pool.map(spider, urls)
except Exception, e:
    # print 'ConnectionError'
    print e
    time.sleep(300)
    results = pool.map(spider, urls)

pool.close()
pool.join()
def proxy(num):
    # get_proxy(proxy_urls[1])
    pool = Pool(num)
    pool.map(get_proxy, proxy_urls)
content = ''.join(con)
# iterate over the string, writing no more than 50 characters per line
for i in range(0, len(content), 50):
    f.write(content[i:i + 50] + "\n")


def get_content(html):
    '''
    Crawl and save the data in parallel.
    '''
    urls = []
    for con in html:
        url = con['url']
        name = con['title']
        urls.append({'name': name, 'url': url})
    # number of threads
    pool = Pool(4)
    # use map to crawl in parallel; save_content is the crawl-and-save function,
    # and urls is a list holding the page addresses and the names to save them under
    pool.map(save_content, urls)
    pool.close()
    pool.join()


def main():
    selector = get_response(chapter_url)
    html = get_chapter_content(selector)
def get_current_match_details(s, region, champion_id): matchlist = cass.get_match_history(summoner=s, champions=[champion_id], begin_index=0, end_index=10) len(matchlist) # to fill matchlist try: pool = Pool(10) pool.map(load_match, matchlist) pool.close() pool.join() except: pool.close() return HttpResponse(status=500) response = {} q = {} leagues = cass.get_league_positions(summoner=s, region=region) for league in leagues: q[league.queue.value] = { 'tier': league.tier.value, 'division': league.division.value, 'points': league.league_points } if league.promos is not None: q[league.queue.value]['promos'] = league.promos.progress q[league.queue.value]['notPlayed'] = league.promos.not_played # summoner stats for past 20 matches on a champion stats = { "kills": 0, "deaths": 0, "assists": 0, "totalCs": 0, "cs10": 0, "cs20": 0, "cs30": 0, "gold10": 0, "gold20": 0, "gold30": 0, "wins": 0, "losses": 0 } match_history = [] games10 = 0 games20 = 0 games30 = 0 cs10 = 0 cs20 = 0 cs30 = 0 gold10 = 0 gold20 = 0 gold30 = 0 match_count = 0 for match in matchlist: if (match.region.value == region): match_count += 1 participants = match.participants for participant in participants: if participant.summoner.id == s.id: user = participant break stats["kills"] += user.stats.kills stats["deaths"] += user.stats.deaths stats["assists"] += user.stats.assists stats["totalCs"] += user.stats.total_minions_killed if user.stats.win: stats["wins"] += 1 match_history.append(1) else: stats["losses"] += 1 match_history.append(0) dur = match.duration.seconds try: if dur > 10 * 60: gold10 += user.timeline.gold_per_min_deltas['0-10'] * 10 cs10 += user.timeline.creeps_per_min_deltas['0-10'] * 10 games10 += 1 if dur > 20 * 60: gold20 += (user.timeline.gold_per_min_deltas['0-10'] + user.timeline.gold_per_min_deltas['10-20']) * 10 cs20 += (user.timeline.creeps_per_min_deltas['0-10'] + user.timeline.creeps_per_min_deltas['10-20']) * 10 games20 += 1 if dur > 30 * 60: gold30 += (user.timeline.gold_per_min_deltas['0-10'] + user.timeline.gold_per_min_deltas['10-20'] + user.timeline.gold_per_min_deltas['20-30']) * 10 cs30 += (user.timeline.creeps_per_min_deltas['0-10'] + user.timeline.creeps_per_min_deltas['10-20'] + user.timeline.creeps_per_min_deltas['20-30']) * 10 games30 += 1 except: log.warn('user timeline data does not exist', match.id) stats["kills"] /= 10 stats["deaths"] /= 10 stats["assists"] /= 10 stats["totalCs"] /= 10 try: stats["cs10"] = round(cs10 / games10, 2) stats["cs20"] = round(cs20 / games20, 2) stats["cs30"] = round(cs30 / games30, 2) stats["gold10"] = round(gold10 / games10, 2) stats["gold20"] = round(gold20 / games20, 2) stats["gold30"] = round(gold30 / games30, 2) except: # divide by 0 pass build = {} boots = {} core = {} situational = {} all_items = {} # get recommended build if match_count > 0: try: champ_items = ChampionItems.objects.get(champ_id=user.champion.id) items_blob = ujson.loads(champ_items.item_blob) blob_items = items_blob.items() for item, occurence in blob_items: if int(item) in Items.boots: boots[item] = occurence elif int(item) in Items.full_items: all_items[item] = occurence sorted_all = sorted(all_items, key=all_items.get, reverse=True) core_arr = sorted_all[:3] situational_arr = sorted_all[3:8] for item in core_arr: core[item] = all_items[item] for item in situational_arr: situational[item] = all_items[item] except: pass build['boots'] = boots build['core'] = core build['situational'] = situational response['stats'] = stats response['build'] = build response['leagues'] = q return response
                     parse_dates=True,
                     usecols=['Date', 'Close', '10_MAC', '100_MAC'])
    result = init_dict()
    for date in df.index:
        if gain_lose(df, date):
            update_result(df, date, result)
    result_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    result = []
    for row in result_x:
        if row[1] > THRESH:
            result.append(row)
    if len(result) > 0:
        writer = csv.writer(open(OUTPATH + symbol + '.csv', 'wb'))
        writer.writerows(result)
    return


if __name__ == '__main__':
    if os.path.exists('out'):
        shutil.rmtree('out')
    os.makedirs('out')
    files = os.listdir(PATH)
    pool = ThreadPool(4)
    pool.map(calculation, files)
    pool.close()
    pool.join()
def ping(self, targets=list(), filename=str(), status=str()): """ Attempt to ping a list of hosts or networks (can be a single host) :param targets: List - Name(s) or IP(s) of the host(s). :param filename: String - name of the file containing hosts to ping :param status: String - if one of ['alive', 'dead', 'noip'] then only return results that have that status. If this is not specified, then all results will be returned. :return: Type and results depends on whether status is specified: if status == '': return dict: {targets: results} if status != '': return list: targets if targets == status """ if targets and filename: raise SyntaxError("You must specify only one of either targets=[] " "or filename=''.") elif not targets and not filename: raise SyntaxError("You must specify either a list of targets or " "filename='', but not both.") elif filename: targets = self.read_file(filename) my_targets = {'hosts': [], 'nets': []} addresses = [] # Check for valid networks and add hosts and nets to my_targets for target in targets: # Targets may include networks in the format "network mask", or, # a file could contain multiple hosts or IP's on a single line. if len(target.split()) > 1: target_items = target.split() for item in target_items: try: ip = IPAddress(item) # If it is an IPv4 address or mask put in in addresses if ip.version == 4: addresses.append(str(ip)) except AddrFormatError: # IP Address not detected, so assume it's a host name my_targets['hosts'].append(item) except ValueError: # CIDR network detected net = IPNetwork(item) # Make sure it is a CIDR address acceptable to fping if net.ip.is_unicast() and net.version == 4 and \ net.netmask.netmask_bits() in range(8, 31): my_targets['nets'].append(target_items[0]) else: msg = str(str(net) + ':Only IPv4 unicast addresses' ' with bit masks\n ' ' from 8 to 30 are supported.') raise AttributeError(msg) # Iterate over the IP strings in addresses while len(addresses) > 1: ip = IPAddress(addresses[0]) mask = IPAddress(addresses[1]) # Test to see if IP is unicast, and mask is an actual mask if ip.is_unicast() and mask.is_netmask(): net = IPNetwork(str(ip) + '/' + str( mask.netmask_bits())) # Convert ip and mask to CIDR and remove from addresses my_targets['nets'].append(str(net.cidr)) addresses.pop(0) addresses.pop(0) elif ip.is_unicast() and not ip.is_netmask(): # mask was not a mask so only remove IP and start over my_targets['hosts'].append(str(ip)) addresses.pop(0) # There could be one more item in addresses, so check it if addresses: ip = IPAddress(addresses[0]) if ip.is_unicast() and not ip.is_netmask(): my_targets['hosts'].append(addresses[0]) addresses.pop() # target has only one item, so check it else: try: ip = IPAddress(target) if ip.version == 4 and ip.is_unicast() and \ not ip.is_netmask(): my_targets['hosts'].append(target) else: msg = str(target + 'Only IPv4 unicast addresses are ' 'supported.') raise AttributeError(msg) except AddrFormatError: # IP Address not detected, so assume it's a host name my_targets['hosts'].append(target) except ValueError: # CIDR network detected net = IPNetwork(target) if net.ip.is_unicast() and net.version == 4 and \ net.netmask.netmask_bits() in range(8, 31): my_targets['nets'].append(target) else: msg = str(str(net) + ':Only IPv4 unicast addresses' ' with bit masks\n ' ' from 8 to 30 are supported.') raise AttributeError(msg) """ Build the list of commands to run. 
""" commands = [] if len(my_targets['hosts']) != 0: for target in range(len(my_targets['hosts'])): commands.append([self.fping, '-nV', my_targets['hosts'][ target]]) if len(my_targets['nets']) != 0: for target in range(len(my_targets['nets'])): commands.append([self.fping, '-ngV', my_targets['nets'][ target]]) """ Start pinging each item in my_targets and return the requested results when done. """ pool = ThreadPool(self.num_pools) raw_results = pool.map(self.get_results, commands) pool.close() pool.join() self.results = {host: result for host, result in csv.reader( ''.join(raw_results).splitlines())} if not status: return self.results elif status == 'alive': return self.alive elif status == 'dead': return self.dead elif status == 'noip': return self.noip else: raise SyntaxError("Valid status options are 'alive', 'dead' or " "'noip'")
def run(self):
    pool = ThreadPool(4)
    pool.map(self.retrieve_pic, self.process_multiple())
    pool.close()
    pool.join()
# -*- coding: utf-8 -*- import urllib2 import urllib from multiprocessing.dummy import Pool as ThreadPool from threading import Thread def postHttp(i): name = urllib.quote('我日') url = 'https://hd.ysfaisco.cn/ajax/hdgame_h.jsp?cmd=setMbPlayerVotes&aid=11590276&gameId=1&openId='+i+'&style=49&name='+name+'&playerId=10604&otherPlayerId=6860' header = {} header['User-Agent'] = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 MicroMessenger/5.0.1' request = urllib2.Request(url,headers=header) response = urllib2.urlopen(request) result = response.read() result = result.strip() print result if __name__ == '__main__': openid_list = ["oHRvawPnAW3RIxAElC8EKHtyCl1Y", "oHRvawA-pq-3dL3WkVAwOFM4y4Ss", "oHRvawLEo_hYamjiHGK_QFxr3vRU", "oHRvawBVAuoD1AqeK_ptmU9QqTU0", "oHRvawMGC7M97rHf-IS68odntdY8", "oHRvawE-wN6Ju7Qt9RWBOxWrogck", "oHRvawKHhGo8aNMRGd4HrF75x7bg", "oHRvawDQGSsGGjiCgvJCts0lXmQQ", "oHRvawG1jWXDaC5fNx4GOgroHepk", "oHRvawF-06-oX6yPOh65vWU0ORwM", "oHRvawByQkdkqcQdPjLT9TxXwXSc", "oHRvawDH1o82EfC_LFAt9W-l9olU", "oHRvawPfwBSC59WpDPlxIiGGVb6w", "oHRvawCPx4jbjrPi3jL2etQBMu7M", "oHRvawHgyW3DS8YNs44k0pCLspPk", "oHRvawDifKhMsHnWUiyrX6saAAdA", "oHRvawPyQbRCHNpQkFOn3_XLcc6E", "oHRvawNyKI-CSTeJP5o3G_46_Xww", "oHRvawOHjbl5xCJciA6fHNqmzQMo", "oHRvawEP6seYQw9C47gL6PHSFn0Q", "oHRvawI3summPXydFMLO46axcG-c", "oHRvawHhGSI-KvkagqOsztoxPfLc", "oHRvawCALKFm1_YgwpgrEaJ_Twq8", "oHRvawJwaHE__eV69PpS79-mAqEk", "oHRvawCRcdBEtG_0D6HHTUc6_X6k", "oHRvawOrvvdOjgVD9QSZ0trElJKM", "oHRvawPr0tPe7CTf2DXdaG_CDdv0", "oHRvawACRyrjIwC5THma5KBVhvA4", "oHRvawHsSLmvRslxC9PC6TX0FbPU", "oHRvawFNfDbRJM4DbIx4IDu6cngU", "oHRvawEX-eNOX44o7LT9zPMYJ9j0", "oHRvawPBnASXJnqY-2VVYpmnlMEU", "oHRvawKy-RW-8Q0sLcst5bY2kA_8", "oHRvawMaIy8Q4H0qBb7nP8Yy3olQ", "oHRvawI9btgFpo7S3Jqw31x-UeoQ", "oHRvawPLpqEFli7diR0QXCdbSR20", "oHRvawASK0KTR7venPHHsXbZJCJo", "oHRvawIT3HBMlY5ZH9oT2fjS7_e4", "oHRvawMaoaXJC5AlS_Rh59n9hlb8", "oHRvawODU7wOWORip_38sizUda_I", "oHRvawHfYM0eHTwieeVEOcoCgkXk", "oHRvawAqCBY5LBCL897-xVOGtEgw", "oHRvawHwu3ekeyrC9Y1GKixkNDqQ", "oHRvawJEjULKo711VQM7H_IG0Eo8", "oHRvawA5o_8Olo8tan5AEJv3SyDE", "oHRvawLHh_n-BTu9G-B0XOzi7eyA", "oHRvawFzkPfjbfX9P-sF04EDd4tc", "oHRvawAvcfeCNHHqi42cjkxwGQ_w", "oHRvawKGiht7E0ooJr1k1LowHesA", "oHRvawBEw0-33b8BJGWlzGQAbbGs", "oHRvawJhRz_VlsHK-SQzYcN05BXM", "oHRvawOPPgmNvjlxHkQ5RcIQRYts", "oHRvawFD2TRk1PAxIZ-mWlpjMKdM", "oHRvawMCd3Q_DSxGgbXYLeU76tTI", "oHRvawOxrhLb2O8Nav_Wu_i5B8g4", "oHRvawOLSfQKRn_zNowgdGD45mew", "oHRvawLgXcCrLQUj2qhuN-7EU4nA", "oHRvawPSsBuHNXiiff_R2jJHJ3xU", "oHRvawN0S5-9al2X70oBxjUjqRWY", "oHRvawKKFywMNfNURTfDcq2ec5wE", "oHRvawDAzdiZzZz5wYUSH27esKxU", "oHRvawCAVxf1u3i0rN62Webn4RKg", "oHRvawDW7htenhAyzVItYrZ6Idec", "oHRvawFZtr5qmR_-XXm2Fb8zkbIA", "oHRvawHCVPt-m9snFkzrQ4rQodh0", "oHRvawDYgh1_0snMXVKcaSYOqc6g", "oHRvawLXic5Qt3C6G-TuheoxturQ", "oHRvawLijvw8ILfQAw3utTy7ppOA", "oHRvawH2auViUvHdBRkL6D3bumpU", "oHRvawBLeZ0MGu2-uUUsAyElFCWE", "oHRvawPhr4x3Vp_A0DJ9k_XA9WV4", "oHRvawCqRp91-1_odIlEetc6m9L4", "oHRvawBsZffzjANt7JCR7NKB9hTM", "oHRvawAnKxvCxqLFl_Vjg5NqSxY0", "oHRvawPLEpkc-MoGFaSoEuVBNTyc", "oHRvawFHxKXFBOtKWIpiBf0qyXiA", "oHRvawLkC4KcghW1S950J2k6NDAQ", "oHRvawM9aahiJtRKkTMDvXb2d8sI", "oHRvawJ_1VBkCdXb8XlBdywOoDoU", "oHRvawITcJLg-pjkl1xRIZi-Iz_Q", "oHRvawDVhDGf2t50Qk-qh3Zd8Wgc"] # postHttp() total = len(openid_list) page_pool = ThreadPool(total) page_pool.map_async(postHttp, (openid_list)) page_pool.close() page_pool.join()
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
url_list = []
for i in li_list:
    detail_url = "https://www.pearvideo.com/" + i.xpath('./div/a/@href')[0]
    name = i.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    detail_page = requests.get(url=detail_url).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page)[0]
    dic = {'name': name, 'url': video_url}
    url_list.append(dic)


def get_video_data(d):
    url = d['url']
    data = requests.get(url=url).content
    print(d['name'], "downloading...")
    with open(d['name'], 'wb') as f:
        f.write(data)
        print(d['name'], "download finished.")


pool = Pool(4)
pool.map(get_video_data, url_list)
pool.close()
pool.join()
def execute_nodes(self, linker, Runner, manifest, node_dependency_list): adapter = get_adapter(self.config) num_threads = self.config.threads target_name = self.config.target_name text = "Concurrency: {} threads (target='{}')" concurrency_line = text.format(num_threads, target_name) dbt.ui.printer.print_timestamped_line(concurrency_line) dbt.ui.printer.print_timestamped_line("") schemas = list(Runner.get_model_schemas(manifest)) node_runners = self.get_runners(Runner, adapter, node_dependency_list) pool = ThreadPool(num_threads) node_results = [] for node_list in node_dependency_list: runners = self.get_relevant_runners(node_runners, node_list) args_list = [] for runner in runners: args_list.append({'manifest': manifest, 'runner': runner}) try: for result in pool.imap_unordered(self.call_runner, args_list): is_ephemeral = Runner.is_ephemeral_model(result.node) if not is_ephemeral: node_results.append(result) node = CompileResultNode(**result.node) node_id = node.unique_id manifest.nodes[node_id] = node if result.errored: dependents = self.get_dependent(linker, node_id) self._mark_dependent_errors(node_runners, dependents, result, is_ephemeral) except KeyboardInterrupt: pool.close() pool.terminate() adapter = get_adapter(self.config) if not adapter.is_cancelable(): msg = ("The {} adapter does not support query " "cancellation. Some queries may still be " "running!".format(adapter.type())) yellow = dbt.ui.printer.COLOR_FG_YELLOW dbt.ui.printer.print_timestamped_line(msg, yellow) raise for conn_name in adapter.cancel_open_connections(): dbt.ui.printer.print_cancel_line(conn_name) dbt.ui.printer.print_run_end_messages(node_results, early_exit=True) pool.join() raise pool.close() pool.join() return node_results
# from multiprocessing import Pool
# import multiprocessing as mp
from multiprocessing.dummy import Pool  # dummy (thread-based) pool
import time
import os


def longTimeTask(i):
    print('task: {}, PID: {}'.format(i, os.getpid()))
    time.sleep(2)
    result = 10**30
    return result


if __name__ == '__main__':
    start_time = time.time()
    print('parent process PID', os.getpid())  # observe the PID
    p = Pool(4)
    # data is a list that collects the return value of each call
    data = p.map(longTimeTask, iterable=[2, 4, 6, 8])  # iterable=range(4) or any list also works
    p.close()
    p.join()
    print(data)
    end_time = time.time()
    print('took {} seconds'.format(end_time - start_time))
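# Hedged contrast to the demo above: because multiprocessing.dummy.Pool uses
# threads, every task reports the same PID and CPU-bound work is still limited
# by the GIL. Swapping in the process-based Pool (commented out at the top of
# the snippet) gives a different PID per worker; everything else is unchanged.
from multiprocessing import Pool as ProcessPool
import os
import time

def long_time_task(i):
    print('task: {}, PID: {}'.format(i, os.getpid()))  # PIDs now differ per worker
    time.sleep(2)
    return 10 ** 30

if __name__ == '__main__':
    with ProcessPool(4) as p:
        print(p.map(long_time_task, [2, 4, 6, 8]))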
class RenderLocaleBatch(object): """Handles the rendering and threading of the controllers.""" BATCH_DEFAULT_SIZE = 300 # Default number of documents in a batch. def __init__(self, jinja_env, profile, tick=None, batch_size=None): self.batch_size = batch_size or self.BATCH_DEFAULT_SIZE self.jinja_env = jinja_env self.profile = profile self.tick = tick self.batches = [[]] self._is_rendering = False self._results = None self._thread_pool = None def __len__(self): count = 0 for batch in self.batches: count = count + len(batch) return count def _get_batch(self): # Ensure that batch is not over the max size. batch = self.batches[len(self.batches) - 1] if len(batch) >= self.batch_size: self.batches.append([]) batch = self.batches[len(self.batches) - 1] return batch def add(self, controller, *args, **kwargs): """Add an item to be rendered to the batch.""" batch = self._get_batch() batch.append({ 'controller': controller, 'jinja_env': self.jinja_env, 'args': args, 'kwargs': kwargs, }) def render_start(self): """Start the batches rendering.""" self._thread_pool = ThreadPool(len(self.batches)) self._results = self._thread_pool.imap_unordered( render_func, self.batches) self._is_rendering = True def render_finish(self): """Finish in progress batches rendering.""" if not self._is_rendering: raise RenderNotStartedError('Rendering was never started') render_errors = [] rendered_docs = [] for batch_result in self._results: render_errors = render_errors + batch_result.render_errors rendered_docs = rendered_docs + batch_result.rendered_docs if self.tick: for _ in batch_result.render_errors: self.tick() for _ in batch_result.rendered_docs: self.tick() for result in batch_result.rendered_docs: self.profile.add_timer(result.render_timer) self._thread_pool.close() self._thread_pool.join() self._is_rendering = False return rendered_docs, render_errors def render_sync(self): """Syncronous rendering for non-threaded rendering.""" render_errors = [] rendered_docs = [] for batch in self.batches: batch_result = render_func(batch, tick=self.tick) render_errors = render_errors + batch_result.render_errors rendered_docs = rendered_docs + batch_result.rendered_docs return rendered_docs, render_errors
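# Hedged usage sketch for RenderLocaleBatch above: jinja_env, profile and the
# controller objects come from the surrounding codebase and are illustrative
# stand-ins here. The flow per the class is add() -> render_start() ->
# render_finish(); render_sync() is the non-threaded fallback.
def render_all(jinja_env, profile, controllers):
    batch = RenderLocaleBatch(jinja_env, profile)
    for controller in controllers:
        batch.add(controller)
    batch.render_start()
    rendered_docs, render_errors = batch.render_finish()
    return rendered_docs, render_errors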
def bc2pg( dataset, db_url, table, schema, query, bounds, bounds_crs, pagesize, max_workers, dim, fid, append, promote_to_multi, no_timestamp, verbose, quiet, ): """Download a DataBC WFS layer to postgres - an ogr2ogr wrapper. \b $ bcdata bc2pg bc-airports --db_url postgresql://postgres:postgres@localhost:5432/postgis The default target database can be specified by setting the $DATABASE_URL environment variable. https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls """ # for this command, default to INFO level logging # (echo the ogr2ogr commands by default) verbosity = verbose - quiet log_level = max(10, 20 - 10 * verbosity) logging.basicConfig(stream=sys.stderr, level=log_level) log = logging.getLogger(__name__) src = bcdata.validate_name(dataset) src_schema, src_table = [i.lower() for i in src.split(".")] if not schema: schema = src_schema if not table: table = src_table # always upper if fid: fid = fid.upper() # create schema if it does not exist conn = pgdata.connect(db_url) if schema not in conn.schemas: click.echo("Schema {} does not exist, creating it".format(schema)) conn.create_schema(schema) # if table does not exist already, remove the -append flag if schema + "." + table not in conn.tables and append: append = False click.echo("Table does not exist, creating") # build parameters for each required request param_dicts = bcdata.define_request( dataset, query=query, sortby=fid, pagesize=pagesize, bounds=bounds, bounds_crs=bounds_crs, ) # run the first request / load payload = urlencode(param_dicts[0], doseq=True) url = bcdata.WFS_URL + "?" + payload db = parse_db_url(db_url) db_string = "PG:host={h} user={u} dbname={db} port={port}".format( h=db["host"], u=db["user"], db=db["database"], port=db["port"], ) if db["password"]: db_string = db_string + " password={pwd}".format(pwd=db["password"]) # create the table command = [ "ogr2ogr", "-f", "PostgreSQL", db_string, "-t_srs", "EPSG:3005", "-nln", schema + "." + table, url, ] if append: command = command + ["-append"] else: command = command + ["-overwrite", "-lco", "GEOMETRY_NAME=geom"] if dim: command = command + ["-dim", dim] # for speed with big loads - unlogged, no spatial index if not append: command = command + ["-lco", "UNLOGGED=ON"] command = command + ["-lco", "SPATIAL_INDEX=NONE"] if promote_to_multi: command = command + ["-nlt", "PROMOTE_TO_MULTI"] log.info(" ".join(command)) subprocess.run(command) # write to additional separate tables if data is larger than 10k recs if len(param_dicts) > 1: commands = [] for n, paramdict in enumerate(param_dicts[1:]): # create table to load to (so types are identical) sql = """ CREATE TABLE {schema}.{table}_{n} (LIKE {schema}.{table} INCLUDING ALL) """.format(schema=schema, table=table, n=str(n)) conn.execute(sql) payload = urlencode(paramdict, doseq=True) url = bcdata.WFS_URL + "?" 
+ payload command = [ "ogr2ogr", "-update", "-append", "-f", "PostgreSQL", db_string + " active_schema=" + schema, "-t_srs", "EPSG:3005", "-nln", table + "_" + str(n), url, ] if dim: command = command + ["-dim", dim] if promote_to_multi: command = command + ["-nlt", "PROMOTE_TO_MULTI"] commands.append(command) # log all requests, not just the first one for c in commands: log.info(c) # https://stackoverflow.com/questions/14533458 pool = Pool(max_workers) with click.progressbar(pool.imap(partial(call), commands), length=len(param_dicts)) as bar: for returncode in bar: if returncode != 0: click.echo("Command failed: {}".format(returncode)) # once loaded, combine & drop for n, _x in enumerate(param_dicts[1:]): sql = """INSERT INTO {schema}.{table} SELECT * FROM {schema}.{table}_{n}""".format( schema=schema, table=table, n=str(n)) conn.execute(sql) sql = "DROP TABLE {}.{}_{}".format(schema, table, n) conn.execute(sql) # Deal with primary key # First, drop ogc_fid - becaue we load to many tables, it is not unique sql = "ALTER TABLE {}.{} DROP COLUMN ogc_fid CASCADE".format(schema, table) conn.execute(sql) # if provided with a fid to use as pk, assign it if fid: sql = "ALTER TABLE {}.{} ADD PRIMARY KEY ({})".format( schema, table, fid) conn.execute(sql) # make fid auto-increment in case we want to add records sql = """ CREATE SEQUENCE {schema}.{table}_{fid}_seq OWNED BY {schema}.{table}.{fid}; SELECT setval('{schema}.{table}_{fid}_seq', coalesce(max({fid}), 0) + 1, false) FROM {schema}.{table}; ALTER TABLE {schema}.{table} ALTER COLUMN {fid} SET DEFAULT nextval('{schema}.{table}_{fid}_seq'); """.format(schema=schema, table=table, fid=fid) conn.execute(sql) # otherwise, create a new serial ogc_fid else: sql = "ALTER TABLE {}.{} ADD COLUMN ogc_fid SERIAL PRIMARY KEY".format( schema, table) conn.execute(sql) if not append: conn.execute("ALTER TABLE {}.{} SET LOGGED".format(schema, table)) log.info("Indexing geometry") conn.execute("CREATE INDEX ON {}.{} USING GIST (geom)".format( schema, table)) # once complete, note date/time of completion in public.bcdata if not no_timestamp: conn.execute( "CREATE TABLE IF NOT EXISTS public.bcdata (table_name text PRIMARY KEY, date_downloaded timestamp WITH TIME ZONE);" ) conn.execute( """INSERT INTO public.bcdata (table_name, date_downloaded) SELECT %s as table_name, NOW() as date_downloaded ON CONFLICT (table_name) DO UPDATE SET date_downloaded = NOW(); """, (schema + '.' + table, )) log.info("Load of {} to {} in {} complete".format(src, schema + "." + table, db_url))
def mergeFilesByRegion(filesByRegion, grid, outputDir):
    # Merge a set of files by region into the specified dir
    # Key is up/down/nominal etc
    N = 0
    filesToWrite = {}
    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                if key == "Nominal":
                    print("WARNING: no input files for region {0} key {1}".format(r, key))
                continue

            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print("Output file {0} exists - skipping".format(
                    os.path.basename(filename)))
                continue

            filesToWrite[filename] = {
                "region": r,
                "files": filesByRegion[r][key]
            }
            N += 1

    # Got anything?
    if filesToWrite == {}:
        return

    # build the pool arguments
    args = []
    for filename in filesToWrite:
        N -= 1
        args.append((
            filename,
            filesToWrite[filename]['files'],
            False,
            filesToWrite[filename]['region'],
            N,
        ))

    pool = ThreadPool(8)
    try:
        #results = pool.map(mergeFiles, args)
        results = pool.imap_unordered(mergeFiles, args)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()

    return

    # Below is to be removed legacy code relying on hadd
    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                continue

            N -= 1

            # Merge the files in chunks of 50, and then merge these chunks.
            # The whole idea behind this exercise is to avoid exceeding the
            # maximum length of a command allowed in bash.
            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print("Output file {0} exists - skipping".format(
                    os.path.basename(filename)))
                continue

            mergeFiles(filename, filesByRegion[r][key])

            #fileMerger = ROOT.TFileMerger()
            #fileMerger.OutputFile(filename)
            #for f in filesByRegion[r][key]:
            #    fileMerger.AddFile(f)
            #fileMerger.Merge()

            #i=1
            #print("Attempting to make file {0}".format(filename))
            #for subset in chunks(filesByRegion[r][key], 50):
            #    print("Merging subset {0:d}...".format(i))
            #    filename = os.path.join(outputDir, "%s_%03d.root" % (filePrefix, i) )
            #    outputFiles.append(filename)
            #
            #    if len(subset) == 1:
            #        shutil.copy(subset[0], filename)
            #    else:
            #        cmd = "hadd -f %s %s" % (filename, " ".join(subset))
            #        subprocess.call(cmd, shell=True)
            #
            #    i+=1

            #print("Merging all subsets")
            #filename = os.path.join(outputDir, "%s.root" % (filePrefix) )
            #if len(outputFiles) == 1:
            #    # only 1 file, so just rename it
            #    os.rename(outputFiles[0], filename)
            #else:
            #    cmd = "hadd -f %s %s" % (filename, " ".join(outputFiles))
            #    subprocess.call(cmd, shell=True)

            #print("Done merging subsets; removing temporary files")
            #for f in outputFiles:
            #    if not os.path.exists(f): continue
            #    os.remove(f)

            print("=> Created file for {0}; {1} files remaining".format(r, N))
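# The legacy hadd path above merges inputs in chunks of 50 to stay under the shell's
# command-length limit. The chunks() helper it refers to is not shown in the original;
# a minimal sketch of what it presumably looks like:
def chunks(items, size):
    """Yield successive slices of at most `size` elements from a list."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

# e.g. list(chunks(["a.root", "b.root", "c.root"], 2)) == [["a.root", "b.root"], ["c.root"]]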
def render_start(self): """Start the batches rendering.""" self._thread_pool = ThreadPool(len(self.batches)) self._results = self._thread_pool.imap_unordered( render_func, self.batches) self._is_rendering = True
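# imap_unordered(), as used in render_start(), returns a lazy iterator that yields each
# batch's result as soon as it finishes rather than in submission order. A self-contained
# illustration of that behaviour (render_func and batches below are stand-ins, not the
# renderer's real ones):
import time
from multiprocessing.dummy import Pool as ThreadPool

def render_func(batch):
    time.sleep(0.01 * (len(batch) % 3))  # uneven work so completion order differs
    return sum(batch)

batches = [[1, 2], [3, 4, 5], [6]]
pool = ThreadPool(len(batches))
for result in pool.imap_unordered(render_func, batches):
    print(result)  # results arrive in completion order
pool.close()
pool.join()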
def main(): expected_arg = "[A valid PAGE_URL or IMAGE_URL]" num_args = len(sys.argv) if num_args < 2 or num_args > 2: print("\n* INVALID RUN COMMAND! * Usage:") print("classify %s\n" % expected_arg) elif num_args == 2: url = sys.argv[1] valid_url = web_core.is_valid_url(url) if not valid_url: file_path = url if isfile(file_path): best_guess = image_base.classify_local_image(file_path) elif isdir(file_path): best_guess = image_base.classify_folder_images( file_path, return_dict=True) else: raise Exception("Error: %s is not a valid image path!" % url) print("\n*** Best match classification: ***") print(best_guess) print("") return content_type = web_core.get_content_type(url) if content_type == 'other': raise Exception( "Error: %s does not evaluate to %s" % (url, expected_arg)) elif content_type == 'image': best_guess = image_base.classify_image_url(url) print("\n*** Best match classification: ***") print(best_guess) print("") elif content_type == 'html': global images_classified image_list = image_base.get_all_images_on_page(url) if 'linux2' not in sys.platform and settings.MAX_THREADS > 1: # Multi-threading the work when not using Docker Linux pool = ThreadPool(settings.MAX_THREADS) pool.map(download_and_classify_image, image_list) pool.close() pool.join() else: # Single-threading the image classification work min_w_h = settings.MIN_W_H # Minimum size for classification for image in image_list: web_core.save_file_as(image, "temp_image.png") image_base.convert_image_file_to_jpg( "downloads_folder/temp_image.png") width, height = image_base.get_image_file_dimensions( "downloads_folder/temp_image.jpg") if width >= min_w_h and height >= min_w_h: best_guess = classify_image.external_run( "downloads_folder/temp_image.jpg") if images_classified == 0: print("\n*** " "Best match classifications for page images:" " ***") images_classified += 1 print(best_guess) if images_classified >= settings.MAX_IMAGES_PER_PAGE: break if images_classified >= settings.MAX_IMAGES_PER_PAGE: print("\n(NOTE: Exceeded page classification limit " "of %d images per URL! Stopping early.)" % ( settings.MAX_IMAGES_PER_PAGE)) if images_classified == 0: print("\nCould not find images to classify on the page! " "(Min size = %dx%d pixels)" % ( settings.MIN_W_H, settings.MIN_W_H)) print("") else: raise Exception( "Unexpected content type %s. Fix the code!" % content_type)
self.urls.append(make_url(self.domain, url)) urls = [] def test(url): print("URLS : ", len(urls)) ax = Spider(url) urls.extend(ax.urls) return url if __name__ == '__main__': content = "" url = input('Enter the url : ') urls.append(url) spider = Spider(url) print("Will visit :" + str(len(spider.urls)) + " urls") pool = ThreadPool(10) results = pool.map(test, spider.urls) """for i in spider.urls: print("URLS : ", len(urls)) ax = Spider(i) urls.extend(ax.urls) print(len(urls))""" #spider.find_urls(content) #print(content)
authwarpper = partial(auth_account, )


def spider_kaiyuan(start, end, password):
    file_name = 'accounts_{0}-{1}.txt'.format(start, end)
    with open(file_name, 'a+') as f:
        for account in range(int(start), int(end)):
            if (auth_account(account=account, password=password)):
                f.write('account: {} password: {}'.format(account, password))
                f.flush()


def generate_func_args(start, end, password):
    return (start, end, password)


if __name__ == '__main__':
    pool = ThreadPool(5)
    args = []
    account_start = 888800000000
    password = input("Enter the password to test; the test range is {} - {}\n".format(
        account_start + 5 * 10000, account_start + 9 * 10000 + 9999))
    for i in range(5, 9):
        args_map = generate_func_args(account_start + i * 10000,
                                      account_start + i * 10000 + 9999,
                                      password)
        args.append(args_map)
    pool.starmap(spider_kaiyuan, args)
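# starmap(), used above, unpacks each tuple in args into the worker's positional
# parameters (here start, end, password). A tiny self-contained illustration with a
# stand-in worker function:
from multiprocessing.dummy import Pool as ThreadPool

def probe(start, end, password):
    return "{}-{} ({})".format(start, end, password)

arg_tuples = [(0, 9, "secret"), (10, 19, "secret")]
pool = ThreadPool(2)
print(pool.starmap(probe, arg_tuples))  # ['0-9 (secret)', '10-19 (secret)']
pool.close()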
def trigger_tasks(tasks, thread_count): pool = ThreadPool(int(thread_count)) pool.map(Task.run_tests, tasks)
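# trigger_tasks() maps the plain function Task.run_tests over the task objects, so each
# worker thread effectively calls task.run_tests(). A minimal sketch of the same pattern
# with a hypothetical Task class (the real Task is not shown in the original):
from multiprocessing.dummy import Pool as ThreadPool

class Task:
    def __init__(self, name):
        self.name = name

    def run_tests(self):
        return "ran {}".format(self.name)

tasks = [Task("smoke"), Task("regression")]
pool = ThreadPool(2)
print(pool.map(Task.run_tests, tasks))  # ['ran smoke', 'ran regression']
pool.close()
pool.join()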
def paths_to_bids(path_to_dataset, path_to_csv, bids_dir, modality):
    """
    This method converts all the images of the requested modality found in the
    AIBL dataset into a BIDS-organised dataset

    :param path_to_dataset: path_to_dataset
    :param path_to_csv: path to the csv file containing clinical data
    :param bids_dir: path to save the AIBL-T1-dataset converted to a BIDS format
    :param modality: string 't1', 'av45', 'flute' or 'pib'
    :return: list of all the images that are potentially converted to a BIDS
    format and saved in the bids_dir. This does not guarantee existence
    """
    from os.path import join, exists
    from numpy import nan
    import pandas as pds
    from clinica.utils.stream import cprint
    from multiprocessing.dummy import Pool
    from multiprocessing import cpu_count, Value
    import glob

    if modality.lower() not in ['t1', 'av45', 'flute', 'pib']:
        # This should never be reached
        raise RuntimeError(modality.lower() + ' is not supported for conversion')

    counter = None

    def init(args):
        """ store the counter for later use """
        global counter
        counter = args

    def create_file(image):
        global counter
        subject = image.Subjects_ID
        session = image.Session_ID
        name_of_path = {
            't1': 'Path_to_T1',
            'av45': 'Path_to_pet',
            'flute': 'Path_to_pet',
            'pib': 'Path_to_pet'
        }
        # depending on the dataframe, there are different ways of accessing
        # the image object
        image_path = image[name_of_path[modality]]
        with counter.get_lock():
            counter.value += 1
        if image_path is nan:
            cprint('No path specified for ' + subject + ' in session ' + session)
            return nan
        cprint('[' + modality.upper() + '] Processing subject ' + str(subject)
               + ' - session ' + session + ', ' + str(counter.value) + ' / '
               + str(total))
        session = viscode_to_session(session)
        # creation of the path
        if modality == 't1':
            output_path = join(bids_dir, 'sub-AIBL' + subject,
                               'ses-' + session, 'anat')
            output_filename = 'sub-AIBL' + subject + '_ses-' + session + '_T1w'
        elif modality in ['flute', 'pib', 'av45']:
            output_path = join(bids_dir, 'sub-AIBL' + subject,
                               'ses-' + session, 'pet')
            output_filename = 'sub-AIBL' + subject + '_ses-' + session \
                + '_task-rest_acq-' + modality + '_pet'
        # image is saved following BIDS specifications
        if exists(join(output_path, output_filename + '.nii.gz')):
            cprint('Subject ' + str(subject) + ' - session ' + session
                   + ' already processed.')
            output_image = join(output_path, output_filename + '.nii.gz')
        else:
            output_image = dicom_to_nii(subject, output_path, output_filename,
                                        image_path)
        return output_image

    # read the dataframe where subject_ID, session_ID and path are saved
    if modality == 't1':
        images = find_path_to_T1(path_to_dataset, path_to_csv)
    else:
        path_to_csv_pet_modality = glob.glob(
            join(path_to_csv, 'aibl_' + modality + 'meta_*.csv'))[0]
        if not exists(path_to_csv_pet_modality):
            raise FileNotFoundError(path_to_csv_pet_modality
                                    + ' file not found in clinical data folder')
        # The latest version of the Flutemetamol CSV file (aibl_flutemeta_01-Jun-2018.csv)
        # has an extra column for some rows. However, each CSV file (regarding PET
        # tracers) contains the same columns. The usecols argument fixes this issue.
        df_pet = pds.read_csv(path_to_csv_pet_modality, sep=',|;',
                              usecols=list(range(0, 36)))
        images = find_path_to_pet_modality(path_to_dataset, df_pet)
    images.to_csv(join(bids_dir, modality + '_paths_aibl.tsv'),
                  index=False, sep='\t', encoding='utf-8')

    counter = Value('i', 0)
    total = images.shape[0]
    # Reshape the inputs to pass them as a list to the workers
    images_list = []
    for i in range(total):
        images_list.append(images.iloc[i])
    # The initializer is used with the counter variable to keep track of how
    # many files have been processed
    poolrunner = Pool(cpu_count(), initializer=init, initargs=(counter, ))
    output_file_treated = poolrunner.map(create_file, images_list)
    del counter
    return output_file_treated
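# paths_to_bids() shares a multiprocessing.Value counter with its workers through the
# pool initializer so that create_file() can report progress. A stripped-down sketch of
# that initializer/shared-counter pattern (illustrative names, not Clinica code):
from multiprocessing import Value
from multiprocessing.dummy import Pool

counter = None

def init(shared_counter):
    """Store the shared counter in a module-level global for the workers."""
    global counter
    counter = shared_counter

def work(item):
    with counter.get_lock():
        counter.value += 1
    return item * 2

if __name__ == '__main__':
    shared = Value('i', 0)
    pool = Pool(4, initializer=init, initargs=(shared, ))
    results = pool.map(work, range(10))
    print(counter.value)  # -> 10, one increment per processed item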
def initiate_threads(): _pool = Pool(5) _pool.map(traverse_directory, self.valid_directories) _pool.close() _pool.join()