def save_imgs():
    pool = workerpool.WorkerPool(size=10)
    for map_name in maps:
        i_range = maps[map_name][0]
        j_range = maps[map_name][1]
        make_dir(map_name)
        for feature in features:
            make_dir(map_name, feature)
            for i in range(i_range):
                make_dir(map_name, feature, i)
                for j in range(j_range):
                    loc_url = url_maker(map_name, feature, i, j)
                    save_to = store_location_maker(map_name, feature, i, j)
                    job = DownloadJob(loc_url, save_to)
                    pool.put(job)
    pool.shutdown()
    pool.wait()

    # proofread
    pool = workerpool.WorkerPool(size=10)
    for map_name in maps:
        i_range = maps[map_name][0]
        j_range = maps[map_name][1]
        make_dir(map_name)
        for feature in features:
            make_dir(map_name, feature)
            for i in range(i_range):
                make_dir(map_name, feature, i)
                for j in range(j_range):
                    loc_url = url_maker(map_name, feature, i, j)
                    save_to = store_location_maker(map_name, feature, i, j)
                    if not os.path.exists(save_to):
                        job = DownloadJob(loc_url, save_to)
                        pool.put(job)
    pool.shutdown()
    pool.wait()
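# Hedged sketch, not part of the original source: save_imgs() above assumes a
# DownloadJob class that is never shown. Following workerpool's documented Job
# interface (subclass workerpool.Job and implement run()), a minimal version
# could look like this; urllib.urlretrieve mirrors the library's own example.
import urllib

import workerpool

class DownloadJob(workerpool.Job):
    "Fetch a URL and store the response body at the given path."
    def __init__(self, url, save_to):
        self.url = url
        self.save_to = save_to

    def run(self):
        # A failed download only affects this one job, which is why the
        # snippet above makes a second "proofread" pass over missing files.
        urllib.urlretrieve(self.url, self.save_to)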
def get_layers():
    pool = workerpool.WorkerPool(size=10)
    for map_name in maps:
        i_range = maps[map_name][0]
        j_range = maps[map_name][1]
        # (0, 24, 28, 37, 45)
        for i in range(i_range):
            for j in range(j_range):
                job = GetterJob(map_name, i, j)
                pool.put(job)
    pool.shutdown()
    pool.wait()

    # proofread due to exceptions in concurrency
    pool = workerpool.WorkerPool(size=10)
    for map_name in maps:
        i_range = maps[map_name][0]
        j_range = maps[map_name][1]
        for i in range(i_range):
            for j in range(j_range):
                store_loc = store_location_maker(map_name, "merged", i, j)
                if not os.path.exists(store_loc):
                    job = GetterJob(map_name, i, j)
                    pool.put(job)
    pool.shutdown()
    pool.wait()
def getChannels(url):
    if __settings__.getSetting('paid_account') == "true":
        while not login():
            xbmc.executebuiltin("XBMC.Notification('GLArab','INVALID username and/or password.',30000," + icon + ")")
            __settings__.openSettings()
        url += '&type=reg'
    else:
        url += '&type=free'
    resp = opener.open(url)
    inner_data = resp.read()
    inner_soup = BeautifulSoup(inner_data)
    container = inner_soup.find('div', id='listContainerScroll')
    thumbnail = "DefaultVideo.png"
    pattern = re.compile(r"makeHttpRequest\('(.*?)&',")
    NUM_SOCKETS = 5
    NUM_WORKERS = 8
    http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
    workers = workerpool.WorkerPool(size=NUM_WORKERS)
    for span in container:
        workers.put(FetchJob(span, pattern, http))
    workers.shutdown()
    workers.wait()
def downloadAndExtract(self):
    """Downloads and extracts Medline data."""
    # Extract topic and article URLs
    mLE = MedlineLinkExtractor()
    mLE.readXml(self.__topicsXML_path)
    topics = mLE.getTopics()
    articles = mLE.getArticles()
    topicCount = 0
    articleCount = 0
    # Downloader
    pool = workerpool.WorkerPool(size=20)
    for url in topics:
        topicCount = topicCount + 1
        job = MedlineDataExtractor(url, self.__topicPath, isArticle=False)
        pool.put(job)
    for url in articles:
        articleCount = articleCount + 1
        job = MedlineDataExtractor(url, self.__articlePath, isArticle=True)
        pool.put(job)
    pool.shutdown()
    pool.wait()
    print str(topicCount) + ' Topics downloaded'
    print str(articleCount) + ' Articles downloaded'
    print 'Run the medlineIndexer.py file to build an index of topics/articles'
def Do_MultiThread(dirlist, jobid):
    global thread_num
    # Initialize a pool with thread_num worker threads
    pool = workerpool.WorkerPool(size=thread_num, maxjobs=thread_num)
    cnt = 0
    runcnt = 0
    total = len(dirlist)
    print 'Enter Do_MultiThread'
    for filename in dirlist:
        # thread.start_new_thread(Get_OneSpurtData, (filename,))
        # print 'thread start '
        try:
            try:
                job = DoOneJob(filename, jobid, runcnt)
                print 'Job', runcnt, '/', total
                pool.put(job)
                runcnt += 1
                time.sleep(0.5)
                if debug_maxcnt > 0 and runcnt >= debug_maxcnt:
                    break
            except:
                print 'get error'
                break
        except:
            break
    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    # time.sleep(10)
    pool.shutdown()
    pool.wait()
def test_map(self):
    "Map a method over a list using a pool of two workers."
    pool = workerpool.WorkerPool(2)
    r = pool.map(self.double, [1, 2, 3, 4, 5])
    self.assertEquals(set(r), {2, 4, 6, 8, 10})
    pool.shutdown()
def download_songs(self):
    """
    Downloads the songs passed upon object creation into a folder.

    The downloads are mp3 files and are named according to their artist and title.

    :return: A list of dictionaries representing any songs that failed to
        download, with each dictionary containing the information of a song
        (like songs).
    """
    # make subfolder for this set of downloads
    try:
        os.mkdir(self.download_path)
        print("Creating download directory for " + self.folder_name + "...")
    except FileExistsError:
        print("Download folder already exists for " + self.folder_name + "...")
    os.chdir(self.download_path)
    self._remove_existing_songs_from_list()
    # no real benefit after ~10 threads since limited by download speeds
    pool = workerpool.WorkerPool(size=10)
    for song in self.requested_songs:
        job = DownloadJob(self, song)
        pool.put(job)
    pool.shutdown()
    pool.wait()
    return [self.num_existing_songs, self.failed_downloaded_songs]
def do_multithread(cmdlines):
    global thread_num
    # print thread_num
    # Initialize a pool with thread_num worker threads
    pool = workerpool.WorkerPool(size=thread_num, maxjobs=4)
    cnt = 0
    runcnt = 0
    for line in cmdlines:
        try:
            cnt += 1
            print 'Job', cnt
            try:
                job = DoOneJob(line, cnt)
                pool.put(job)
                runcnt += 1
                # print runcnt
                time.sleep(0.1)
            except:
                print 'get error'
                break
        except:
            break
    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    pool.shutdown()
    pool.wait()
def test_equipped(self):
    """
    Create an equipped worker that uses an internal Counter resource
    to keep track of the job count.
    """
    results = Queue()

    def toolbox_factory():
        return Counter()

    def worker_factory(job_queue):
        return workerpool.EquippedWorker(job_queue, toolbox_factory)

    pool = workerpool.WorkerPool(1, worker_factory=worker_factory)

    # Run 10 jobs
    for i in xrange(10):
        j = CountJob(results)
        pool.put(j)

    # Get 10 results
    for i in xrange(10):
        r = results.get()
        # Each result should be an incremented value
        self.assertEquals(r, i)

    pool.shutdown()
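# Hedged sketch, not from the original test module: the Counter and CountJob
# names used above are assumptions about helpers defined elsewhere. The point
# of EquippedWorker is that each worker builds one toolbox via toolbox_factory()
# and hands it to every job it runs, so run() receives the toolbox. Reporting
# the count before incrementing matches the assertEquals(r, i) check above,
# which expects 0..9 from the single worker.
import workerpool

class Counter(object):
    "Per-worker resource holding a simple running count."
    def __init__(self):
        self.count = 0

class CountJob(workerpool.Job):
    "Report the worker's current count, then increment it."
    def __init__(self, results):
        self.results = results

    def run(self, toolbox):
        self.results.put(toolbox.count)
        toolbox.count += 1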
def Do_MultiThread(cmds):
    global thread_num
    pool = workerpool.WorkerPool(size=thread_num, maxjobs=4)
    cnt = 0
    runcnt = 0
    for line in cmds:
        try:
            cnt += 1
            print 'Job', cnt,
            try:
                job = DoOneJob(line, cnt)
                pool.put(job)
                runcnt += 1
                # print runcnt
                # time.sleep(0.1)
                # break
            except:
                print 'get error'
                break
        except:
            break
    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    pool.shutdown()
    pool.wait()
def run(self):
    self._print_header()
    signaler = Signaler()
    bitbucket = Bitbucket(self._bb_username, self._bb_password)
    git = Git()

    def toolbox_factory():
        s3 = S3(self._s3_key, self._s3_secret, self._s3_bucket,
                self._s3_base_path, self._s3_endpoint)
        return BitbackupWorkerToolbox(bitbucket, git, s3)

    def worker_factory(job_queue):
        worker = workerpool.EquippedWorker(job_queue, toolbox_factory)
        worker.setName(worker.getName().replace("Thread", "Worker"))
        return worker

    info('Loading repository list...')
    repos = bitbucket.get_all_repositories()

    info('Starting {} workers...'.format(self._worker_count))
    pool = workerpool.WorkerPool(size=self._worker_count,
                                 worker_factory=worker_factory, maxjobs=1)

    for repo in repos:
        if signaler.should_term():
            break
        pool.put(BitbackupJob(repo))

    pool.shutdown()
    pool.wait()
    self._print_footer()
def mass_download(urls, nthread):
    print('Downloading...')
    pool = workerpool.WorkerPool(size=nthread)
    saveto = [os.path.basename(url) for url in urls]
    pool.map(download, urls, saveto)
    pool.shutdown()
    pool.wait()
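# Hedged sketch, an assumption rather than the original helper: mass_download()
# maps a download(url, saveto) callable that is not shown here. If it simply
# fetches each URL to a local file, it could be as small as the wrapper below
# (workerpool's own map() example passes urllib.urlretrieve directly). Python 2
# urllib is used to match the other snippets; on Python 3 this would be
# urllib.request.urlretrieve.
import urllib

def download(url, saveto):
    "Fetch url and write the response body to the local path saveto."
    urllib.urlretrieve(url, saveto)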
def test_wait(self):
    "Make sure each task gets marked as done so pool.wait() works."
    pool = workerpool.WorkerPool(5)
    q = Queue()
    for i in xrange(100):
        pool.put(workerpool.SimpleJob(q, sum, [range(5)]))
    pool.wait()
    pool.shutdown()
def install_multi_sw(self):
    pool = workerpool.WorkerPool(size=self.workerpool_size)
    for num in range(self.num_vm):
        self.hostname = self.base_role_name + str(num) + ".cloudapp.net"
        job = InstallSW(self.hostname, self.username, self.password)
        pool.put(job)
    pool.shutdown()
    pool.wait()
def cachedprime(urls, headers={}, resize=False, plex_resize=False):
    ''' {'hash': '1dad1d1', 'fp': 'filepath', 'url': 'imgurl', 'resize': [[w, h, url, dest]]} '''
    logger.debug('Got %s images' % len(urls))
    urls = remove_dict_dupe_from_list(urls, 'hash')
    logger.debug('Removed all duplicate images, got %s left' % len(urls))
    imgdir = os.path.join(htpc.DATADIR, 'images/')
    made_dir = False
    if not os.path.exists(imgdir):
        logger.debug('Creating image directory at %s' % imgdir)
        os.makedirs(imgdir)
        made_dir = True
    resize_list = []
    logger.debug('This can take a while..')
    # If there is no local copy of the original
    if made_dir is True:
        logger.debug('There was no image directory, so everything is missing')
        resize_list = urls
    else:
        logger.debug('Checking for missing images')
        # cba with resizes for plex
        for item in urls:
            if not os.path.isfile(item['fp']):
                logger.debug('%s was missing, download it %s' % (item['fp'], item['url']))
                resize_list.append(item)
    if made_dir is False and not resize_list:
        logger.debug('No missing images :)')
        return
    pool = workerpool.WorkerPool(size=20)
    for i in resize_list:
        j = CacheImgDownload(i, headers)
        pool.put(j)
    pool.shutdown()
    pool.wait()
    # use PIL to resize images
    if resize_list and plex_resize is False and resize is True:
        from multiprocessing import Pool, cpu_count
        ppool = Pool(cpu_count())
        try:
            ppool.map_async(cache_resize_image, (b for b in resize_list), 5)
            ppool.close()
            ppool.join()
        except Exception as e:
            logger.debug('Failed to resize image %s' % e)
    else:
        # Already downloaded transcoded images
        return
def download_batch(self, urlObjects):
    pool = workerpool.WorkerPool(min(self.max_pool_size, len(urlObjects)))
    errors = pool.map(self.download_image, urlObjects)
    pool.shutdown()
    pool.wait()
    # keep only the truthy entries (the failed downloads)
    errors = [e for e in errors if e]
    print("Number of images sent for download " + str(len(urlObjects)))
    print("Number of images that failed " + str(len(errors)))
    return errors
def delete_multi_vm(self):
    pool = workerpool.WorkerPool(size=self.workerpool_size)
    for num in range(self.num_vm):
        service_name = self.base_role_name + str(num)
        deployment_name = self.base_deployment_name + str(num)
        role_name = self.base_role_name + str(num)
        job = DeleteVMJob(service_name, deployment_name, role_name)
        pool.put(job)
    pool.shutdown()
    pool.wait()
def main(nombres, language, dest_dir, namespaces_path, test_limit=None, pool_size=20):
    # fix namespaces in to3dirs module so we can use it in this stage
    to3dirs.namespaces = to3dirs.Namespaces(namespaces_path)

    test_limit = int(test_limit) if test_limit else None
    pool = workerpool.WorkerPool(size=int(pool_size))
    data_urls = URLAlizer(nombres, dest_dir, language, test_limit)

    board = StatusBoard(language)
    yield pool.start(board.process, data_urls)
    print  # final new line for console aesthetic
def install_multi_sw(self):
    pool = workerpool.WorkerPool(size=self.workerpool_size)
    for num in range(self.num_vm):
        self.hostname = self.service_name + ".cloudapp.net"
        self.ssh_endpoint = util.ssh_endpoint(num)
        job = InstallSW(self.hostname, self.ssh_endpoint, self.username, self.password)
        pool.put(job)
    pool.shutdown()
    pool.wait()
def __init__(self, threadNum, crawlerData, window):
    threading.Thread.__init__(self)
    self.threadNum = threadNum
    self.crawlerData = crawlerData
    self.window = window
    self.timeToQuit = threading.Event()
    self.timeToQuit.clear()
    # threadNum: 1
    self.pool = workerpool.WorkerPool(1)
    self.count = 1
def create_multi_vm(self):
    pool = workerpool.WorkerPool(size=self.workerpool_size)
    for num in range(self.num_vm):
        service_name = self.service_name
        deployment_name = self.deployment_name
        role_name = self.base_role_name + str(num)
        job = CreateVMJob(service_name, deployment_name, role_name, num)
        job.prepare()
        pool.put(job)
    pool.shutdown()
    pool.wait()
def _get_pool(self):
    "Get a worker pool (cached)"
    if self.pool:
        return self.pool

    def toolbox_factory():
        return S3ToolBox(self.aws_key, self.aws_secret_key, self.secure)

    def worker_factory(job_queue):
        return workerpool.EquippedWorker(job_queue, toolbox_factory)

    log.info("Starting pool with %d threads." % self.numthreads)
    self.pool = workerpool.WorkerPool(self.numthreads, maxjobs=self.maxjobs,
                                      worker_factory=worker_factory)
    return self.pool
def pool(job, params):
    print 'Initializing...'
    # Initialize a pool, 32 threads in this case
    pool = workerpool.WorkerPool(size=32)
    # The ``job`` callable will be invoked once for each item in ``params``
    pool.map(job, params)
    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    pool.shutdown()
    pool.wait()
    print 'job well done!'
def GetAllComment(aid, order=None):
    """
    Fetch all comments of a video. This may require multiple requests,
    so it can take quite a while.

    Input:
        aid: AV number of the video
        order: sort order; defaults to reverse order of post time.
               Optional values: 'good' sorts by number of likes,
               'hot' sorts by hottest replies.
    Returns:
        list of comment lists
    """
    MaxPageSize = 300
    commentLists = [GetComment(aid=aid, page=1, pagesize=MaxPageSize, order=order)]
    totalPage = commentLists[0].page
    directory = 'av' + str(aid) + 'Comments'
    if not os.path.exists(directory):
        os.makedirs(directory)
    if totalPage > 1:
        #urls = ['http://api.bilibili.cn/feedback?aid=' + str(aid) + '&page=' + str(p) + '&pagesize=' + str(MaxPageSize) for p in range(2, commentList.page + 1)]
        # Make a pool
        #pool = workerpool.WorkerPool(size = 10)
        pool = workerpool.WorkerPool(size=totalPage - 1)
        # Build our `map` parameters
        #saveto = [directory + '/' + str(x) for x in range(2, commentList.page + 1)]
        # Perform the mapping
        #pool.map(urllib.urlretrieve, urls, saveto)
        commentLists2 = pool.map(GetComment,
                                 [aid] * (totalPage - 1),
                                 range(2, totalPage + 1),
                                 [MaxPageSize] * (totalPage - 1))
        # Send shutdown jobs to all threads, and wait until all the jobs have been completed
        pool.shutdown()
        pool.wait()
        for cl in commentLists2:
            commentLists.append(cl)
    '''
    commentList.comments += [Comment()] * (commentList.page - 1)
    for p in range(2, commentList.page + 1):
        commentPath = directory + '/' + str(p)
        commentList.comments[(p - 1) * MaxPageSize:] = GetCommentLocal(commentPath, MaxPageSize).comments
        os.remove(commentPath)
    os.rmdir(directory)
    # Testing showed that if comments on a video grow too quickly (av2816940),
    # the totalResult and pages fields at the top level of the JSON can be
    # inaccurate, and re-reading totalResult after every page does not help.
    while commentList.comments[len(commentList.comments) - 1].lv == None:
        commentList.comments.pop()
    commentList.commentLen = len(commentList.comments)
    '''
    for cl in commentLists:
        while cl.comments[len(cl.comments) - 1].lv == None:
            cl.comments.pop()
    commentLists = sorted(commentLists, key=GetCommentListKey, reverse=True)
    return commentLists
def get_AllQMdata():
    reader = csv.reader(file(listfile_sh, 'rb'))
    i = 0
    pool = workerpool.WorkerPool(size=thread_num, maxjobs=thread_num)
    for row in reader:
        print row
        Get_OneQMData(pool, data_path, data_ext, row[0].upper(), row[1])
    reader = csv.reader(file(listfile_sz, 'rb'))
    for row in reader:
        print row
        Get_OneQMData(pool, data_path, data_ext, row[0].upper(), row[1])
    pool.shutdown()
    pool.wait()
def main():
    urls = createUrlList(105)
    pool = workerpool.WorkerPool(size=50)
    for url in urls:
        job = DownloadJob(url.strip())
        pool.put(job)
    pool.shutdown()
    pool.wait()
    allnames = getNames(allcontents)
    counted = countSameNames(allnames)
    result = sortAndOutput(counted)
def fetch_all(charset, character_count=None, thread_count=5):
    """
    Fetch all images of characters in character set GB2312 or GBK from
    http://www.chineseetymology.org/

    Keyword arguments:
    charset -- the character set in use; should be 'GB2312' or 'GBK' (case insensitive)
    character_count -- number of characters to fetch
    thread_count -- number of threads used for downloading
    """
    if character_count is None or (character_count is not None and character_count > 0):
        charset = charset.lower()
        if charset == "gb2312":
            characters = _get_gb2312_characters()
        elif charset == "gbk":
            characters = _get_gbk_characters()
        # elif charset == "gb18030":
        #     characters = get_gb18030_2005_characters()
        else:
            print("Only \"GB2312\" and \"GBK\" are accepted")
            return
        if character_count is not None:
            characters = itertools.islice(characters, character_count)
        save_to_folder = charset
        if not os.path.exists(save_to_folder):
            os.mkdir(save_to_folder)
        not_analyzed_file_name = os.path.join(save_to_folder, "not_analyzed.txt")
        not_found_file_name = os.path.join(save_to_folder, "not_found.txt")
        if os.path.exists(not_analyzed_file_name):
            os.remove(not_analyzed_file_name)
        if os.path.exists(not_found_file_name):
            os.remove(not_found_file_name)
        not_found = dict()
        pool = workerpool.WorkerPool(size=thread_count)
        pool.map(_fetch_img_of_character, characters,
                 itertools.repeat(save_to_folder), itertools.repeat(not_found))
        pool.shutdown()
        pool.wait()
        _remove_empty_characters(save_to_folder, not_analyzed_file_name)
        _write_not_found(not_found_file_name, not_found)
def cli(ctx, opt_fp_in, opt_excludes, opt_dir_out, opt_s3_url, opt_nthreads):
    """Download only the class hierarchy from the VCAT API"""
    log = logger_utils.Logger.getLogger()
    log.debug('download images')
    if not opt_s3_url:
        log.error('S3 URL required. Try sourcing env variables')
        return
    # get the ordered hierarchy
    vcat_data = vcat_utils.load_annotations(opt_fp_in, opt_excludes)
    hierarchy_tree = vcat_utils.hierarchy_tree(vcat_data['hierarchy'].copy())
    # build image ID lookup table. the regions refer to these
    image_lookup = {}
    for vcat_class_id, object_class in vcat_data['object_classes'].items():
        for image in object_class['images']:
            image_lookup[image['id']] = image
    url_maps = []
    for vcat_class_id, object_class in vcat_data['object_classes'].items():
        for region in object_class['regions']:
            im_meta = image_lookup[region['image']]
            url = vcat_utils.format_im_url(opt_s3_url, im_meta)
            # log.info(url)
            fp_out = vcat_utils.format_im_fn(im_meta)
            fp_out = join(opt_dir_out, fp_out)
            url_maps.append({'url': url, 'fp_out': fp_out})
    if not Path(opt_dir_out).exists():
        file_utils.mkdirs(opt_dir_out)
    # download pool
    global pbar
    pbar = tqdm(total=len(url_maps))
    pool = workerpool.WorkerPool(size=opt_nthreads)
    pool.map(downloader, url_maps)
    # Send shutdown jobs to all threads
    # and wait until all the jobs have been completed
    pool.shutdown()
    pool.wait()
    pbar.close()
def execute_new_lang(logger):
    """
    Execute new_lang.exe [ make nlp result file ]
    :param logger: Logger
    """
    global DELETE_FILE_LIST
    logger.info("4. execute new lang")
    start = 0
    end = 0
    cmd_list = list()
    os.chdir(TA_CONFIG['ta_bin_path'])
    target_list = glob.glob("{0}/txt/*".format(TA_TEMP_DIR_PATH))
    thread = len(target_list) if len(target_list) < int(TA_CONFIG['nl_thread']) else int(TA_CONFIG['nl_thread'])
    output_dir_list = ['JSON', 'JSON2', 'HMD', 'MCNT', 'NCNT', 'IDX', 'IDXVP', 'W2V']
    for dir_name in output_dir_list:
        output_dir_path = "{0}/{1}".format(TA_TEMP_DIR_PATH, dir_name)
        if not os.path.exists(output_dir_path):
            os.makedirs(output_dir_path)
    temp_new_lang_dir_path = '{0}/{1}'.format(TA_CONFIG['ta_bin_path'], OUTPUT_DIR_NAME)
    DELETE_FILE_LIST.append(temp_new_lang_dir_path)
    if not os.path.exists(temp_new_lang_dir_path):
        os.makedirs(temp_new_lang_dir_path)
    # Make list file
    for cnt in range(thread):
        end += len(target_list) / thread
        if (len(target_list) % thread) > cnt:
            end += 1
        list_file_path = "{0}/{1}_{2}.list".format(temp_new_lang_dir_path, OUTPUT_DIR_NAME, cnt)
        list_file = open(list_file_path, 'w')
        for idx in range(start, end):
            print >> list_file, target_list[idx]
        list_file.close()
        start = end
        cmd = "./new_lang.exe -DJ {0} txt {1}".format(list_file_path, DT[:8])
        logger.debug("new_lang.exe cmd => {0}".format(cmd))
        cmd_list.append(cmd)
    pool = workerpool.WorkerPool(thread)
    pool.map(pool_sub_process, cmd_list)
    pool.shutdown()
    pool.wait()
def execute_hmd(logger, matrix_file_path):
    """
    Execute HMD
    :param logger: Logger
    :param matrix_file_path: Matrix file path
    :return: HMD output directory path
    """
    global DELETE_FILE_LIST
    logger.info("5. Execute HMD")
    os.chdir(IE_TA_CONFIG['hmd_script_path'])
    hmd_file_list = glob.glob("{0}/*".format(NLP_DIR_PATH))
    hmd_output_dir_path = "{0}/HMD_result".format(TA_TEMP_DIR_PATH)
    if not os.path.exists(hmd_output_dir_path):
        os.makedirs(hmd_output_dir_path)
    start = 0
    end = 0
    cmd_list = list()
    hmd_thread = int(IE_TA_CONFIG['hmd_thread'])
    thread = len(hmd_file_list) if len(hmd_file_list) < hmd_thread else hmd_thread
    # Make list file
    for cnt in range(thread):
        end += len(hmd_file_list) / thread
        if (len(hmd_file_list) % thread) > cnt:
            end += 1
        list_file_path = "{0}/{1}_{2}.list".format(IE_TA_CONFIG['hmd_script_path'], TA_TEMP_DIR_NAME, cnt)
        DELETE_FILE_LIST.append(list_file_path)
        list_file = open(list_file_path, 'w')
        for idx in range(start, end):
            print >> list_file, hmd_file_list[idx]
        list_file.close()
        start = end
        cmd = "python {0}/hmd.py {1} {2} {3} {4}".format(
            IE_TA_CONFIG['hmd_script_path'], TA_TEMP_DIR_NAME, list_file_path,
            matrix_file_path, hmd_output_dir_path)
        cmd_list.append(cmd)
    pool = workerpool.WorkerPool(thread)
    pool.map(pool_sub_process, cmd_list)
    pool.shutdown()
    pool.wait()
    return hmd_output_dir_path