def load(self, job_path, deployment_type, listener_port, elb_name_suffix, cf_params):
    env = cf_params['env']
    region = cf_params['region']
    cluster = cf_params['cluster'] if 'cluster' in cf_params else 'lambda'
    has_ecs_service = False if deployment_type != 'ecs_service' else True

    filenames = []
    for subdir, dirs, files in os.walk(job_path):
        for fn in files:
            filenames.append(os.path.join(subdir, fn))

    pool = ThreadPool(32)
    pool.imap(self.process_cf_file,
              ((cf_params, cluster, elb_name_suffix, env, fn,
                has_ecs_service, listener_port, region)
               for idx, fn in enumerate(filenames)),
              chunksize=1)
    pool.close()
    pool.join()
    logging.info("Completed update of %s" % job_path)

    contains_failure = False
    while not self.q.empty():
        job_result = self.q.get()
        logging.info(job_result)
        if "Failed" in job_result:
            contains_failure = True

    if contains_failure:
        logging.error("One or more CF stacks failed!")
        sys.exit(1)
    else:
        logging.info("All CF stacks deployed successfully!")
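# The load() helper above leans on two behaviours of ThreadPool.imap that are easy to miss:
# imap dispatches the submitted tasks even though its return iterator is never consumed
# (close()/join() then waits for them), and per-task status travels through a shared queue
# rather than through return values. A minimal sketch of that pattern follows; the worker
# name, queue and item layout are hypothetical, not taken from the original code.
import logging
import sys
from multiprocessing.dummy import Pool as ThreadPool
from queue import Queue

results = Queue()

def process_item(args):
    name, value = args
    try:
        # ... real per-item work would go here ...
        results.put("Deployed %s" % name)
    except Exception as exc:
        results.put("Failed %s: %s" % (name, exc))

def run_all(items):
    pool = ThreadPool(8)
    pool.imap(process_item, ((name, value) for name, value in items), chunksize=1)
    pool.close()
    pool.join()  # blocks until every dispatched task has finished
    failed = False
    while not results.empty():
        message = results.get()
        logging.info(message)
        if message.startswith("Failed"):
            failed = True
    if failed:
        sys.exit(1)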
def test_synchronize(self):
    demo = LockDemo()
    pool = ThreadPool(2)
    pool.imap(demo.bar, range(2))
    sleep(0.04)
    assert_that(demo.call_count, equal_to(1))
    sleep(0.05)
    assert_that(demo.call_count, equal_to(2))
def find_suppliers_with_details(client, framework_slug):
    pool = ThreadPool(30)
    records = find_suppliers(client, framework_slug)
    records = pool.imap(partial(add_supplier_info, client), records)
    records = pool.imap(partial(add_framework_info, client, framework_slug), records)
    records = pool.imap(partial(add_submitted_draft_counts, client, framework_slug), records)
    return get_csv_rows(records, framework_slug)
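# Chaining pool.imap calls as above builds a lazy pipeline: each stage yields records as the
# previous stage produces them, and no work happens until the final iterator is consumed
# (here by get_csv_rows). A self-contained sketch of the same idea, with made-up stage
# functions standing in for the supplier/framework helpers:
from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

def add_offset(offset, record):
    return record + offset

def square(record):
    return record * record

pool = ThreadPool(4)
records = range(10)                                   # source iterable
records = pool.imap(partial(add_offset, 1), records)  # stage 1, lazy
records = pool.imap(square, records)                  # stage 2, lazy
print(list(records))                                  # consuming the result drives both stages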
def test_synchronize_with_same_param(self):
    demo = LockDemo()
    pool = ThreadPool(3)
    pool.imap(demo.foo2, (1, 1))
    pool.apply_async(demo.foo1)
    sleep(0.04)
    assert_that(demo.call_count, equal_to(1))
    sleep(0.05)
    assert_that(demo.call_count, equal_to(2))
    sleep(0.05)
    assert_that(demo.call_count, equal_to(3))
def find_all_labs(client):
    pool = ThreadPool(20)
    records = find_suppliers(client, FRAMEWORK_SLUG)
    records = pool.imap(add_framework_info(client, FRAMEWORK_SLUG), records)
    records = filter(lambda record: record['onFramework'], records)
    records = pool.imap(add_draft_services(client, FRAMEWORK_SLUG), records)
    services = itertools.chain.from_iterable(record['services'] for record in records)
    services = filter(
        lambda record: record['lot'] == 'user-research-studios' and record['status'] == 'submitted',
        services)
    return services
def find_suppliers_with_details(client, content_loader, framework_slug, supplier_ids=None):
    pool = ThreadPool(30)
    content_loader.load_manifest(framework_slug, 'declaration', 'declaration')
    declaration_content = content_loader.get_manifest(framework_slug, 'declaration')
    records = find_suppliers(client, framework_slug, supplier_ids)
    records = pool.imap(add_supplier_info(client), records)
    records = pool.imap(add_framework_info(client, framework_slug), records)
    records = pool.imap(add_draft_counts(client, framework_slug), records)
    records = map(add_failed_questions(declaration_content), records)
    return records
def find_services_by_lot(client, framework_slug, lot_slug):
    pool = ThreadPool(30)
    service_adder = add_draft_services(client, framework_slug,
                                       lot=lot_slug,
                                       status="submitted")
    records = find_suppliers(client, framework_slug)
    records = pool.imap(add_supplier_info(client), records)
    records = pool.imap(add_framework_info(client, framework_slug), records)
    records = pool.imap(service_adder, records)
    records = filter(lambda record: len(record["services"]) > 0, records)
    return records
def get_used_properties(self, set_ids=None, article_ids=None, **filters):
    """
    Returns a sequence of property names in use in the specified set(s) (or set ids).
    """
    if set_ids is not None:
        filters["sets"] = set_ids
    if article_ids is not None:
        filters["ids"] = article_ids

    all_properties = self.get_properties()
    flexible_properties = set(all_properties) - set(ALL_FIELDS)

    body = {"query": {"bool": {"must": [
        build_filter(**filters),
        {"exists": {"field": "fakeprop"}}
    ]}}}

    bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
    pool = ThreadPool()
    results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties))

    try:
        for found, prop in zip(results, flexible_properties):
            if found:
                yield prop
    finally:
        pool.close()
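# Zipping the imap results back onto flexible_properties works because imap (unlike
# imap_unordered) yields results in the same order as its inputs, even when individual
# lookups finish out of order. A tiny illustration with arbitrary property names:
import random
import time
from multiprocessing.dummy import Pool as ThreadPool

def probe(name):
    time.sleep(random.random() / 100)  # simulate variable lookup latency
    return name.startswith("f")

props = ["foo", "bar", "frob", "baz"]
pool = ThreadPool(4)
try:
    for found, prop in zip(pool.imap(probe, props), props):
        if found:
            print(prop)  # always prints "foo" then "frob", regardless of completion order
finally:
    pool.close()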
def run_tidy(sha="HEAD", is_rev_range=False):
    diff_cmdline = ["git", "diff" if is_rev_range else "show", sha]

    # Figure out which paths changed in the given diff.
    changed_paths = subprocess.check_output(
        diff_cmdline + ["--name-only", "--pretty=format:"]).splitlines()
    changed_paths = [p for p in changed_paths if p]

    # Produce a separate diff for each file and run clang-tidy-diff on it
    # in parallel.
    def tidy_on_path(path):
        patch_file = tempfile.NamedTemporaryFile()
        cmd = diff_cmdline + [
            "--src-prefix=%s/" % ROOT,
            "--dst-prefix=%s/" % ROOT,
            "--",
            path]
        subprocess.check_call(cmd, stdout=patch_file, cwd=ROOT)
        cmdline = [CLANG_TIDY_DIFF,
                   "-clang-tidy-binary", CLANG_TIDY,
                   "-p0",
                   "--",
                   "-DCLANG_TIDY"] + compile_flags.get_flags()
        return subprocess.check_output(
            cmdline,
            stdin=file(patch_file.name),
            cwd=ROOT)

    pool = ThreadPool(multiprocessing.cpu_count())
    try:
        return "".join(pool.imap(tidy_on_path, changed_paths))
    except KeyboardInterrupt as ki:
        sys.exit(1)
    finally:
        pool.terminate()
        pool.join()
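# Because the imap iterator is consumed inside the try block, the finally clause can shut the
# pool down with terminate()/join() whether the run finishes normally or is interrupted.
# A stripped-down version of that control flow, with a placeholder worker instead of the real
# clang-tidy invocation:
import sys
from multiprocessing.dummy import Pool as ThreadPool

def check_file(path):
    return "checked %s\n" % path  # placeholder for the per-file subprocess call

def run_all(paths):
    pool = ThreadPool(4)
    try:
        # "".join drives the lazy imap iterator to completion and concatenates the reports
        return "".join(pool.imap(check_file, paths))
    except KeyboardInterrupt:
        sys.exit(1)
    finally:
        pool.terminate()  # stop any still-pending work
        pool.join()

print(run_all(["a.cc", "b.cc"]), end="")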
def make_requests(self, type, args, plugins=None):
    pool = ThreadPool(6)
    if not plugins:
        plugins = [p['name'] for p in self.plugins]
    reqs = [(type, p, arg) for p in plugins for arg in args]
    # not sure why returning this doesn't work
    for x in pool.imap(self._run, reqs):
        yield x
def get_reviews_from_imdb(movie_start_id, movie_end_id):
    """
    Save movie reviews in storage.

    Args:
        movie_start_id: the start of the range of the movies.
        movie_end_id: the end of the range of the movies.
    """
    thread_pool = ThreadPool()
    block_size = round((movie_end_id - movie_start_id) / MAX_THREADS)
    for results in thread_pool.imap(get_reviews_from_single_movie,
                                    xrange(movie_start_id, movie_end_id),
                                    block_size):
        if results is not None:
            storage.add_reviews(filter(lambda result: result[0] != "", results))
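# The third positional argument to imap is the chunksize: the id range is cut into blocks of
# roughly (movie_end_id - movie_start_id) / MAX_THREADS, and each worker receives a whole
# block at a time, which cuts per-task dispatch overhead for large ranges. Illustrative
# values only:
from multiprocessing.dummy import Pool as ThreadPool

def fetch(movie_id):
    return movie_id % 7 == 0  # stand-in for the real network call

pool = ThreadPool(8)
# chunksize=50: ids are handed to the workers in blocks of 50
for result in pool.imap(fetch, range(1000, 2000), 50):
    pass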
def run_attack(attack, pipe):
    """
    Given an attack function to run, and a Connection object through which to communicate,
    receive a network and set of fractions to remove, and simulate attacks on that network
    for each fraction of nodes. Puts S1/N back through the pipe for each fraction.
    """
    network = pipe.recv()
    fractions = pipe.recv()
    N = len(network)
    nodes_to_remove = [int(round(f * N)) for f in fractions]
    thread_pool = ThreadPool(5)
    results = thread_pool.imap(lambda x: attack(network, x), nodes_to_remove)
    for res in results:
        pipe.send(gc_size(res, N))
    pipe.close()
def evaluate_retrieval(model, entity_model, eval_items):
    pool = ThreadPool()

    def entity_rank((entity, word_idxs, doc_len)):
        if word_idxs:
            entity_id = entity_model.entities[entity]
            scores = entity_model.score(model, entity_model.vectors, word_idxs)
            rank = np.sum(scores >= scores[entity_id])
        else:
            rank = entity_vecs.shape[0]
        return int(np.log2(doc_len)), np.log2(rank)

    ranks = defaultdict(list)
    for size, rank in pool.imap(entity_rank, eval_items):
        ranks[size].append(rank)

    sorted_ranks = sorted(ranks.iteritems())
    logging.info('%s overall score: %.3f by size: %s',
                 type(entity_model).__name__,
                 np.mean(np.hstack(ranks.values())),
                 ' '.join('%d: %.3f' % (k, np.mean(v)) for k, v in sorted_ranks))
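# The def entity_rank((entity, word_idxs, doc_len)) signature relies on tuple parameter
# unpacking, which exists only in Python 2 (this snippet also uses dict.iteritems()). Under
# Python 3 the worker unpacks explicitly; a minimal equivalent with a placeholder body:
def entity_rank(item):
    entity, word_idxs, doc_len = item
    return doc_len, len(word_idxs)  # placeholder for the real scoring logic

print(entity_rank(("acme", [1, 2, 3], 8)))  # -> (8, 3)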
def get_schools_private(self, iterable):
    pool = ThreadPool(64)
    list(pool.imap(self.handler_schools_private, iterable))
def test_synchronize_with_different_param(self):
    demo = LockDemo()
    pool = ThreadPool(2)
    pool.imap(demo.foo2, range(2))
    sleep(0.02)
    assert_that(demo.call_count, equal_to(2))
def get_data_public(self):
    pool = ThreadPool(64)
    list(pool.imap(self.handler_data_public, self.__schools_links_public))
def check_proxy(proxies):
    """Return validation array for a list of proxies."""
    pool = ThreadPool(processes=512)
    proxy_source = itertools.product(proxies, PROXY_MAP)
    return [p for p in pool.imap(_is_valid_proxy, proxy_source) if p]
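# The worker presumably returns a truthy value (e.g. the proxy/protocol pair) on success and
# a falsy one on failure, so the list comprehension keeps only working proxies. A sketch of
# that shape with a made-up connectivity check:
import itertools
from multiprocessing.dummy import Pool as ThreadPool

PROXY_MAP = ("http", "https")  # assumed: protocols to test each proxy against

def _is_valid_proxy(pair):
    proxy, scheme = pair
    ok = proxy.endswith(":8080")  # stand-in for a real connection attempt
    return (proxy, scheme) if ok else None

proxies = ["10.0.0.1:8080", "10.0.0.2:3128"]
pool = ThreadPool(processes=32)
print([p for p in pool.imap(_is_valid_proxy, itertools.product(proxies, PROXY_MAP)) if p])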
def main(): args = parse_args() socket.setdefaulttimeout(args.timeout) couch_server = mkcouch(args.couch) sessions_db_name = args.sessions_db_name try: sessions_db = couch_server.create(sessions_db_name) except couchdb.PreconditionFailed: sessions_db = couch_server[sessions_db_name] if args.resume or args.resume is None: session_id = args.resume if session_id is None: current_doc = sessions_db['$current'] session_id = current_doc['session_id'] print('Resuming session %s' % session_id) session_doc = sessions_db[session_id] site_host = session_doc['site'] scheme, host = scheme_and_host(site_host) db_name = session_doc['db_name'] session_doc['resumed_at'] = datetime.utcnow().isoformat() if args.start: start_page_name = args.start else: start_page_name = session_doc.get('last_page_name', args.start) if args.desc: descending = True else: descending = session_doc.get('descending', False) sessions_db[session_id] = session_doc else: site_host = args.site db_name = args.db start_page_name = args.start descending = args.desc if not site_host: print('Site to scrape is not specified') raise SystemExit(1) scheme, host = scheme_and_host(site_host) if not db_name: db_name = host.replace('.', '-') session_id = '-'.join((db_name, str(int(time.time())), str(int(1000*random.random())))) print('Starting session %s' % session_id) sessions_db[session_id] = { 'created_at': datetime.utcnow().isoformat(), 'site': site_host, 'db_name': db_name, 'descending': descending } current_doc = sessions_db.get('$current', {}) current_doc['session_id'] = session_id sessions_db['$current'] = current_doc site = mwclient.Site((scheme, host), path=args.site_path, ext=args.site_ext) update_siteinfo(site, couch_server, db_name) if args.siteinfo_only: return try: db = couch_server.create(db_name) except couchdb.PreconditionFailed: db = couch_server[db_name] set_show_func(db) def titles_from_args(titles): for title in titles: if title.startswith('@'): with open(os.path.expanduser(title[1:])) as f: for line in f: yield line.strip() else: yield title def titles_from_recent_changes(timestamp): changes = site.recentchanges(start=timestamp, namespace=0, show='!minor|!redirect|!anon') for change in changes: title = change.get('title') if title: doc = db.get(title) doc_revid = doc.get('parse', {}).get('revid') if doc else None revid = change.get('revid') if doc_revid == revid: continue yield title if args.titles: pages = (site.Pages[title.decode('utf8')] for title in titles_from_args(args.titles)) elif args.changes_since or args.recent: if args.recent: recent_days = args.recent_days changes_since = datetime.strftime( datetime.utcnow() + timedelta(days=-recent_days), '%Y%m%d%H%M%S') else: changes_since = args.changes_since.ljust(14, '0') print('Getting recent changes (since %s)' % changes_since) pages = (site.Pages[title] for title in titles_from_recent_changes(changes_since)) else: print('Starting at %s' % start_page_name) pages = site.allpages(start=start_page_name, dir='descending' if descending else 'ascending') #threads are updating the same session document, #we don't want to have conflicts lock = RLock() def inc_count(count_name): with lock: session_doc = sessions_db[session_id] count = session_doc.get(count_name, 0) session_doc[count_name] = count + 1 sessions_db[session_id] = session_doc def update_session(title): with lock: session_doc = sessions_db[session_id] session_doc['last_page_name'] = title session_doc['updated_at'] = datetime.utcnow().isoformat() sessions_db[session_id] = session_doc def process(page): title = page.name if 
not page.exists: print('Not found: %s' % title) inc_count('not_found') if args.delete_not_found: try: del db[title] except couchdb.ResourceNotFound: print('%s was not in the database' % title) except couchdb.ResourceConflict: print('Conflict while deleting %s' % title) else: print('%s removed from the database' % title) return try: aliases = set() redirect_count = 0 while page.redirect: redirect_count += 1 redirect_target = redirects_to(site, page.name) frag = redirect_target.fragment if frag: alias = (title, frag) else: alias = title aliases.add(alias) page = redirect_target.page print('%s ==> %s' % ( title, page.name + (('#'+frag) if frag else ''))) if redirect_count >= 10: print('Too many redirect levels: %r' % aliases) break title = page.name if page.redirect: print('Failed to resolve redirect %s', title) inc_count('failed_redirect') return doc = db.get(title) if doc: current_aliases = set() for alias in doc.get('aliases', ()): if isinstance(alias, list): alias = tuple(alias) current_aliases.add(alias) if not aliases.issubset(current_aliases): merged_aliases = aliases|current_aliases #remove aliases without fragment if one with fragment is present #this is mostly to cleanup aliases in old scrapes to_remove = set() for alias in merged_aliases: if isinstance(alias, tuple): to_remove.add(alias[0]) merged_aliases = merged_aliases - to_remove doc['aliases'] = list(merged_aliases) db[title] = doc revid = doc.get('parse', {}).get('revid') if page.revision == revid: print('%s is up to date (rev. %s), skipping' % (title, revid)) inc_count('up_to_date') return else: inc_count('updated') print('New rev. %s is available for %s (have rev. %s)' % (page.revision, title, revid)) parse = site.api('parse', page=title) except KeyboardInterrupt as ki: print ('Caught KeyboardInterrupt', ki) thread.interrupt_main() except couchdb.ResourceConflict: print('Update conflict, skipping: %s' % title) return except Exception: print('Failed to process %s:' % title) traceback.print_exc() inc_count('error') return if doc: doc.update(parse) else: inc_count('new') doc = parse if aliases: doc['aliases'] = list(aliases) try: db[title] = doc except couchdb.ResourceConflict: print('Update conflict, skipping: %s' % title) return import pylru seen = pylru.lrucache(10000) def ipages(pages): for index, page in enumerate(pages): title = page.name print('%7s %s' % (index, title)) if title in seen: print('Already saw %s, skipping' % (title,)) continue seen[title] = True update_session(title) yield page with flock(os.path.join(tempfile.gettempdir(), hashlib.sha1(host).hexdigest())): if args.speed: pool = ThreadPool(processes=args.speed*2) for _result in pool.imap(process, ipages(pages)): pass else: for page in ipages(pages): process(page)
def untile_image(self, output_destination): """ Downloads image tiles and joins them. These processes are done in parallel. """ self.num_tiles = self.x_tiles * self.y_tiles self.num_downloaded = 0 self.num_joined = 0 # Progressbars for downloading and joining. download_progressbar = None joining_progressbar = None if progressbar: download_progressbar = progressbar.ProgressBar( widgets=['Loading tiles: ', progressbar.Counter(), '/', str(self.num_tiles), ' ', progressbar.Bar('>', left='[', right=']'), ' ', progressbar.ETA()], maxval=self.num_tiles ) joining_progressbar = progressbar.ProgressBar( widgets=['Joining tiles: ', progressbar.Counter(), '/', str(self.num_tiles), ' ', progressbar.Bar('>', left='[', right=']'), ' ', progressbar.ETA()], maxval=self.num_tiles ) download_progressbar.start() if self.no_download: download_progressbar.finish() joining_progressbar.start() def update_progressbars(): # Update UI info if progressbar: if self.num_downloaded < self.num_tiles: download_progressbar.update(self.num_downloaded) elif not download_progressbar.finished: download_progressbar.finish() joining_progressbar.start() joining_progressbar.update(self.num_joined) # There are already images joined! else: joining_progressbar.update(self.num_joined) def local_tile_path(col, row): return os.path.join(self.tile_dir, "{}_{}.{}".format(col, row, self.ext)) def download(tile_position): col, row = tile_position url = self.get_tile_url(col, row) destination = local_tile_path(col, row) if not progressbar: self.log.debug("Loading tile (row {:3}, col {:3})".format(row, col)) try: download_url(url, destination) except urllib.error.HTTPError as e: self.num_downloaded += 1 self.log.warning( "{}. Tile {} (row {}, col {}) does not exist on the server." .format(e, url, row, col) ) return (None, None) self.num_downloaded += 1 return tile_position # Download tiles in self.nthreads parallel threads. tile_positions = itertools.product(range(self.x_tiles), range(self.y_tiles)) if not self.no_download: pool = ThreadPool(processes=self.nthreads) self.downloaded_iterator = pool.imap(download, tile_positions) else: self.downloaded_iterator = tile_positions self.num_downloaded = self.num_tiles def jplarge(self, joining_progressbar): """ Faster untilig algorithm, assembling columns separately, then assembling those into final image. Cuts down on the cost of constantly opening two huge final images. """ # Do tile joining in parallel with the downloading. # Use 4 temporary files for the joining process. tmpimgs = [] finalimage = [] tempinfo = {'tmp_': tmpimgs, 'final_': finalimage} for i in range(2): for f in iter(tempinfo): fhandle = tempfile.NamedTemporaryFile(suffix='.jpg', prefix=f, dir=self.tile_dir, delete=False) tempinfo[f].append(fhandle.name) fhandle.close() self.log.debug("Created temporary image file: " + tempinfo[f][i]) # The index of current_col temp image to be used for input, toggles between 0 and 1. active_tmp = 0 active_final = 0 # Join tiles into a single image in parallel to them being downloaded. try: subproc = None # Popen class of the most recently called subprocess. current_col = 0 tile_in_column = 0 for i, (col, row) in enumerate(self.downloaded_iterator): if col is None: self.log.debug("Missing col tile!") continue # Tile failed to download. if col == current_col: if not progressbar: self.log.debug("Adding tile (row {:3}, col {:3}) to the image".format(row, col)) # As the very first step create an (almost) empty temp column image, # with the target column dimensions. 
# Don't reuse old tempfile without overwriting it first - # if the file is broken, we want an empty space instead of an image from previous iteration. if tile_in_column == 0 and not current_col == self.x_tiles - 1: subproc = subprocess.Popen([self.jpegtran, '-copy', 'all', '-crop', '{:d}x{:d}+0+0'.format(self.tile_size, self.height), '-outfile', tmpimgs[active_tmp], local_tile_path(col, row) ]) subproc.wait() # Last column may have different width - create tempfile with correct dimensions elif tile_in_column == 0 and current_col == self.x_tiles - 1: subproc = subprocess.Popen([self.jpegtran, '-copy', 'all', '-crop', '{:d}x{:d}+0+0'.format(self.width - ((self.x_tiles - 1) * self.tile_size), self.height), '-outfile', tmpimgs[active_tmp], local_tile_path(col, row) ]) subproc.wait() # Not working on a complete column - just keep adding images. else: subproc = subprocess.Popen([self.jpegtran, '-perfect', '-copy', 'all', '-drop', '+{:d}+{:d}'.format(0, row * self.tile_size), local_tile_path(col, row), '-outfile', tmpimgs[active_tmp], tmpimgs[(active_tmp + 1) % 2] ]) subproc.wait() self.num_joined += 1 update_progressbars() # After untiling of a first column, # create a full sized temp image with the just untiled column if tile_in_column == self.y_tiles - 1 and current_col == 0: subproc = subprocess.Popen([self.jpegtran, '-perfect', '-copy', 'all', '-crop', '{:d}x{:d}+0+0'.format(self.width, self.height), '-outfile', finalimage[active_final], tmpimgs[active_tmp] ]) subproc.wait() current_col += 1 tile_in_column = 0 active_final = (active_final + 1) % 2 active_tmp = (active_tmp + 1) % 2 # Drop just untiled column (other then first) into the full sized temp image. elif tile_in_column == self.y_tiles - 1 and not current_col == 0: subproc = subprocess.Popen([self.jpegtran, '-perfect', '-copy', 'all', '-drop', '+{:d}+{:d}'.format(current_col * self.tile_size, 0), tmpimgs[active_tmp], '-outfile', finalimage[active_final], finalimage[(active_final + 1) % 2] ]) subproc.wait() current_col += 1 tile_in_column = 0 active_final = (active_final + 1) % 2 active_tmp = (active_tmp + 1) % 2 # No column completely untiled, keep working else: tile_in_column += 1 active_tmp = (active_tmp + 1) % 2 # toggle between the two temp images # Optimize the final image and write it to destination subproc = subprocess.Popen([self.jpegtran, '-copy', 'all', '-optimize', '-outfile', output_destination, finalimage[(active_final + 1) % 2] ]) subproc.wait() num_missing = self.num_tiles - self.num_joined if num_missing > 0: self.log.warning( "Image '{3}' is missing {0} tile{1}. " "You might want to download the image at a different zoom level " "(currently {2}) to get the missing part{1}." .format(num_missing, '' if num_missing == 1 else 's', self.zoom_level, output_destination) ) if progressbar and joining_progressbar.start_time is not None: joining_progressbar.finish() except KeyboardInterrupt: # Kill the jpegtran subprocess. if subproc and subproc.poll() is None: subproc.kill() raise finally: #Delete the temporary images. for i in range(2): os.unlink(tmpimgs[i]) os.unlink(finalimage[i]) # Select untiling algorithm # if self.algorithm == 'jt_xl': # jplarge(self, joining_progressbar) # elif self.algorithm == 'jt_std': # jpstandard(self, joining_progressbar) jplarge(self, joining_progressbar)
def process_batch_results(options): ppresults = PostProcessingResults() ##%% Expand some options for convenience output_dir = options.output_dir ##%% Prepare output dir os.makedirs(output_dir, exist_ok=True) ##%% Load ground truth if available ground_truth_indexed_db = None if options.ground_truth_json_file and len( options.ground_truth_json_file) > 0: ground_truth_indexed_db = IndexedJsonDb( options.ground_truth_json_file, b_normalize_paths=True, filename_replacements=options.ground_truth_filename_replacements) # Mark images in the ground truth as positive or negative n_negative, n_positive, n_unknown, n_ambiguous = mark_detection_status( ground_truth_indexed_db, negative_classes=options.negative_classes, unknown_classes=options.unlabeled_classes) print( 'Finished loading and indexing ground truth: {} negative, {} positive, {} unknown, {} ambiguous' .format(n_negative, n_positive, n_unknown, n_ambiguous)) ##%% Load detection results if options.api_detection_results is None: detection_results, other_fields = load_api_results( options.api_output_file, normalize_paths=True, filename_replacements=options.api_output_filename_replacements) ppresults.api_detection_results = detection_results ppresults.api_other_fields = other_fields else: print('Bypassing detection results loading...') assert options.api_other_fields is not None detection_results = options.api_detection_results other_fields = options.api_other_fields detection_categories_map = other_fields['detection_categories'] if 'classification_categories' in other_fields: classification_categories_map = other_fields[ 'classification_categories'] else: classification_categories_map = {} # Add a column (pred_detection_label) to indicate predicted detection status, not separating out the classes if options.include_almost_detections: detection_results['pred_detection_label'] = DetectionStatus.DS_ALMOST confidences = detection_results['max_detection_conf'] detection_results.loc[ confidences >= options.confidence_threshold, 'pred_detection_label'] = DetectionStatus.DS_POSITIVE detection_results.loc[ confidences < options.almost_detection_confidence_threshold, 'pred_detection_label'] = DetectionStatus.DS_NEGATIVE else: detection_results['pred_detection_label'] = \ np.where(detection_results['max_detection_conf'] >= options.confidence_threshold, DetectionStatus.DS_POSITIVE, DetectionStatus.DS_NEGATIVE) n_positives = sum(detection_results['pred_detection_label'] == DetectionStatus.DS_POSITIVE) print( 'Finished loading and preprocessing {} rows from detector output, predicted {} positives' .format(len(detection_results), n_positives)) if options.include_almost_detections: n_almosts = sum(detection_results['pred_detection_label'] == DetectionStatus.DS_ALMOST) print('...and {} almost-positives'.format(n_almosts)) ##%% If we have ground truth, remove images we can't match to ground truth if ground_truth_indexed_db is not None: b_match = [False] * len(detection_results) detector_files = detection_results['file'].tolist() # fn = detector_files[0]; print(fn) for i_fn, fn in enumerate(detector_files): # assert fn in ground_truth_indexed_db.filename_to_id, 'Could not find ground truth for row {} ({})'.format(i_fn,fn) if fn in ground_truth_indexed_db.filename_to_id: b_match[i_fn] = True print('Confirmed filename matches to ground truth for {} of {} files'. 
format(sum(b_match), len(detector_files))) detection_results = detection_results[b_match] detector_files = detection_results['file'].tolist() assert len( detector_files ) > 0, 'No detection files available, possible ground truth path issue?' print('Trimmed detection results to {} files'.format( len(detector_files))) ##%% Sample images for visualization images_to_visualize = detection_results if options.num_images_to_sample > 0 and options.num_images_to_sample <= len( detection_results): images_to_visualize = images_to_visualize.sample( options.num_images_to_sample, random_state=options.sample_seed) output_html_file = '' style_header = """<head> <style type="text/css"> <!-- a { text-decoration:none; } body { font-family:segoe ui, calibri, "trebuchet ms", verdana, arial, sans-serif; } div.contentdiv { margin-left:20px; } --> </style> </head>""" ##%% Fork here depending on whether or not ground truth is available # If we have ground truth, we'll compute precision/recall and sample tp/fp/tn/fn. # # Otherwise we'll just visualize detections/non-detections. if ground_truth_indexed_db is not None: ##%% Detection evaluation: compute precision/recall # numpy array of detection probabilities p_detection = detection_results['max_detection_conf'].values n_detections = len(p_detection) # numpy array of bools (0.0/1.0), and -1 as null value gt_detections = np.zeros(n_detections, dtype=float) for i_detection, fn in enumerate(detector_files): image_id = ground_truth_indexed_db.filename_to_id[fn] image = ground_truth_indexed_db.image_id_to_image[image_id] detection_status = image['_detection_status'] if detection_status == DetectionStatus.DS_NEGATIVE: gt_detections[i_detection] = 0.0 elif detection_status == DetectionStatus.DS_POSITIVE: gt_detections[i_detection] = 1.0 else: gt_detections[i_detection] = -1.0 # Don't include ambiguous/unknown ground truth in precision/recall analysis b_valid_ground_truth = gt_detections >= 0.0 p_detection_pr = p_detection[b_valid_ground_truth] gt_detections_pr = gt_detections[b_valid_ground_truth] print('Including {} of {} values in p/r analysis'.format( np.sum(b_valid_ground_truth), len(b_valid_ground_truth))) precisions, recalls, thresholds = precision_recall_curve( gt_detections_pr, p_detection_pr) # For completeness, include the result at a confidence threshold of 1.0 thresholds = np.append(thresholds, [1.0]) precisions_recalls = pd.DataFrame( data={ 'confidence_threshold': thresholds, 'precision': precisions, 'recall': recalls }) # Compute and print summary statistics average_precision = average_precision_score(gt_detections_pr, p_detection_pr) print('Average precision: {:.1%}'.format(average_precision)) # Thresholds go up throughout precisions/recalls/thresholds; find the last # value where recall is at or above target. That's our precision @ target recall. 
target_recall = 0.9 b_above_target_recall = np.where(recalls >= target_recall) if not np.any(b_above_target_recall): precision_at_target_recall = 0.0 else: i_target_recall = np.argmax(b_above_target_recall) precision_at_target_recall = precisions[i_target_recall] print('Precision at {:.1%} recall: {:.1%}'.format( target_recall, precision_at_target_recall)) cm = confusion_matrix( gt_detections_pr, np.array(p_detection_pr) > options.confidence_threshold) # Flatten the confusion matrix tn, fp, fn, tp = cm.ravel() precision_at_confidence_threshold = tp / (tp + fp) recall_at_confidence_threshold = tp / (tp + fn) f1 = 2.0 * (precision_at_confidence_threshold * recall_at_confidence_threshold) / \ (precision_at_confidence_threshold + recall_at_confidence_threshold) print( 'At a confidence threshold of {:.1%}, precision={:.1%}, recall={:.1%}, f1={:.1%}' .format(options.confidence_threshold, precision_at_confidence_threshold, recall_at_confidence_threshold, f1)) ##%% Collect classification results, if they exist classifier_accuracies = [] # Mapping of classnames to idx for the confusion matrix. # # The lambda is actually kind of a hack, because we use assume that # the following code does not reassign classname_to_idx classname_to_idx = collections.defaultdict( lambda: len(classname_to_idx)) # Confusion matrix as defaultdict of defaultdict # # Rows / first index is ground truth, columns / second index is predicted category classifier_cm = collections.defaultdict( lambda: collections.defaultdict(lambda: 0)) # iDetection = 0; fn = detector_files[iDetection]; print(fn) assert len(detector_files) == len(detection_results) for iDetection, fn in enumerate(detector_files): image_id = ground_truth_indexed_db.filename_to_id[fn] image = ground_truth_indexed_db.image_id_to_image[image_id] detections = detection_results['detections'].iloc[iDetection] pred_class_ids = [det['classifications'][0][0] \ for det in detections if 'classifications' in det.keys()] pred_classnames = [ classification_categories_map[pd] for pd in pred_class_ids ] # If this image has classification predictions, and an unambiguous class # annotated, and is a positive image... if len(pred_classnames) > 0 \ and '_unambiguous_category' in image.keys() \ and image['_detection_status'] == DetectionStatus.DS_POSITIVE: # The unambiguous category, we make this a set for easier handling afterward gt_categories = set([image['_unambiguous_category']]) pred_categories = set(pred_classnames) # Compute the accuracy as intersection of union, # i.e. (# of categories in both prediciton and GT) # divided by (# of categories in either prediction or GT # # In case of only one GT category, the result will be 1.0, if # prediction is one category and this category matches GT # # It is 1.0/(# of predicted top-1 categories), if the GT is # one of the predicted top-1 categories. 
# # It is 0.0, if none of the predicted categories is correct classifier_accuracies.append( len(gt_categories & pred_categories) / len(gt_categories | pred_categories)) image['_classification_accuracy'] = classifier_accuracies[-1] # Distribute this accuracy across all predicted categories in the # confusion matrix assert len(gt_categories) == 1 gt_class_idx = classname_to_idx[list(gt_categories)[0]] for pred_category in pred_categories: pred_class_idx = classname_to_idx[pred_category] classifier_cm[gt_class_idx][pred_class_idx] += 1 # ...for each file in the detection results # If we have classification results if len(classifier_accuracies) > 0: # Build confusion matrix as array from classifier_cm all_class_ids = sorted(classname_to_idx.values()) classifier_cm_array = np.array( [[classifier_cm[r_idx][c_idx] for c_idx in all_class_ids] for r_idx in all_class_ids], dtype=float) classifier_cm_array /= ( classifier_cm_array.sum(axis=1, keepdims=True) + 1e-7) # Print some statistics print("Finished computation of {} classification results".format( len(classifier_accuracies))) print("Mean accuracy: {}".format(np.mean(classifier_accuracies))) # Prepare confusion matrix output # Get confusion matrix as string sio = io.StringIO() np.savetxt(sio, classifier_cm_array * 100, fmt='%5.1f') cm_str = sio.getvalue() # Get fixed-size classname for each idx idx_to_classname = {v: k for k, v in classname_to_idx.items()} classname_list = [ idx_to_classname[idx] for idx in sorted(classname_to_idx.values()) ] classname_headers = [ '{:<5}'.format(cname[:5]) for cname in classname_list ] # Prepend class name on each line and add to the top cm_str_lines = [' ' * 16 + ' '.join(classname_headers)] cm_str_lines += [ '{:>15}'.format(cn[:15]) + ' ' + cm_line for cn, cm_line in zip(classname_list, cm_str.splitlines()) ] # Print formatted confusion matrix print("Confusion matrix: ") print(*cm_str_lines, sep='\n') # Plot confusion matrix # To manually add more space at bottom: plt.rcParams['figure.subplot.bottom'] = 0.1 # # Add 0.5 to figsize for every class. For two classes, this will result in # fig = plt.figure(figsize=[4,4]) fig = vis_utils.plot_confusion_matrix(classifier_cm_array, classname_list, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues, vmax=1.0, use_colorbar=True, y_label=True) cm_figure_relative_filename = 'confusion_matrix.png' cm_figure_filename = os.path.join(output_dir, cm_figure_relative_filename) plt.savefig(cm_figure_filename) plt.close(fig) # ...if we have classification results ##%% Render output # Write p/r table to .csv file in output directory pr_table_filename = os.path.join(output_dir, 'prec_recall.csv') precisions_recalls.to_csv(pr_table_filename, index=False) # Write precision/recall plot to .png file in output directory t = 'Precision-Recall curve: AP={:0.1%}, P@{:0.1%}={:0.1%}'.format( average_precision, target_recall, precision_at_target_recall) fig = vis_utils.plot_precision_recall_curve(precisions, recalls, t) pr_figure_relative_filename = 'prec_recall.png' pr_figure_filename = os.path.join(output_dir, pr_figure_relative_filename) plt.savefig(pr_figure_filename) # plt.show(block=False) plt.close(fig) ##%% Sampling # Sample true/false positives/negatives with correct/incorrect top-1 # classification and render to html # Accumulate html image structs (in the format expected by write_html_image_lists) # for each category, e.g. 'tp', 'fp', ..., 'class_bird', ... 
images_html = collections.defaultdict(lambda: []) # Add default entries by accessing them for the first time [images_html[res] for res in ['tp', 'tpc', 'tpi', 'fp', 'tn', 'fn']] for res in images_html.keys(): os.makedirs(os.path.join(output_dir, res), exist_ok=True) image_count = len(images_to_visualize) # Each element will be a list of 2-tuples, with elements [collection name,html info struct] rendering_results = [] # Each element will be a three-tuple with elements file,max_conf,detections files_to_render = [] # Assemble the information we need for rendering, so we can parallelize without # dealing with Pandas # i_row = 0; row = images_to_visualize.iloc[0] for _, row in images_to_visualize.iterrows(): # Filenames should already have been normalized to either '/' or '\' files_to_render.append( [row['file'], row['max_detection_conf'], row['detections']]) def render_image_with_gt(file_info): image_relative_path = file_info[0] max_conf = file_info[1] detections = file_info[2] # This should already have been normalized to either '/' or '\' image_id = ground_truth_indexed_db.filename_to_id.get( image_relative_path, None) if image_id is None: print('Warning: couldn' 't find ground truth for image {}'.format( image_relative_path)) return None image = ground_truth_indexed_db.image_id_to_image[image_id] annotations = ground_truth_indexed_db.image_id_to_annotations[ image_id] gt_status = image['_detection_status'] gt_presence = bool(gt_status) gt_classes = CameraTrapJsonUtils.annotations_to_classnames( annotations, ground_truth_indexed_db.cat_id_to_name) gt_class_summary = ','.join(gt_classes) if gt_status > DetectionStatus.DS_MAX_DEFINITIVE_VALUE: print( 'Skipping image {}, does not have a definitive ground truth status (status: {}, classes: {})' .format(image_id, gt_status, gt_class_summary)) return None detected = max_conf > options.confidence_threshold if gt_presence and detected: if '_classification_accuracy' not in image.keys(): res = 'tp' elif np.isclose(1, image['_classification_accuracy']): res = 'tpc' else: res = 'tpi' elif not gt_presence and detected: res = 'fp' elif gt_presence and not detected: res = 'fn' else: res = 'tn' display_name = '<b>Result type</b>: {}, <b>Presence</b>: {}, <b>Class</b>: {}, <b>Max conf</b>: {:0.2f}%, <b>Image</b>: {}'.format( res.upper(), str(gt_presence), gt_class_summary, max_conf * 100, image_relative_path) rendered_image_html_info = render_bounding_boxes( options.image_base_dir, image_relative_path, display_name, detections, res, detection_categories_map, classification_categories_map, options) image_result = None if len(rendered_image_html_info) > 0: image_result = [[res, rendered_image_html_info]] for gt_class in gt_classes: image_result.append([ 'class_{}'.format(gt_class), rendered_image_html_info ]) return image_result # ...def render_image_with_gt(file_info) start_time = time.time() if options.parallelize_rendering: if options.parallelize_rendering_n_cores is None: pool = ThreadPool() else: print('Rendering images with {} workers'.format( options.parallelize_rendering_n_cores)) pool = ThreadPool(options.parallelize_rendering_n_cores) rendering_results = list( tqdm(pool.imap(render_image_with_gt, files_to_render), total=len(files_to_render))) else: # file_info = files_to_render[0] for file_info in tqdm(files_to_render): rendering_results.append(render_image_with_gt(file_info)) elapsed = time.time() - start_time # Map all the rendering results in the list rendering_results into the # dictionary images_html image_rendered_count = 0 for rendering_result 
in rendering_results: if rendering_result is None: continue image_rendered_count += 1 for assignment in rendering_result: images_html[assignment[0]].append(assignment[1]) # Prepare the individual html image files image_counts = prepare_html_subpages(images_html, output_dir) print('{} images rendered (of {})'.format(image_rendered_count, image_count)) # Write index.html all_tp_count = image_counts['tp'] + image_counts['tpc'] + image_counts[ 'tpi'] total_count = all_tp_count + image_counts['tn'] + image_counts[ 'fp'] + image_counts['fn'] classification_detection_results = """ <a href="tpc.html">with all correct top-1 predictions (TPC)</a> ({})<br/> <a href="tpi.html">with one or more incorrect top-1 prediction (TPI)</a> ({})<br/> <a href="tp.html">without classification evaluation</a><sup>*</sup> ({})<br/>""".format( image_counts['tpc'], image_counts['tpi'], image_counts['tp']) index_page = """<html> {} <body> <h2>Evaluation</h2> <h3>Sample images</h3> <div style="margin-left:20px;"> <p>A sample of {} images, annotated with detections above {:.1%} confidence.</p> <a href="tp.html">True positives (TP)</a> ({}) ({:0.1%})<br/> CLASSIFICATION_PLACEHOLDER_1 <a href="tn.html">True negatives (TN)</a> ({}) ({:0.1%})<br/> <a href="fp.html">False positives (FP)</a> ({}) ({:0.1%})<br/> <a href="fn.html">False negatives (FN)</a> ({}) ({:0.1%})<br/> CLASSIFICATION_PLACEHOLDER_2 </div> """.format(style_header, image_count, options.confidence_threshold, all_tp_count, all_tp_count / total_count, image_counts['tn'], image_counts['tn'] / total_count, image_counts['fp'], image_counts['fp'] / total_count, image_counts['fn'], image_counts['fn'] / total_count) index_page += """ <h3>Detection results</h3> <div class="contentdiv"> <p>At a confidence threshold of {:0.1%}, precision={:0.1%}, recall={:0.1%}</p> <p><strong>Precision/recall summary for all {} images</strong></p><img src="{}"><br/> </div> """.format(options.confidence_threshold, precision_at_confidence_threshold, recall_at_confidence_threshold, len(detection_results), pr_figure_relative_filename) if len(classifier_accuracies) > 0: index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_1', classification_detection_results) index_page = index_page.replace( 'CLASSIFICATION_PLACEHOLDER_2', """<p><sup>*</sup>We do not evaluate the classification result of images if the classification information is missing, if the image contains categories like ‘empty’ or ‘human’, or if the image has multiple classification labels.</p>""") else: index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_1', '') index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_2', '') if len(classifier_accuracies) > 0: index_page += """ <h3>Classification results</h3> <div class="contentdiv"> <p>Classification accuracy: {:.2%}<br> The accuracy is computed only for images with exactly one classification label. The accuracy of an image is computed as 1/(number of unique detected top-1 classes), i.e. if the model detects multiple boxes with different top-1 classes, then the accuracy decreases and the image is put into 'TPI'.</p> <p>Confusion matrix:</p> <p><img src="{}"></p> <div style='font-family:monospace;display:block;'>{}</div> </div> """.format(np.mean(classifier_accuracies), cm_figure_relative_filename, "<br>".join(cm_str_lines).replace(' ', ' ')) # Show links to each GT class # # We could do this without classification results; currently we don't. 
if len(classname_to_idx) > 0: index_page += '<h3>Images of specific classes</h3><br/><div class="contentdiv">' # Add links to all available classes for cname in sorted(classname_to_idx.keys()): index_page += "<a href='class_{0}.html'>{0}</a> ({1})<br>".format( cname, len(images_html['class_{}'.format(cname)])) index_page += "</div>" # Close body and html tags index_page += "</body></html>" output_html_file = os.path.join(output_dir, 'index.html') with open(output_html_file, 'w') as f: f.write(index_page) print('Finished writing html to {}'.format(output_html_file)) # ...for each image ##%% Otherwise, if we don't have ground truth... else: ##%% Sample detections/non-detections # Accumulate html image structs (in the format expected by write_html_image_lists) # for each category images_html = collections.defaultdict(lambda: []) # Add default entries by accessing them for the first time [images_html[res] for res in ['detections', 'non_detections']] if options.include_almost_detections: images_html['almost_detections'] # Create output directories for res in images_html.keys(): os.makedirs(os.path.join(output_dir, res), exist_ok=True) image_count = len(images_to_visualize) has_classification_info = False # Each element will be a list of 2-tuples, with elements [collection name,html info struct] rendering_results = [] # Each element will be a three-tuple with elements file,max_conf,detections files_to_render = [] # Assemble the information we need for rendering, so we can parallelize without # dealing with Pandas # i_row = 0; row = images_to_visualize.iloc[0] for _, row in images_to_visualize.iterrows(): # Filenames should already have been normalized to either '/' or '\' files_to_render.append( [row['file'], row['max_detection_conf'], row['detections']]) # Local function for parallelization def render_image_no_gt(file_info): image_relative_path = file_info[0] max_conf = file_info[1] detections = file_info[2] detection_status = DetectionStatus.DS_UNASSIGNED if max_conf >= options.confidence_threshold: detection_status = DetectionStatus.DS_POSITIVE else: if options.include_almost_detections: if max_conf >= options.almost_detection_confidence_threshold: detection_status = DetectionStatus.DS_ALMOST else: detection_status = DetectionStatus.DS_NEGATIVE else: detection_status = DetectionStatus.DS_NEGATIVE if detection_status == DetectionStatus.DS_POSITIVE: res = 'detections' elif detection_status == DetectionStatus.DS_NEGATIVE: res = 'non_detections' else: assert detection_status == DetectionStatus.DS_ALMOST res = 'almost_detections' display_name = '<b>Result type</b>: {}, <b>Image</b>: {}, <b>Max conf</b>: {}'.format( res, image_relative_path, max_conf) rendering_options = copy.copy(options) if detection_status == DetectionStatus.DS_ALMOST: rendering_options.confidence_threshold = rendering_options.almost_detection_confidence_threshold rendered_image_html_info = render_bounding_boxes( options.image_base_dir, image_relative_path, display_name, detections, res, detection_categories_map, classification_categories_map, rendering_options) image_result = None if len(rendered_image_html_info) > 0: image_result = [[res, rendered_image_html_info]] for det in detections: if 'classifications' in det: top1_class = classification_categories_map[ det['classifications'][0][0]] image_result.append([ 'class_{}'.format(top1_class), rendered_image_html_info ]) return image_result # ...def render_image_no_gt(file_info): start_time = time.time() if options.parallelize_rendering: if options.parallelize_rendering_n_cores 
is None: pool = ThreadPool() else: print('Rendering images with {} workers'.format( options.parallelize_rendering_n_cores)) pool = ThreadPool(options.parallelize_rendering_n_cores) rendering_results = list( tqdm(pool.imap(render_image_no_gt, files_to_render), total=len(files_to_render))) else: for file_info in tqdm(files_to_render): rendering_results.append(render_image_no_gt(file_info)) elapsed = time.time() - start_time # Map all the rendering results in the list rendering_results into the # dictionary images_html image_rendered_count = 0 for rendering_result in rendering_results: if rendering_result is None: continue image_rendered_count += 1 for assignment in rendering_result: if 'class' in assignment[0]: has_classification_info = True images_html[assignment[0]].append(assignment[1]) # Prepare the individual html image files image_counts = prepare_html_subpages(images_html, output_dir) print('Rendered {} images (of {}) in {} ({} per image)'.format( image_rendered_count, image_count, humanfriendly.format_timespan(elapsed), humanfriendly.format_timespan(elapsed / image_rendered_count))) # Write index.HTML total_images = image_counts['detections'] + image_counts[ 'non_detections'] if options.include_almost_detections: total_images += image_counts['almost_detections'] assert (total_images == image_count) almost_detection_string = '' if options.include_almost_detections: almost_detection_string = ' (“almost detection” threshold at {:.1%})'.format( options.almost_detection_confidence_threshold) index_page = """<html>{}<body> <h2>Visualization of results</h2> <p>A sample of {} images, annotated with detections above {:.1%} confidence{}.</p> <h3>Sample images</h3> <div class="contentdiv"> <a href="detections.html">detections</a> ({}, {:.1%})<br/> <a href="non_detections.html">non-detections</a> ({}, {:.1%})<br/>""".format( style_header, image_count, options.confidence_threshold, almost_detection_string, image_counts['detections'], image_counts['detections'] / total_images, image_counts['non_detections'], image_counts['non_detections'] / total_images) if options.include_almost_detections: index_page += """<a href="almost_detections.html">almost-detections</a> ({}, {:.1%})<br/>""".format( image_counts['almost_detections'], image_counts['almost_detections'] / total_images) index_page += '</div>\n' if has_classification_info: index_page += "<h3>Images of detected classes</h3>" index_page += "<p>The same image might appear under multiple classes if multiple species were detected.</p>\n<div class='contentdiv'>\n" # Add links to all available classes for cname in sorted(classification_categories_map.values()): ccount = len(images_html['class_{}'.format(cname)]) if ccount > 0: index_page += "<a href='class_{}.html'>{}</a> ({})<br/>\n".format( cname, cname.lower(), ccount) index_page += "</div>\n" index_page += "</body></html>" output_html_file = os.path.join(output_dir, 'index.html') with open(output_html_file, 'w') as f: f.write(index_page) print('Finished writing html to {}'.format(output_html_file)) # os.startfile(output_html_file) # ...if we do/don't have ground truth ppresults.output_html_file = output_html_file return ppresults
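# Both rendering branches above use the same progress idiom: wrap the lazy pool.imap iterator
# in tqdm and force it with list(...) so the bar advances as each image is rendered and the
# call blocks until all of them are done. In isolation, with a placeholder render function:
from multiprocessing.dummy import Pool as ThreadPool
from tqdm import tqdm

def render(item):
    return item * 2  # placeholder for the real rendering work

items = list(range(100))
pool = ThreadPool(8)
results = list(tqdm(pool.imap(render, items), total=len(items)))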
def get_cities(self):
    pool = ThreadPool(16)
    list(pool.imap(self.handler_cities, self.__get_states()))
def get_school_category(self):
    pool = ThreadPool(64)
    list(pool.imap(self.handler_categories, self.__cities))
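# These scraper helpers (and the similar ones above) call list(pool.imap(...)) purely for the
# side effects: materialising the iterator blocks until every handler has run, and the return
# values are simply discarded. The minimal form of that idiom:
from multiprocessing.dummy import Pool as ThreadPool

def handler(city):
    print("scraping", city)  # side effect only

pool = ThreadPool(16)
list(pool.imap(handler, ["Austin", "Boston", "Chicago"]))  # blocks until all handlers finish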
class StockExchange(): def __init__(self, max_worker=5): self._status = 'close' self._expire_at = 0 self._thread_pools = Pool(max_worker) @property def market_status(self): now = datetime.now() if self._expire_at < now.timestamp(): self._update_market_status(now) return self._status @property def market_am_open(self): now = datetime.now() if self._expire_at < now.timestamp(): self._update_market_status(now) return self._market_am_open @property def market_am_close(self): now = datetime.now() if self._expire_at < now.timestamp(): self._update_market_status(now) return self._market_am_close @property def market_fm_open(self): now = datetime.now() if self._expire_at < now.timestamp(): self._update_market_status(now) return self._market_fm_open @property def market_fm_close(self): now = datetime.now() if self._expire_at < now.timestamp(): self._update_market_status(now) return self._market_fm_close def _update_market_status(self, now): hq = self.hq('sh000001') hq_date = hq.loc['sh000001', 'date'] hq_time = hq.loc['sh000001', 'time'] hq_datetime = datetime.strptime(hq_date + ' ' + hq_time, '%Y-%m-%d %H:%M:%S') self._market_am_open = hq_datetime.replace(hour=9, minute=25, second=0, microsecond=0) self._market_am_close = hq_datetime.replace(hour=11, minute=30, second=0, microsecond=0) self._market_fm_open = hq_datetime.replace(hour=13, minute=0, second=0, microsecond=0) self._market_fm_close = hq_datetime.replace(hour=15, minute=0, second=0, microsecond=0) if hq_datetime.date() < now.date(): self._status = 'close' self._expire_at = (self._market_am_open + timedelta(days=1)).timestamp() else: if hq_datetime < self._market_am_close: self._status = 'trading' self._expire_at = self._market_am_close.timestamp() elif hq_datetime < self._market_fm_open: self._status = 'break' self._expire_at = self._market_fm_open.timestamp() elif hq_datetime < self._market_fm_close: self._status = 'trading' self._expire_at = self._market_fm_close.timestamp() else: self._status = 'close' self._expire_at = (self._market_am_open + timedelta(days=1)).timestamp() return def hq(self, *symbols): '''行情接口——默认使用新浪的行情接口 :param symbols: [ 'sz150023','sz150022','sz159915'] :return: 行情数据 ''' _symbols = [] for s in symbols: if isinstance(s, (list, set, tuple)): _symbols.extend(s) else: _symbols.append(s) symbols = _symbols urls = [] for i in range(0, len(symbols), _MAX_SINA_HQ_LIST): url = 'http://hq.sinajs.cn/?rn=%d&list=' % int(time.time()) urls.append(url + ','.join(symbols[i:i + _MAX_SINA_HQ_LIST])) respones = self._thread_pools.imap(requests.get, urls) data = list() for r in respones: lines = r.text.splitlines() for line in lines: d = line.split('"')[1].split(',') # 如果格式不正确,则返回nan if len(d) != len(_SINA_STOCK_KEYS): d = [np.nan] * len(_SINA_STOCK_KEYS) data.append(d) df = pd.DataFrame(data, index=symbols, columns=_SINA_STOCK_KEYS, dtype='float') df.index.name = 'symbol' df['volume'] = df['volume'] // 100 if 'volume' in _SINA_STOCK_KEYS and 'lasttrade' in _SINA_STOCK_KEYS and 'yclose' in _SINA_STOCK_KEYS: df.loc[df.volume == 0, 'lasttrade'] = df['yclose'] return df def bar(self, symbol, start='', end='', ktype='D', adjtype='forward'): ''' 获取k线的函数 :param symbol: 证券代码,如: sz150023,sz000001,sh000001 :param start: 起始日期,datetime or '2016-01-01' :param end: 终止日期, datetime or '2016-03-31' :param ktype: k线频率, D=日k线 W=周 M=月 5=5分钟 15=15分钟 30=30分钟 60=60分钟,默认为D :param adjtype: 复权调整类型, forward-前复权 afterward-后复权 None-不复权,默认为:forward :return: 返回DataFrame index : date columns : [symbol, open, high, low, close, volume] ''' # 
将start,end转换成datetime格式 if isinstance(start, str): if start == '': start = datetime.now().replace(month=1, day=1, hour=0, minute=0) else: start = datetime.strptime(start, '%Y-%m-%d') if isinstance(end, str): if end == '': end = datetime.now().replace(hour=0, minute=0, microsecond=0) else: end = datetime.strptime(end, '%Y-%m-%d') # 判断是否需要使用最新的行情数据 if (end.date() >= datetime.today().date()) and (self.market_status != 'close'): hq = self._thread_pools.apply_async(self.hq, args=([symbol])) need_update_hq = True else: need_update_hq = False ktype = _KTYPE[ktype.upper()] if adjtype: adjtype = _ADJTYPE[adjtype.lower()] else: adjtype = '' data = [] results = [] for year in range(start.year - 1, end.year + 1): kwargs = { 'year': year, 'symbol': symbol, 'ktype': ktype, 'adjtype': adjtype } results.append( self._thread_pools.apply_async(self._parser_bar, kwds=kwargs)) for result in results: result = result.get() data.extend(result) data = self._thread_pools.map(lambda x: x[:6], data) df = pd.DataFrame( data, columns=['date', 'open', 'close', 'high', 'low', 'volume'], dtype='float') df = df.set_index('date') df = df.sort_index() if need_update_hq: hq = hq.get() date = hq.loc[symbol, 'date'] bar_last_close = df.ix[-1, 'close'] hq_yclose = hq.loc[symbol, 'yclose'] if bar_last_close != hq_yclose: adj = hq_yclose / bar_last_close df[['open', 'high', 'low', 'close']] = df[['open', 'high', 'low', 'close']] / adj df.loc[date, 'open'] = hq.loc[symbol, 'open'] df.loc[date, 'close'] = hq.loc[symbol, 'lasttrade'] df.loc[date, 'high'] = hq.loc[symbol, 'high'] df.loc[date, 'low'] = hq.loc[symbol, 'low'] df.loc[date, 'volume'] = hq.loc[symbol, 'volume'] else: df = df.loc[df.index <= end.strftime('%Y-%m-%d')] df['symbol'] = symbol df['yclose'] = df['close'].shift(1) df['chg'] = df['close'].pct_change(1) * 100 df['chg'] = df['chg'].round(2) df = df[[ 'symbol', 'open', 'high', 'low', 'close', 'yclose', 'chg', 'volume' ]] df = df.loc[df.index >= start.strftime('%Y-%m-%d')] return df @cache(TTLTimer(hours=9)) @retry(3) def _parser_bar(self, year, symbol, ktype, adjtype): url = _BAR_URL_TEMPLATE % (adjtype, year, symbol, ktype, year, year, adjtype, random()) r = requests.get(url) r.raise_for_status() d = r.text d = d.split('=')[1] d = json.loads(d)['data'] if '%s%s' % (adjtype, ktype) in d[symbol].keys(): d = d[symbol]['%s%s' % (adjtype, ktype)] else: d = d[symbol][ktype] return d def mbar(self, symbol, ktype='1', adjtype='forward'): pass def tick(self, symbol, date=None): params = {'symbol': symbol, 'date': date} r = requests.get(url='http://market.finance.sina.com.cn/downxls.php', params=params) tick_xls = BytesIO(r.content) tick_val = tick_xls.getvalue() if tick_val.find(b'alert') != -1 or len(tick_val) < 20: df = pd.DataFrame([], columns=[ 'date', 'symbol', 'type', 'price', 'change', 'amount' ]) df = df.set_index('date') return df else: df = pd.read_table(tick_xls, names=_TICK_COLUMNS, skiprows=[0], encoding='GBK') df['date'] = df['time'].apply(lambda x: '%s %s' % (date, x)) df = df.set_index('date') d = {'买盘': 'B', '卖盘': 'S', '中性盘': 'M'} df['type'] = df['type'].apply(lambda x: d[x]) df['symbol'] = symbol df = df.sort_index() return df[['symbol', 'type', 'price', 'change', 'amount']]
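# The hq() method above maps requests.get directly over the batched quote URLs; because imap
# preserves input order, each response lines up with the URL (and symbol batch) it was built
# from. The core of that pattern, with placeholder URLs:
import requests
from multiprocessing.dummy import Pool as ThreadPool

urls = [
    "https://httpbin.org/get?page=1",
    "https://httpbin.org/get?page=2",
]
pool = ThreadPool(4)
for url, response in zip(urls, pool.imap(requests.get, urls)):
    print(url, response.status_code)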
def process_images(db_path, output_dir, image_base_dir, options=None): """ Writes images and html to output_dir to visualize the annotations in the json file db_path. db_path can also be a previously-loaded database. Returns the html filename and the database: return htmlOutputFile,image_db """ if options is None: options = DbVizOptions() print(options.__dict__) os.makedirs(os.path.join(output_dir, 'rendered_images'), exist_ok=True) assert (os.path.isdir(image_base_dir)) if isinstance(db_path, str): assert (os.path.isfile(db_path)) print('Loading database from {}...'.format(db_path)) image_db = json.load(open(db_path)) print('...done') elif isinstance(db_path, dict): print('Using previously-loaded DB') image_db = db_path else: raise ValueError('Illegal dictionary or filename') annotations = image_db['annotations'] images = image_db['images'] categories = image_db['categories'] # Optionally remove all images without bounding boxes, *before* sampling if options.trim_to_images_with_bboxes: bHasBbox = [False] * len(annotations) for iAnn, ann in enumerate(annotations): if 'bbox' in ann: assert isinstance(ann['bbox'], list) bHasBbox[iAnn] = True annotationsWithBboxes = list(compress(annotations, bHasBbox)) imageIDsWithBboxes = [x['image_id'] for x in annotationsWithBboxes] imageIDsWithBboxes = set(imageIDsWithBboxes) bImageHasBbox = [False] * len(images) for iImage, image in enumerate(images): imageID = image['id'] if imageID in imageIDsWithBboxes: bImageHasBbox[iImage] = True imagesWithBboxes = list(compress(images, bImageHasBbox)) images = imagesWithBboxes # Optionally remove images with specific labels, *before* sampling if options.classes_to_exclude is not None: print('Indexing database') indexed_db = IndexedJsonDb(image_db) bValidClass = [True] * len(images) for iImage, image in enumerate(images): classes = indexed_db.get_classes_for_image(image) for excludedClass in options.classes_to_exclude: if excludedClass in classes: bValidClass[iImage] = False break imagesWithValidClasses = list(compress(images, bValidClass)) images = imagesWithValidClasses # Put the annotations in a dataframe so we can select all annotations for a given image print('Creating data frames') df_anno = pd.DataFrame(annotations) df_img = pd.DataFrame(images) # Construct label map label_map = {} for cat in categories: label_map[int(cat['id'])] = cat['name'] # Take a sample of images if options.num_to_visualize is not None: df_img = df_img.sample(n=options.num_to_visualize, random_state=options.random_seed) images_html = [] # Set of dicts representing inputs to render_db_bounding_boxes: # # bboxes, boxClasses, image_path rendering_info = [] print('Preparing rendering list') # iImage = 0 for iImage in tqdm(range(len(df_img))): img_id = df_img.iloc[iImage]['id'] img_relative_path = df_img.iloc[iImage]['file_name'] img_path = os.path.join( image_base_dir, image_filename_to_path(img_relative_path, image_base_dir)) annos_i = df_anno.loc[df_anno['image_id'] == img_id, :] # all annotations on this image bboxes = [] boxClasses = [] # All the class labels we've seen for this image (with out without bboxes) imageCategories = set() annotationLevelForImage = '' # Iterate over annotations for this image # iAnn = 0; anno = annos_i.iloc[iAnn] for iAnn, anno in annos_i.iterrows(): if 'sequence_level_annotation' in anno: bSequenceLevelAnnotation = anno['sequence_level_annotation'] if bSequenceLevelAnnotation: annLevel = 'sequence' else: annLevel = 'image' if annotationLevelForImage == '': annotationLevelForImage = annLevel elif 
annotationLevelForImage != annLevel: annotationLevelForImage = 'mixed' categoryID = anno['category_id'] categoryName = label_map[categoryID] if options.add_search_links: categoryName = categoryName.replace('"', '') categoryName = '<a href="https://www.bing.com/images/search?q={}">{}</a>'.format( categoryName, categoryName) imageCategories.add(categoryName) if 'bbox' in anno: bbox = anno['bbox'] if isinstance(bbox, float): assert math.isnan( bbox ), "I shouldn't see a bbox that's neither a box nor NaN" continue bboxes.append(bbox) boxClasses.append(anno['category_id']) imageClasses = ', '.join(imageCategories) file_name = '{}_gtbbox.jpg'.format(img_id.lower().split('.jpg')[0]) file_name = file_name.replace('/', '~') rendering_info.append({ 'bboxes': bboxes, 'boxClasses': boxClasses, 'img_path': img_path, 'output_file_name': file_name }) labelLevelString = '' if len(annotationLevelForImage) > 0: labelLevelString = ' (annotation level: {})'.format( annotationLevelForImage) # We're adding html for an image before we render it, so it's possible this image will # fail to render. For applications where this script is being used to debug a database # (the common case?), this is useful behavior; for other applications, it is annoying. # # TODO: optionally write html only for images where rendering succeeded images_html.append({ 'filename': '{}/{}'.format('rendered_images', file_name), 'title': '{}<br/>{}, number of boxes: {}, class labels: {}{}'.format( img_relative_path, img_id, len(bboxes), imageClasses, labelLevelString), 'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5' }) # ...for each image def render_image_info(rendering_info): img_path = rendering_info['img_path'] bboxes = rendering_info['bboxes'] bboxClasses = rendering_info['boxClasses'] output_file_name = rendering_info['output_file_name'] if not os.path.exists(img_path): print('Image {} cannot be found'.format(img_path)) return try: original_image = vis_utils.open_image(img_path) original_size = original_image.size image = vis_utils.resize_image(original_image, options.viz_size[0], options.viz_size[1]) except Exception as e: print('Image {} failed to open. 
Error: {}'.format(img_path, e)) return vis_utils.render_db_bounding_boxes(boxes=bboxes, classes=bboxClasses, image=image, original_size=original_size, label_map=label_map) image.save( os.path.join(output_dir, 'rendered_images', output_file_name)) # ...def render_image_info print('Rendering images') start_time = time.time() if options.parallelize_rendering: if options.parallelize_rendering_n_cores is None: pool = ThreadPool() else: print('Rendering images with {} workers'.format( options.parallelize_rendering_n_cores)) pool = ThreadPool(options.parallelize_rendering_n_cores) list(tqdm(pool.imap(render_image_info, rendering_info), total=len(rendering_info))) else: for file_info in tqdm(rendering_info): render_image_info(file_info) elapsed = time.time() - start_time print('Rendered {} images in {}'.format( len(rendering_info), humanfriendly.format_timespan(elapsed))) if options.sort_by_filename: images_html = sorted(images_html, key=lambda x: x['filename']) htmlOutputFile = os.path.join(output_dir, 'index.html') htmlOptions = options.htmlOptions if isinstance(db_path, str): htmlOptions[ 'headerHtml'] = '<h1>Sample annotations from {}</h1>'.format( db_path) else: htmlOptions['headerHtml'] = '<h1>Sample annotations</h1>' write_html_image_list(filename=htmlOutputFile, images=images_html, options=htmlOptions) print('Visualized {} images, wrote results to {}'.format( len(images_html), htmlOutputFile)) return htmlOutputFile, image_db
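A note on the parallel-rendering call above: ThreadPool.imap hands back a lazy iterator, and wrapping it in tqdm without consuming it neither drives the progress bar nor waits for the workers before the elapsed-time print, which is why the call is wrapped in list(). A minimal standalone sketch of the consume-with-tqdm pattern (the render function and task list here are placeholders, not part of the script above):

from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def render(task):
    # placeholder per-item work
    return task * task

tasks = list(range(100))
pool = ThreadPool(4)
# list(...) drains the lazy imap iterator, so this line blocks until every
# task has run and tqdm can actually advance its progress bar
results = list(tqdm(pool.imap(render, tasks), total=len(tasks)))
pool.close()
pool.join()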
class ThreadPoolStrategy(ConcurrentStrategy, _PoolRunnableStrategy, _Resultable): _Thread_Pool: ThreadPool = None _Thread_List: List[Union[ApplyResult, AsyncResult]] = None def __init__(self, pool_size: int): super().__init__(pool_size=pool_size) def initialization(self, queue_tasks: Optional[Union[_BaseQueueTask, _BaseList]] = None, features: Optional[Union[_BaseFeatureAdapterFactory, _BaseList]] = None, *args, **kwargs) -> None: super(ThreadPoolStrategy, self).initialization(queue_tasks=queue_tasks, features=features, *args, **kwargs) # Initialize and build the thread pool. __pool_initializer: Callable = kwargs.get("pool_initializer", None) __pool_initargs: IterableType = kwargs.get("pool_initargs", None) self._Thread_Pool = ThreadPool(processes=self.pool_size, initializer=__pool_initializer, initargs=__pool_initargs) def apply(self, tasks_size: int, function: Callable, args: Tuple = (), kwargs: Dict = {}) -> None: self.reset_result() __process_running_result = None try: __process_running_result = [ self._Thread_Pool.apply(func=function, args=args, kwds=kwargs) for _ in range(tasks_size) ] __exception = None __process_run_successful = True except Exception as e: __exception = e __process_run_successful = False # Save Running result state and Running result value as dict self._result_saving(successful=__process_run_successful, result=__process_running_result, exception=__exception) def async_apply(self, tasks_size: int, function: Callable, args: Tuple = (), kwargs: Dict = {}, callback: Callable = None, error_callback: Callable = None) -> None: self.reset_result() self._Thread_List = [ self._Thread_Pool.apply_async(func=function, args=args, kwds=kwargs, callback=callback, error_callback=error_callback) for _ in range(tasks_size) ] for process in self._Thread_List: _process_running_result = None _process_run_successful = None _exception = None try: _process_running_result = process.get() _process_run_successful = process.successful() except Exception as e: _exception = e _process_run_successful = False # Save Running result state and Running result value as dict self._result_saving(successful=_process_run_successful, result=_process_running_result, exception=_exception) def apply_with_iter(self, functions_iter: List[Callable], args_iter: List[Tuple] = None, kwargs_iter: List[Dict] = None) -> None: self.reset_result() __process_running_result = None if args_iter is None: args_iter = [() for _ in functions_iter] if kwargs_iter is None: kwargs_iter = [{} for _ in functions_iter] try: __process_running_result = [ self._Thread_Pool.apply(func=_func, args=_args, kwds=_kwargs) for _func, _args, _kwargs in zip(functions_iter, args_iter, kwargs_iter) ] __exception = None __process_run_successful = True except Exception as e: __exception = e __process_run_successful = False # Save Running result state and Running result value as dict self._result_saving(successful=__process_run_successful, result=__process_running_result, exception=__exception) def async_apply_with_iter( self, functions_iter: List[Callable], args_iter: List[Tuple] = None, kwargs_iter: List[Dict] = None, callback_iter: List[Callable] = None, error_callback_iter: List[Callable] = None) -> None: self.reset_result() if args_iter is None: args_iter = [() for _ in functions_iter] if kwargs_iter is None: kwargs_iter = [{} for _ in functions_iter] if callback_iter is None: callback_iter = [None for _ in functions_iter] if error_callback_iter is None: error_callback_iter = [None for _ in functions_iter] self._Thread_List = [ 
self._Thread_Pool.apply_async(func=_func, args=_args, kwds=_kwargs, callback=_callback, error_callback=_error_callback) for _func, _args, _kwargs, _callback, _error_callback in zip( functions_iter, args_iter, kwargs_iter, callback_iter, error_callback_iter) ] for process in self._Thread_List: _process_running_result = None _process_run_successful = None _exception = None try: _process_running_result = process.get() _process_run_successful = process.successful() except Exception as e: _exception = e _process_run_successful = False # Save Running result state and Running result value as dict self._result_saving(successful=_process_run_successful, result=_process_running_result, exception=_exception) def map(self, function: Callable, args_iter: IterableType = (), chunksize: int = None) -> None: self.reset_result() __process_running_result = None try: __process_running_result = self._Thread_Pool.map( func=function, iterable=args_iter, chunksize=chunksize) __exception = None __process_run_successful = True except Exception as e: __exception = e __process_run_successful = False # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def async_map(self, function: Callable, args_iter: IterableType = (), chunksize: int = None, callback: Callable = None, error_callback: Callable = None) -> None: self.reset_result() __map_result = self._Thread_Pool.map_async( func=function, iterable=args_iter, chunksize=chunksize, callback=callback, error_callback=error_callback) __process_running_result = __map_result.get() __process_run_successful = __map_result.successful() # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def map_by_args(self, function: Callable, args_iter: IterableType[IterableType] = (), chunksize: int = None) -> None: self.reset_result() __process_running_result = None try: __process_running_result = self._Thread_Pool.starmap( func=function, iterable=args_iter, chunksize=chunksize) __exception = None __process_run_successful = True except Exception as e: __exception = e __process_run_successful = False # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def async_map_by_args(self, function: Callable, args_iter: IterableType[IterableType] = (), chunksize: int = None, callback: Callable = None, error_callback: Callable = None) -> None: self.reset_result() __map_result = self._Thread_Pool.starmap_async( func=function, iterable=args_iter, chunksize=chunksize, callback=callback, error_callback=error_callback) __process_running_result = __map_result.get() __process_run_successful = __map_result.successful() # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def imap(self, function: Callable, args_iter: IterableType = (), chunksize: int = 1) -> None: self.reset_result() __process_running_result = None try: imap_running_result = self._Thread_Pool.imap(func=function, iterable=args_iter, chunksize=chunksize) __process_running_result = [ result for result in imap_running_result ] __exception = None __process_run_successful = True except Exception as e: __exception = e 
__process_run_successful = False # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def imap_unordered(self, function: Callable, args_iter: IterableType = (), chunksize: int = 1) -> None: self.reset_result() __process_running_result = None try: imap_running_result = self._Thread_Pool.imap_unordered( func=function, iterable=args_iter, chunksize=chunksize) __process_running_result = [ result for result in imap_running_result ] __exception = None __process_run_successful = True except Exception as e: __exception = e __process_run_successful = False # Save Running result state and Running result value as dict for __result in (__process_running_result or []): self._result_saving(successful=__process_run_successful, result=__result, exception=None) def _result_saving(self, successful: bool, result: List, exception: Exception) -> None: _thread_result = { "successful": successful, "result": result, "exception": exception } # Saving value into list self._Thread_Running_Result.append(_thread_result) def close(self) -> None: self._Thread_Pool.close() self._Thread_Pool.join() def terminal(self) -> None: self._Thread_Pool.terminate() def get_result(self) -> List[_ConcurrentResult]: return self.result() def _saving_process(self) -> List[_ThreadPoolResult]: _pool_results = [] for __result in self._Thread_Running_Result: _pool_result = _ThreadPoolResult() _pool_result.is_successful = __result["successful"] _pool_result.data = __result["result"] _pool_results.append(_pool_result) return _pool_results
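The strategy methods above are thin wrappers around multiprocessing.pool.ThreadPool; a minimal standalone sketch of the underlying calls they delegate to (independent of the strategy and result classes, which are not shown here):

from multiprocessing.pool import ThreadPool

def work(x):
    if x < 0:
        raise ValueError("negative input")
    return x * 2

pool = ThreadPool(processes=4)

# apply_async plus get()/successful(), as used by async_apply above
async_results = [pool.apply_async(work, (i,)) for i in range(3)]
for res in async_results:
    try:
        print(res.get(), res.successful())
    except ValueError as exc:
        print("task failed:", exc)

# starmap and imap, as used by map_by_args and imap above
print(pool.starmap(work, [(1,), (2,), (3,)]))
print(list(pool.imap(work, range(3), chunksize=1)))

pool.close()
pool.join()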
def get_schools_public(self, iterable): pool = ThreadPool(64) list(pool.imap(self.handler_schools_public, iterable))
def test_iter(self): yielder = Yielder() pool = ThreadPool(processes=10) pool.imap(func=print_sample, iterable=iter(yielder.sample_gen, None), chunksize=1) pool.close() pool.join()
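The test above feeds imap from a sentinel-terminated iterator via the two-argument form iter(callable, sentinel); note that join() requires close() to have been called first. A standalone sketch of the same pattern with a plain queue, assuming None marks the end of input:

from multiprocessing.pool import ThreadPool
from queue import Queue

q = Queue()
for item in ("a", "b", "c", None):   # None is the stop sentinel
    q.put(item)

pool = ThreadPool(processes=2)
# iter(q.get, None) keeps calling q.get() until it returns the sentinel,
# so the pool keeps pulling work until None arrives
results = list(pool.imap(str.upper, iter(q.get, None), chunksize=1))
pool.close()
pool.join()
print(results)   # ['A', 'B', 'C']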
def fast_accuracy(vocab, syn0, questions_file, restrict=100000, logger=logging): from multiprocessing.pool import ThreadPool pool = ThreadPool() ok_vocab = nlargest(restrict, vocab.iteritems(), key=lambda (_, item): item.count) ok_vocab.sort(key=lambda (_, item): item.index) ok_proj_vocab = dict((word, proj_idx) for proj_idx, (word, _) in enumerate(ok_vocab)) ok_syn0 = syn0[[item.index for _, item in ok_vocab]] # normalize for i in xrange(ok_syn0.shape[0]): ok_syn0[i] /= np.sqrt(np.sum(ok_syn0[i]**2)) questions = [] with open(questions_file) as fin: cur_section = None for line_no, line in enumerate(fin): if line.startswith(': '): cur_section = line.lstrip(': ').strip() else: if cur_section is None: raise ValueError('Missing section header') try: # TODO assumes vocabulary preprocessing uses lowercase, too... wa, wb, wc, wexpected = [word.lower() for word in line.split()] except: logger.info("skipping invalid line #%i in %s" % (line_no, questions_file)) continue try: a = ok_proj_vocab[wa] b = ok_proj_vocab[wb] c = ok_proj_vocab[wc] expected = ok_proj_vocab[wexpected] except KeyError: logger.debug("skipping line #%i with OOV words: %s" % (line_no, line)) continue questions.append((cur_section, a, b, c, expected)) def check(question): section, a, b, c, expected = question ignore = set([a, b, c]) mean = np.zeros_like(syn0[0]) for weight, idx in [(-1, a), (1, b), (1, c)]: mean += weight * ok_syn0[idx] mean /= np.sqrt(np.sum(mean**2)) dists = np.dot(ok_syn0, mean) correct = False for proj_idx in np.argsort(dists)[::-1]: if proj_idx not in ignore: if proj_idx == expected: correct = True break return section, correct def log_section((section, correct, all_qs)): logger.info("%s: %.1f%% (%i/%i)", section, 100. * correct / all_qs, correct, all_qs) summary = [] for section, answers in groupby(pool.imap(check, questions), key=itemgetter(0)): answers = list(answers) correct = sum(answer for _, answer in answers) all_qs = len(answers) summary.append((section, correct, all_qs)) log_section(summary[-1]) total_correct = sum(t[1] for t in summary) total_all_qs = sum(t[2] for t in summary) summary.append(('total', total_correct, total_all_qs)) log_section(summary[-1]) return summary
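A condensed, self-contained sketch of the vector-analogy scoring that check() performs above, using toy unit-normalised vectors rather than real embeddings (the names and data here are illustrative only):

import numpy as np

# toy unit-normalised embedding matrix: one row per vocabulary word
vectors = np.random.rand(5, 8).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

def analogy_hit(a, b, c, expected):
    # "a is to b as c is to ?": combine -a + b + c and renormalise
    mean = -vectors[a] + vectors[b] + vectors[c]
    mean /= np.linalg.norm(mean)
    sims = vectors @ mean                    # cosine similarity per word
    for idx in np.argsort(sims)[::-1]:       # best match first
        if idx not in (a, b, c):             # skip the query words themselves
            return idx == expected
    return False

print(analogy_hit(0, 1, 2, 3))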
def get_all_links_private(self): pool = ThreadPool(64) list(pool.imap(self.handler_all_links_public, self.__private))
def get_data_private(self): pool = ThreadPool(64) list(pool.imap(self.handler_data_private, self.__schools_links_private))
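The three helpers above each build a 64-thread pool and never close it; if the pools are not reused elsewhere, a context-manager form releases the threads as soon as the results are drained. A sketch with a hypothetical handler (Pool.__exit__ calls terminate(), which is safe here because list() has already consumed every imap result):

from multiprocessing.pool import ThreadPool

def handler(item):
    # hypothetical per-item work
    return len(str(item))

with ThreadPool(64) as pool:
    list(pool.imap(handler, range(10)))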
class Pipeline(object): """ A pipeline of multiple processors to process S3 objects. """ def __init__(self, access_key=None, secret_key=None, dry_run=False, threads=None): self._pipeline = [] self.access_key = access_key self.secret_key = secret_key self.dry_run = dry_run self.threads = threads self.pool = ThreadPool(threads) self.thread_local_buckets = threading.local() def append(self, analyser, pattern, ignore_case=True): if ignore_case: pattern = re.compile(pattern, flags=re.IGNORECASE) else: pattern = re.compile(pattern) self._pipeline.append((pattern, analyser)) def analyse(self, pattern, ignore_case=True): def decorator(func): self.append(DecoratorAnalyser(func.__name__, func), pattern, ignore_case) return func return decorator def connect_s3(self): if self.access_key is not None and self.secret_key is not None: return boto.connect_s3(aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key) else: return boto.connect_s3() def get_bucket(self, name): if getattr(self.thread_local_buckets, name, None) is None: logging.debug('Creating new connection to S3 from thread %s', threading.current_thread()) conn = self.connect_s3() bucket = conn.get_bucket(name) setattr(self.thread_local_buckets, name, bucket) return getattr(self.thread_local_buckets, name) def run(self, bucket, prefix='', show_progress=True): self.pre_run() bucket = self.get_bucket(bucket) keys = bucket.list(prefix) chunk_size = self.threads if self.threads is not None else cpu_count() it = self.pool.imap(self.analyse_key, keys, chunksize=chunk_size) if show_progress: list(progress.dots(it, label='Analysing bucket "%s"' % bucket.name)) else: list(it) self.post_run() def pre_run(self): for _, analyser in self._pipeline: analyser.start() def post_run(self): for _, analyser in self._pipeline: analyser.finish() def analyse_key(self, key): bucket = self.get_bucket(key.bucket.name) for pattern, analyser in self._pipeline: if pattern.match(key.key): # update key metadata, since the last analyser might have already modified it key = bucket.get_key(key.key) analyser.analyse(key, dry_run=self.dry_run)
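A usage sketch for the pipeline above; the credentials, bucket name, pattern, and analyser body are placeholders, and it assumes DecoratorAnalyser forwards each matching key (plus the dry_run flag) to the wrapped function:

pipeline = Pipeline(access_key='AKIA...', secret_key='...', dry_run=True, threads=8)

@pipeline.analyse(r'\.log$')
def report_large_logs(key, dry_run=False):
    # hypothetical analyser: report log objects larger than 10 MB
    if key.size > 10 * 1024 * 1024:
        print(key.key, key.size)

pipeline.run('example-bucket', prefix='2015/')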
class Mininote(object): """Provides access to the Evernote 'database'.""" def __init__(self, token, notebook_guid=None): """ :param str token: The Evernote auth token :param str notebook_guid: The Evernote notebook GUID or None if not known """ client = EvernoteClient(token=token, consumer_key=EVERNOTE_CONSUMER_KEY, consumer_secret=EVERNOTE_CONSUMER_SECRET, sandbox=DEVELOPMENT_MODE) self._token = token self._note_store_uri = client.get_user_store().getNoteStoreUrl() self._thread_pool = ThreadPool(processes=EVERNOTE_FETCH_THREADS) self.notebook_guid = notebook_guid or self._get_create_notebook() def _note_store(self): """ :returns: new NoteStore instance """ return Store(self._token, NoteStore.Client, self._note_store_uri) def add_note(self, note): """ :param Note note: The mininote Note instance """ logger.debug('add note: {}'.format(note.text)) self._note_store().createNote(convert_to_enote(note, self.notebook_guid)) def search(self, string): """ :param str string: The Evernote search query string :returns: An iterator to retrieve notes """ def get_page(start): result_spec = NotesMetadataResultSpec(includeTitle=True, includeCreated=True, includeUpdated=True, includeContentLength=True) return self._note_store().findNotesMetadata(note_filter, start, EVERNOTE_MAX_PAGE_SIZE, result_spec) def iter_note_metadata(note_filter): i = 0 while True: time0 = time.time() page = get_page(i) logger.debug('Page fetch time: {}'.format(time.time() - time0)) for note_metadata in page.notes: yield note_metadata i += len(page.notes) if i >= page.totalNotes: break def fetch_note(note_metadata): if note_metadata.contentLength > CONTENT_FETCH_THRESHOLD: note = self._note_store().getNote(note_metadata.guid, True, False, False, False) else: note = None return convert_to_mininote(note_metadata, note) note_filter = NoteFilter(words=string, ascending=True, order=NoteSortOrder.UPDATED, notebookGuid=self.notebook_guid) return self._thread_pool.imap(fetch_note, iter_note_metadata(note_filter)) def update_note(self, note): """ :param Note note: The mininote Note instance """ logger.debug('update_note: {}'.format(note)) self._note_store().updateNote(convert_to_enote(note, self.notebook_guid)) def delete_note(self, note): """ :param Note note: The mininote Note instance """ logger.debug('delete note: {}'.format(note)) self._note_store().deleteNote(note.guid) def _get_create_notebook(self): """ Get or create the Evernote notebook. :returns: Notebook guid """ for notebook in self._note_store().listNotebooks(): if notebook.name == EVERNOTE_NOTEBOOK: return notebook.guid return self._note_store() \ .createNotebook(Notebook(name=EVERNOTE_NOTEBOOK)) \ .guid
def parallel(func, source, chunksize=0, numcpus=multiprocessing.cpu_count()): if chunksize: source = chunk(source, chunksize) p = ThreadPool(numcpus) for i in p.imap(func, source): yield i
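A usage sketch for the generator above. Without a chunksize each item is passed straight to func; chunk() is not shown here, but since the source is wrapped by chunk(source, chunksize), passing a chunksize presumably means func receives whole chunks instead of single items:

def square(x):
    return x * x

# results stream back in input order as the pool produces them, because the
# caller consumes pool.imap lazily inside parallel()
for value in parallel(square, range(10), numcpus=4):
    print(value)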
class ParallelFileWriter(object): def __init__(self, fileobj, compresslevel=9, n_threads=1): self.fileobj = fileobj self.compresslevel = compresslevel self.n_threads = n_threads # Initialize file state self.size = 0 self._init_state() self._write_header() # Parallel initialization self.buffers = [] self.buffer_length = 0 self.pool = ThreadPool(n_threads) self.compress_queue = Queue(maxsize=n_threads) self._consumer_thread = threading.Thread(target=self._consumer) self._consumer_thread.daemon = True self._consumer_thread.start() def tell(self): return self.size def write(self, data): if not isinstance(data, bytes): data = memoryview(data) n = len(data) if n > 0: self._per_buffer_op(data) self.size += n self.buffer_length += n self.buffers.append(data) if self.buffer_length > self._block_size: self.compress_queue.put(self.buffers) self.buffers = [] self.buffer_length = 0 return n def _consumer(self): with closing(self.pool): for buffers in self.pool.imap(self._compress, iter(self.compress_queue.get, None)): for buf in buffers: if len(buf): self.fileobj.write(buf) def _compress(self, in_bufs): out_bufs = [] compressor = self._new_compressor() for data in in_bufs: out_bufs.append(compressor.compress(data)) out_bufs.append(self._flush_compressor(compressor)) return out_bufs def close(self): if self.fileobj is None: return # Flush any waiting buffers if self.buffers: self.compress_queue.put(self.buffers) # Wait for all work to finish self.compress_queue.put(None) self._consumer_thread.join() # Write the closing bytes self._write_footer() # Flush fileobj self.fileobj.flush() # Cache shutdown state self.compress_queue = None self.pool = None self.fileobj = None
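The writer above leans on subclass hooks (_init_state, _write_header, _write_footer, _block_size, _new_compressor, _flush_compressor, _per_buffer_op) that are not shown. A stripped-down standalone sketch of the same producer/consumer shape, where a consumer thread streams queued chunks through ThreadPool.imap so compression runs in parallel while output order is preserved (the independent per-chunk zlib streams here are illustrative, not a real file format):

import threading
import zlib
from contextlib import closing
from multiprocessing.pool import ThreadPool
from queue import Queue

out = open('example.bin', 'wb')      # placeholder destination file
work_queue = Queue(maxsize=4)
pool = ThreadPool(4)

def compress(chunk):
    # each chunk becomes an independent zlib stream (illustrative only)
    return zlib.compress(chunk, 9)

def consumer():
    # imap yields results in submission order, so chunks hit the file in the
    # order they were queued even though they are compressed concurrently
    with closing(pool):
        for buf in pool.imap(compress, iter(work_queue.get, None)):
            out.write(buf)

thread = threading.Thread(target=consumer, daemon=True)
thread.start()

for _ in range(8):
    work_queue.put(b'x' * 65536)     # producer side: enqueue raw chunks
work_queue.put(None)                 # sentinel: no more data
thread.join()
out.close()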
class SiteCheckProcessManager(Thread, SiteCheckerController): MEM_MINIMUM_REQ = 100 def __init__(self, job_name: str="", input_Q:multiprocessing.Queue=None, max_procss=4, concurrent_page=1, page_max_level=10, max_page_per_site=1000, output_delegate=None, memory_limit_per_process=100, **kwargs): """ :param job_name: :param input_Q: :param max_procss: :param concurrent_page: :param page_max_level: :param max_page_per_site: :param output_delegate: :param memory_limit_per_process: if value is less than 100, throw ValueException :param kwargs: :return: """ Thread.__init__(self) #FeedbackInterface.__init__(**kwargs) #super(SiteCheckProcessManager, self).__init__(**kwargs) #self.process_queue = multiprocessing.Queue() self.name = job_name if max_procss <= 0: max_procss = 1 self.max_prcess = max_procss if input_Q is None: self.inputQueue = multiprocessing.Queue() else: self.inputQueue = input_Q self.outputQueue = multiprocessing.Queue() self._whoisQueue = multiprocessing.Queue() #self.output_lock = threading.RLock() #self.tempList = site_list # if there is a need to add new sites during scripting, add to this list self.processPrfix = "Process-" self.threadPrfix = "Thread-" self.page_max_level = page_max_level self.max_page_per_site = max_page_per_site if output_delegate is None: self.output_delegate = self.default_delegate else: self.output_delegate = output_delegate # delegate of type f(x:OnSiteLink) self.stop_event = multiprocessing.Event() self.finished = False self.pool = ThreadPool(processes=self.max_prcess) #self.pool = multiprocessing.Pool(processes=self.max_prcess) self.output_thread = None self.job_all = 0 self.job_done = 0 self.job_waiting = 0 self.total_page_done = 0 self.page_per_sec = 0 # need to do this self.average_page_per_site = 0 self.patch_limit = self.max_prcess self.temp_results = [] self.site_info = [] # collect site info after the job done self.db_trash_list = [] self.concurrent_page = concurrent_page self.continue_lock = threading.RLock() self.db_trash_lock = threading.RLock() self.state_lock = threading.RLock() self.temp_result_lock = threading.RLock() self.site_info_lock = threading.RLock() if memory_limit_per_process < SiteCheckProcessManager.MEM_MINIMUM_REQ: ex = ValueError("minimum memory requirement to run the crawler is 100 MB, otherwise too many memory control looping.") msg = "error in SiteCheckProcessManager.__init__(), with database: " + job_name ErrorLogger.log_error("SiteCheckProcessManager", ex, msg) self.memory_limit_per_process = memory_limit_per_process self.whois_process = None self.whois_queue_process = Process(target=run_queue_server) #self.input_iter = SiteInputIter(self.inputQueue, self, self.concurrent_page, self.page_max_level, # self.max_page_per_site, self.outputQueue, self.process_site_info) self.input_iter = SiteInputIter(self.inputQueue, func=site_check_process, external_stop=self.stop_event) def _create_all_file_dirs(self): try: FileHandler.create_file_if_not_exist(get_log_dir()) FileHandler.create_file_if_not_exist(get_recovery_dir_path()) FileHandler.create_file_if_not_exist(get_temp_db_dir()) FileHandler.create_file_if_not_exist(get_task_backup_dir()) FileHandler.create_file_if_not_exist(get_db_buffer_default_dir()) except Exception as ex: ErrorLogger.log_error("SiteCheckProcessManager", ex, "_create_all_file_dirs()") def clear_cache(self): try: FileHandler.clear_dir(get_log_dir()) FileHandler.clear_dir(get_recovery_dir_path()) FileHandler.clear_dir(get_temp_db_dir()) FileHandler.clear_dir(get_task_backup_dir()) 
FileHandler.clear_dir(get_db_buffer_default_dir()) except Exception as ex: ErrorLogger.log_error("SiteCheckProcessManager", ex, "clear_cache()") def set_system_limit(self): try: os.system('sudo -s') os.system('ulimit -n 204800') # os.system('ulimit -s 1024') except Exception as ex: print(ex) def get_temp_result_count(self): #with self.temp_result_lock: return len(self.temp_results) def get_temp_result_and_clear(self) -> []: with self.temp_result_lock: copied = self.temp_results.copy() self.temp_results.clear() return copied def default_delegate(self, result): with self.temp_result_lock: if isinstance(result, OnSiteLink): self.temp_results.append(result) # make no difference #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ]) elif isinstance(result, str): self.temp_results.append(result) elif isinstance(result, tuple) and len(result) == 2: temp = OnSiteLink(result[0], result[1]) print("new domain:", temp) self.temp_results.append(temp) else: pass def get_state(self) -> SiteCheckProcessState: print("get state from slave crawler") with self.state_lock: state = SiteCheckProcessState(self.job_all, self.job_done, self.job_waiting, self.total_page_done, self.average_page_per_site, self.get_temp_result_count()) print("get state from slave crawler finished") return state def get_filter_progress(self): if isinstance(self.whois_process, MemoryControlPs): state = self.whois_process.get_last_state() if isinstance(state, WhoisCheckerState): return state.progress_count, state.data_total else: return 0, 0 else: return 0, 0 def clear_trash(self): # run with a thread while not self.stop_event.is_set(): with self.db_trash_lock: removed_list = [] trash_len = len(self.db_trash_list) if trash_len > 0: for item in self.db_trash_list: if TempDBInterface.force_clear(item): #print("removed trash:", item) removed_list.append(item) for removed_item in removed_list: self.db_trash_list.remove(removed_item) CsvLogger.log_to_file("job_finished", [(x, str(datetime.datetime.now())) for x in removed_list], get_task_backup_dir()) removed_list.clear() time.sleep(2) def put_to_input_queue(self, data: []): if data is not None: for item in data: self.inputQueue.put(item) self.job_all += 1 def get_site_info_list_and_clear(self): with self.site_info_lock: copied = self.site_info.copy() self.site_info.clear() return copied def get_site_info_list_count(self): return len(self.site_info) def process_site_info(self, site_info): if site_info is not None: with self.site_info_lock: PrintLogger.print("finished site info: " + str(site_info.__dict__)) self.site_info.append(site_info) def process_feedback(self, feedback: SiteFeedback): self.add_page_done(feedback.page_done) if feedback.finished: # print("should process feedback!") self.site_finished() self.process_site_info(feedback.seed_feedback) with self.db_trash_lock: self.db_trash_list.append(feedback.datasource_ref) self.db_trash_list.append(feedback.datasource_ref+".ext.db") def add_page_done(self, number_page_done: int): # make sure it is thread safe with self.state_lock: self.total_page_done += number_page_done time.sleep(0.001) def site_finished(self): # print("one more site done") with self.state_lock: self.job_done += 1 self.average_page_per_site = self.total_page_done/self.job_done time.sleep(0.001) def set_stop(self): self.stop_event.set() def can_continue(self): return not self.stop_event.is_set() def checking_whois(self): optinmal = self.max_prcess * self.concurrent_page/5 if optinmal < 10: worker_number = 10 else: worker_number = int(optinmal) 
mem_limit = self.memory_limit_per_process/2 if mem_limit < 200: mem_limit = 200 self.whois_process = MemoryControlPs(whois_process, func_kwargs=WhoisChecker.get_input_parameters(self._whoisQueue, self.outputQueue, self.stop_event, worker_number), mem_limit=mem_limit, external_stop_event=self.stop_event) self.whois_process.start() def queue_failure_reset(self): manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output) return self.outputQueue def run(self): # self.set_system_limit() self._create_all_file_dirs() self.whois_queue_process.start() whois_thread = Thread(target=self.checking_whois) trash_clean_thread = Thread(target=self.clear_trash) manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output) # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue, # delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset) self.output_thread = outputThread(threadID=0, name=self.threadPrfix+"Output", stop_event=self.stop_event, inputQ=self.outputQueue, delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset) self.output_thread.start() trash_clean_thread.start() whois_thread.start() # self.whois_queue_process.start() self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(full_link="", # this parameter will be updated in self.input_iter max_page=self.max_page_per_site, max_level=self.page_max_level, output_queue=self._whoisQueue, pool_size=self.concurrent_page) self.input_iter.callback = self.process_feedback self.input_iter.Memlimit = self.memory_limit_per_process try: #print("monitor process started: pid: ", os.getpid()) self.pool.imap(site_check_process_iter, self.input_iter, 1) #self.pool.imap_unordered(site_check_process_iter, self.input_iter) while self.can_continue(): time.sleep(0.5) except Exception as ex: msg = "run(), with database: " + self.name ErrorLogger.log_error("SiteCheckProcessManager", ex, msg) finally: print("terminate miner!") self.pool.terminate() whois_thread.join() self.whois_queue_process.terminate() self.temp_results.clear() self.site_info.clear() self.finished = True
def download_sequence(output_folder, mpy_token, sequence, username, c, nb_sequences): global _DOWNLOAD_SEQUENCE_SIZE global _DOWNLOAD_TOTAL_SIZE subfolder_enabled = SUBFOLDER if DEBUG >= 3: print(" Prepare sequence download") if DEBUG >= 4: pprint(sequence) sequence_name = ( sequence["properties"]["captured_at"] + "_" + sequence["properties"]["created_at"] ) if os.name == "nt": sequence_name = sequence_name.replace(":", "_") sequence_day = sequence_name.split("T")[0] sorted_folder = output_folder + "/" + sequence_day if subfolder_enabled == 1: subfolder = sequence["properties"]["captured_at"] if os.name == "nt": subfolder = subfolder.replace(":", "_") sorted_folder = sorted_folder + "/" + subfolder download_list = [] os.makedirs(sorted_folder, exist_ok=True) # First pass on image_keys : sorts which one needs downloading image_keys = sequence["properties"]["coordinateProperties"]["image_keys"] for image_index, image_key in enumerate(image_keys, 1): sorted_path = ( sorted_folder + "/" + sequence_name + "_" + "%04d" % image_index + ".jpg" ) if not os.path.exists(sorted_path): download_list.append(image_key) elif os.stat(sorted_path).st_size == 0: download_list.append(image_key) if not download_list: if DEBUG >= 2: print(" Sequence %r already fully downloaded" % sequence_name) return 0, 0 already_downloaded = len(image_keys) - len(download_list) if already_downloaded: if DEBUG >= 1: print(" Already downloaded: %d/%d" % (already_downloaded, len(image_keys))) if DRY_RUN: return 1, len(download_list) # Third pass, download if entry is found in dict sequence_dl_retries = 0 update_urls = True while download_list and not sequence_dl_retries >= SEQUENCE_DL_MAX_RETRIES: if update_urls: source_urls = get_source_urls(download_list, mpy_token, username) update_urls = False sequence_dl_retries += 1 # show only on a retry if sequence_dl_retries > 1 and DEBUG >= 1: print("sequence download retries: %s/%s" % (sequence_dl_retries, SEQUENCE_DL_MAX_RETRIES)) if len(download_list) > len(source_urls): print( " Missing %d/%d images, will refresh and retry later" % (len(download_list) - len(source_urls), len(download_list)) ) # if we get nothing wait a little bit if len(download_list) - len(source_urls) == len(download_list): if DEBUG >= 1: print(" Wait a second due long missing source list") time.sleep(2) sequence_dl_retries -= 1 # refresh list after this pass update_urls = True pool = ThreadPool(NUM_THREADS) pool_args = [] for image_index, image_key in enumerate(image_keys, 1): if image_key in download_list: sorted_path = ( sorted_folder + "/" + sequence_name + "_" + "%04d" % image_index + ".jpg" ) if image_key in source_urls: source_url = source_urls[image_key] pool_args.append((image_key, sorted_path, source_url)) if DEBUG >= 3: print(" Filling download pool done") try: for i, image_key in enumerate(pool.imap(download_file, pool_args), 1): if image_key: download_list.remove(image_key) print( " Downloading images #%03d out of %03d round: %d" % (i, len(pool_args), sequence_dl_retries), end="\r", flush=True, ) except SSLException as e: print(e) except DownloadException as e: print(e) except URLExpireException as e: print(e) sequence_dl_retries -= 1 # refresh urls update_urls = True finally: pool.terminate() pool.join() print(" Done sequence %r (%d/%d) %3.1f MB, camera: %s" % (sequence_name, c, nb_sequences, _DOWNLOAD_SEQUENCE_SIZE/1024/1024, sequence["properties"]["camera_make"]), flush=True) _DOWNLOAD_TOTAL_SIZE += _DOWNLOAD_SEQUENCE_SIZE _DOWNLOAD_SEQUENCE_SIZE = 0 return 1, len(source_urls)