def get_context_data(self, **kwargs):
    context = super(HomePageView, self).get_context_data(**kwargs)
    context['courses_slides'] = chunked(context['homepage'].promoted_courses.all(), 3)
    context['menthors_slides'] = chunked(context['homepage'].promoted_menthors.all(), 3)
    context['promoted_portfolios'] = Portfolio.objects.filter(
        home_published=True, status='published').order_by('-timestamp')[:8]
    return context
def group_by_magnitude(collection):
    alen = len(collection)
    if alen > 1000:
        return chunked(collection, 100)
    if alen > 100:
        return chunked(collection, 10)
    return [collection]
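# Hedged illustration (not part of the snippet above): the chunk sizes that
# group_by_magnitude's thresholds correspond to, shown with chunked directly.
from more_itertools import chunked

medium = list(range(500))    # 101..1000 items -> chunks of 10
assert all(len(c) == 10 for c in chunked(medium, 10))
large = list(range(5000))    # more than 1000 items -> chunks of 100
assert all(len(c) == 100 for c in chunked(large, 100))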
def _iter_cores(cores, ncontainer):
    full_cores, part_cores = cores.get('full', []), cores.get('part', [])
    if not (full_cores or part_cores):
        return (([], []) for _ in range(ncontainer))

    return izip_longest(
        chunked(full_cores, len(full_cores)/ncontainer),
        chunked(part_cores, len(part_cores)/ncontainer),
        fillvalue=[]
    )
def chunked_join(iterable, int1, int2, str1, str2, func):
    """Chunk and join."""
    chunks = list(chunked(iterable, int1))
    logging.debug(chunks)
    groups = [list(chunked(chunk, int2)) for chunk in chunks]
    logging.debug(groups)
    return str1.join([
        str2.join([func(''.join(chunk)) for chunk in chunks])
        for chunks in groups
    ])
def parse_obj(self, obj, dtype):
    dic = OD((('type', 'Feature'), ('geometry', OD()), ('properties', OD())))
    dic['properties']['class'] = dtype
    for child in obj:
        ctag = self.clip_tag(child.tag)
        if ctag in ['pos', 'area', 'loc']:
            if ctag == 'area':
                dic['geometry']['type'] = 'Polygon'
                dic['geometry']['coordinates'] = self.get_polygon_coord(child)
            else:
                if ctag == 'pos':
                    dic['geometry']['type'] = 'Point'
                elif ctag == 'loc':
                    dic['geometry']['type'] = 'LineString'
                i = ""
                for l in child.itertext():
                    i += l
                l = list(chunked(i.strip().split(), 2))
                i = [[float(xy[1]), float(xy[0])] for xy in l]
                if len(i) == 1:
                    dic['geometry']['coordinates'] = i[0]
                else:
                    dic['geometry']['coordinates'] = i
        elif not child.text.strip() == '':
            dic['properties'][ctag] = child.text
        else:
            i = ''
            for l in child.itertext():
                i += l
            dic['properties'][ctag] = i.strip()
    dic = self.chk_types(dic)
    return dic
def __init__(self, recs):
    self.argslist = []  # TODO make these separate nodes
    rec_values = (rec.value for rec in recs)
    for name, value, type_ in chunked(rec_values, 3):
        self.argslist.append((name, value, type_))
def decl(self):
    logging.debug(_('args: %s'), self.args)
    args = self.args.strip().replace('__user ', '').split(',')
    logging.debug(_('args: %s'), args)
    args = [''.join(pair) for pair in chunked(args, 2)]
    return 'long {}({});'.format(
        self.name.strip(),
        ', '.join(args))
def cooccurrence(
    corpus,
    execnet_hub,
    targets,
    context,
    paths_progress_iter,
    output=('o', 'space.h5', 'The output space file.'),
):
    """Build the co-occurrence matrix."""
    if targets.index.nlevels > 1:
        targets.sortlevel(inplace=True)
    if context.index.nlevels > 1:
        context.sortlevel(inplace=True)

    def init(channel):
        channel.send(
            (
                'data',
                pickle.dumps(
                    {
                        'kwargs': {
                            'targets': targets,
                            'context': context,
                        },
                        'instance': corpus,
                        'folder_name': 'cooccurrence',
                    },
                )
            )
        )

    results = execnet_hub.run(
        remote_func=sum_folder,
        iterable=paths_progress_iter,
        init_func=init,
    )

    results = ([r] for r in results if r is not None)
    result = next(results)[0]

    for i, chunk in enumerate(chunked(results, 100)):
        logger.info('Received result chunk #%s.', i)
        chunked_result = [c[0] for c in chunk]

        with Timer() as timed:
            result = pd.concat(
                chunked_result + [result],
                copy=False,
            ).groupby(level=result.index.names).sum()

        logger.info(
            'Computed the result by merging a chunk of received results and the result in %.2f seconds.',
            timed.elapsed,
        )

    result = result.to_frame('count')
    result.reset_index(inplace=True)

    write_space(output, context, targets, result)
def add_to_spotify(db, spotify, album, original_artist, original_album):
    album = spotify.album(album["uri"])
    tracks = album["tracks"]
    track_ids = [t["uri"] for t in tracks["items"]]
    while tracks["next"]:
        tracks = spotify.next(tracks)
        track_ids.extend(t["uri"] for t in tracks["items"])

    click.echo("Adding {0} tracks to Spotify...".format(len(track_ids)))
    for chunk in chunked(track_ids, 50):
        response = spotify.current_user_saved_tracks_add(chunk)
        if response is not None:
            click.secho("F**k, something broke:")
            pprint(response)
            click.confirm("Continue?", abort=True)
            return

    cursor = db.cursor()
    cursor.execute(
        """UPDATE collection SET complete = 1 WHERE artist = ? AND album = ?""",
        [original_artist, original_album],
    )
    db.commit()
    click.secho("Done ", fg="green", nl=False)
    time.sleep(0.25)
def start(experiment_description, agent, environment, results_descriptor):
    """Kick off the execution of an experiment."""
    initialize_results(results_descriptor)
    interval_results = islice(interval_results_generator(agent, environment, experiment_description),
                              experiment_description.num_steps)
    results_interval_chunks = chunked(interval_results, results_descriptor.interval)
    for chunk in results_interval_chunks:
        results = [interval_data.results for interval_data in chunk]
        write_results(merge_results(results), results_descriptor)
def parallelize_func(iterable, func, chunksz=1, n_jobs=16, *args, **kwargs):
    """ Parallelize a function over each element of an iterable. """
    chunker = func
    chunks = more_itertools.chunked(iterable, chunksz)
    chunks_results = Parallel(n_jobs=n_jobs, verbose=50)(
        delayed(chunker)(chunk, *args, **kwargs) for chunk in chunks)
    results = more_itertools.flatten(chunks_results)
    return list(results)
def get_random_logs(self, limit):
    count = min(limit, self.db.count())
    ids = self.db.find({}, {'_id': 1})
    rand_ids = [r['_id'] for r in random.sample(list(ids), count)]
    for rand_ids_chunk in chunked(rand_ids, 100):
        query = {'_id': {'$in': rand_ids_chunk}}
        for doc in self.db.find(query, {'message': 1}):
            yield doc['message']
def create_partials(self, product, branch, platform, locales, revision,
                    chunk_name=1):
    """Calculates "from" and "to" MAR URLs and calls create_task_graph().

    Currently "from" MAR is 2 releases behind to avoid duplication of
    existing CI partials.

    :param product: capitalized product name, AKA appName, e.g. Firefox
    :param branch: branch name (mozilla-central)
    :param platform: buildbot platform (linux, macosx64)
    :param locales: list of locales
    :param revision: revision of the "to" build
    :param chunk_name: chunk name
    """
    # TODO: move limit to config
    # Get last 5 releases (including current),
    # generate partial for 4 latest
    last_releases = self.balrog_client.get_releases(product, branch)[:5]
    release_to = last_releases.pop(0)
    per_chunk = 5
    for update_number, release_from in enumerate(last_releases, start=1):
        log.debug("From: %s", release_from)
        log.debug("To: %s", release_to)

        for n, chunk in enumerate(chunked(locales, per_chunk), start=1):
            extra = []
            for locale in chunk:
                try:
                    build_from = self.balrog_client.get_build(
                        release_from, platform, locale)
                    log.debug("Build from: %s", build_from)
                    build_to = self.balrog_client.get_build(
                        release_to, platform, locale)
                    log.debug("Build to: %s", build_to)
                    from_mar = build_from["completes"][0]["fileUrl"]
                    to_mar = build_to["completes"][0]["fileUrl"]
                    extra.append({
                        "locale": locale,
                        "from_mar": from_mar,
                        "to_mar": to_mar,
                    })
                except (requests.HTTPError, ValueError):
                    log.exception(
                        "Error getting build, skipping this scenario")

            if extra:
                if len(locales) > per_chunk:
                    # More than 1 chunk
                    subchunk = n
                else:
                    subchunk = None

                all_locales = [e["locale"] for e in extra]
                log.info("New Funsize task for %s", all_locales)
                self.submit_task_graph(
                    branch=branch, revision=revision, platform=platform,
                    update_number=update_number, chunk_name=chunk_name,
                    extra=extra, subchunk=subchunk)
            else:
                log.warn("Nothing to submit")
def c_layout(i, definition, template):
    c_name = layer_names[i]
    pretty_name = c_name.strip('_').capitalize()
    layout = d['layout']

    surround = lambda s: ''.join(interleave_longest(['│']*(len(s)+1), s))
    layer = list(map(uni, definition))
    layer[41] = layer[41].center(11)
    layer = chunked(layer, 12)
    rows = intersperse(mid, map(surround, layer))
    pretty = '\n'.join(itertools.chain([top], rows, [bottom]))

    surround = lambda s: ', '.join(s)
    layer = list(map(lambda k: layer_name.get(k, k), definition))
    layer = chunked(layer, 12)
    rows = map(surround, layer)
    c_layer = ',\n '.join(itertools.chain([], rows, []))

    return template.format(pretty_name, pretty, c_name, layout, c_layer)
def score(self, rev_ids, caches=None, cache=None):
    if isinstance(rev_ids, int):
        rev_ids = [rev_ids]

    batches = batch_rev_caches(chunked(rev_ids, self.batch_size), caches, cache)

    for batch_scores in self.scores_ex.map(self._score_batch, batches):
        for score in batch_scores:
            yield score
def main(args):
    # get the arguments
    method = args.method
    win_size = args.win_size
    step = args.step
    metric_name = args.metric_name
    n_jobs = args.workers

    # Load the data.
    L, H, olddf, newdf = pickle.load(open(args.filename))
    words = pd.Series(olddf.word.values.ravel()).unique()
    oldrows = []
    newrows = []

    sourcexrange = np.arange(args.mint, args.maxt, step)
    destxrange = np.arange(args.mint, args.maxt, step)
    if method == 'win':
        sourcexrange = sourcexrange[win_size:]
        destxrange = destxrange[:-win_size]

    if args.interpolate:
        sourcexinter = np.arange(sourcexrange[0], sourcexrange[-1] + 1, 1)
        destxinter = np.arange(destxrange[0], destxrange[-1] + 1, 1)
    else:
        sourcexinter = sourcexrange
        destxinter = destxrange

    # Construct the series
    assert(len(sourcexinter) == len(destxinter))
    chunk_sz = np.ceil(len(words)/float(n_jobs))
    words_chunks = more_itertools.chunked(words, chunk_sz)
    timeseries_chunks = Parallel(n_jobs=n_jobs, verbose=20)(
        delayed(process_chunk)(chunk, create_word_time_series, olddf, newdf,
                               sourcexinter, destxinter,
                               metric_name=metric_name,
                               interpolate=args.interpolate)
        for chunk in words_chunks)

    timeseries = list(more_itertools.flatten(timeseries_chunks))

    # Dump the data frame
    for orow, newrow in timeseries:
        if orow and newrow:
            oldrows.append(orow)
            newrows.append(newrow)

    oldtimeseries = pd.DataFrame()
    newtimeseries = pd.DataFrame()
    header = ['word']
    header.extend(sourcexinter)
    newheader = ['word']
    newheader.extend(destxinter)
    oldtimeseries = oldtimeseries.from_records(oldrows, columns=header)
    oldtimeseries = oldtimeseries.fillna(method='backfill', axis=1)
    newtimeseries = newtimeseries.from_records(newrows, columns=newheader)
    newtimeseries = newtimeseries.fillna(method='backfill', axis=1)
    oldtimeseries.to_csv(args.sourcetimef, encoding='utf-8')
    newtimeseries.to_csv(args.endtimef, encoding='utf-8')
def write_results(results, results_descriptor):
    """Output the given results to terminal and to file."""
    output_path = results_descriptor.output_path
    keys = results_descriptor.keys
    value_vectors = (results[key] for key in keys)
    rows = chunked(interleave(value_vectors), len(keys))
    string_rows = map(lambda v: ' '.join(str(x) for x in v), rows)
    all_string_rows = '\n'.join(string_row for string_row in string_rows)
    keys_string = ' '.join(key for key in keys)
    output_stdout(keys_string + '\n' + all_string_rows, output_path)
    output_file(all_string_rows, output_path)
def update_graphs(self):
    """Get data from shared mp array and append it to the graph if we are ready to do so"""
    if self.sync_event.is_set():
        if self.plots_are_reset:
            self.arrays_plots = {self.plots[ch]: chunked([n for n in self.np_array[i] if not np.isnan(n)], 50)
                                 for i, ch in enumerate(self.ch_num)
                                 if not np.isnan(self.np_array[i][0])}
            self.add_point_to_graph()
    else:
        qc.QTimer.singleShot(5, self.update_graphs)
def create(cls, network_id, cidr):
    network = cls(network_id, cidr)
    ip_network = network.cidr

    rds.set(cls._CIDR_KEY % network_id, cidr)
    rds.set(cls._NETWORK_ID_KEY % cidr, network_id)

    key = cls._NETWORK_IPS_KEY % network_id
    for ipnums in more_itertools.chunked(xrange(ip_network.first, ip_network.last+1), 500):
        rds.sadd(key, *ipnums)

    return network
def get_polygon_coord(self, obj):
    # get exterior coords
    coord = []
    i = ""
    ext = obj.find('.//gml:exterior', self.ns)
    for l in ext.itertext():
        i += l
    l = list(chunked(i.strip().split(), 2))
    coord.append([[float(xy[1]), float(xy[0])] for xy in l])
    # get interior coords
    inte = obj.findall('.//gml:interior', self.ns)
    if not inte:
        return coord
    else:
        for i in inte:
            j = ""
            for l in i.itertext():
                j += l
            l = list(chunked(j.strip().split(), 2))
            coord.append([[float(xy[1]), float(xy[0])] for xy in l])
        return coord
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=WrappedTextHelpFormatter,
        description=DESCRIPTION.strip(),
    )
    parser.add_argument(
        '--sleep',
        help='how long in seconds to sleep before submitting the next group',
        type=int,
        default=SLEEP_DEFAULT
    )
    parser.add_argument('--host', help='host for system to reprocess in', default=DEFAULT_HOST)
    parser.add_argument('crashid', help='one or more crash ids to fetch data for',
                        nargs='*', action=FallbackToPipeAction)

    if argv is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    api_token = os.environ.get('SOCORRO_REPROCESS_API_TOKEN')
    if not api_token:
        print('You need to set SOCORRO_REPROCESS_API_TOKEN in the environment')
        return 1

    url = args.host.rstrip('/') + '/api/Reprocessing/'
    print('Sending reprocessing requests to: %s' % url)
    session = session_with_retries()

    crash_ids = args.crashid
    print('Reprocessing %s crashes sleeping %s seconds between groups...' % (
        len(crash_ids), args.sleep
    ))

    groups = list(chunked(crash_ids, CHUNK_SIZE))
    for i, group in enumerate(groups):
        print('Processing group ending with %s ... (%s/%s)' % (
            group[-1], i + 1, len(groups)
        ))
        resp = session.post(
            url,
            data={'crash_ids': group},
            headers={
                'Auth-Token': api_token
            }
        )
        if resp.status_code != 200:
            print('Got back non-200 status code: %s %s' % (
                resp.status_code, resp.content
            ))
            continue

        # NOTE(willkg): We sleep here because the webapp has a bunch of rate limiting and we don't
        # want to trigger that. It'd be nice if we didn't have to do this.
        time.sleep(args.sleep)

    print('Done!')
def test_container_release_cores(test_db):
    a = App.get_or_create('app', 'http://git.hunantv.com/group/app.git')
    v = a.add_version(random_sha1())
    p = Pod.create('pod', 'pod', 10, -1)
    host = Host.create(p, random_ipv4(), random_string(), random_uuid(), 200, 0)

    for core in host.cores:
        assert core.host_id == host.id
        assert core.remain == 10

    containers = []

    cores = sorted(host.cores, key=operator.attrgetter('label'))
    for fcores, pcores in zip(chunked(cores[:100], 10), chunked(cores[100:], 10)):
        used_cores = {'full': fcores, 'part': pcores}
        host.occupy_cores(used_cores, 5)
        c = Container.create(random_sha1(), host, v, random_string(), 'entrypoint',
                             used_cores, 'env', nshare=5)
        containers.append(c)

    cores = sorted(host.cores, key=operator.attrgetter('label'))
    for fcores, pcores in zip(chunked(cores[:100], 10), chunked(cores[100:], 10)):
        for core in fcores:
            assert core.remain == 0
        for core in pcores:
            assert core.remain == 5

    for c in containers:
        c.delete()

    cores = sorted(host.cores, key=operator.attrgetter('label'))
    for fcores, pcores in zip(chunked(cores[:100], 10), chunked(cores[100:], 10)):
        for core in fcores:
            assert core.remain == 10
        for core in pcores:
            assert core.remain == 10
def fix_ip(n):
    network = n.network
    base = int(network.network_address)
    for ipnums in more_itertools.chunked(xrange(base+n.gateway_count, base+network.num_addresses), 500):
        rds.sadd(n.storekey, *ipnums)
    rds.sadd(n.gatekey, *range(base, base+n.gateway_count))
    for ip in n.ips.all():
        rds.srem(n.storekey, ip.ipnum)
    for gateway in n.gates.all():
        rds.srem(n.gatekey, gateway.ipnum)
def create_partials(self, product, branch, platform, locales, revision,
                    mar_urls, mar_signing_format):
    """Calculates "from" and "to" MAR URLs and calls create_task_graph().

    Currently "from" MAR is 2 releases behind to avoid duplication of
    existing CI partials.

    :param product: capitalized product name, AKA appName, e.g. Firefox
    :param branch: branch name (mozilla-central)
    :param platform: buildbot/taskcluster platform (linux, macosx64)
    :param locales: list of locales
    :param revision: revision of the "to" build
    :param mar_urls: dictionary of {locale:mar file url} for each locale
    """
    # TODO: move limit to config
    partial_limit = 4
    per_chunk = 5

    tasks = defaultdict(list)

    for locale in locales:
        to_mar = mar_urls.get(locale)
        log.info("Build to: %s", to_mar)
        latest_releases = self.get_builds(
            product, platform, branch, locale, to_mar, partial_limit)

        for update_number, build_from in enumerate(latest_releases, start=1):
            log.info("Build from: %s", build_from)
            try:
                from_mar = build_from['completes'][0]['fileUrl']
            except ValueError as excp:
                log.error("Unable to extract fileUrl from %s: %s",
                          build_from, excp)
                continue
            tasks[update_number].append({
                "locale": locale,
                "from_mar": from_mar,
                "to_mar": to_mar,
            })

    for update_number in tasks:
        for extra in chunked(tasks[update_number], per_chunk):
            all_locales = [e["locale"] for e in extra]
            log.info("New Funsize task for %s", all_locales)
            locale_desc = "_".join(all_locales)
            locale_desc = locale_desc.replace('-', '_')
            self.submit_task_graph(
                branch=branch, revision=revision, platform=platform,
                update_number=update_number, extra=extra,
                locale_desc=locale_desc,
                mar_signing_format=mar_signing_format)
def _score(self, context, model, rev_ids):
    logging.debug("Starting up thread pool with {0} workers"
                  .format(self.workers))
    with ThreadPoolExecutor(max_workers=self.workers) as executor:
        futures = []
        for rev_id_batch in chunked(rev_ids, self.batch_size):
            rev_id_batch = list(rev_id_batch)
            logging.debug("Starting batch of {0} revids"
                          .format(len(rev_id_batch)))
            futures.append(executor.submit(self._score_request, context,
                                           model, rev_id_batch))

        for future in futures:
            for score in future.result():
                yield score
def get(self, *args):
    self.set_header("Content-Type", "text/event-stream")
    north, south, east, west = map(float, args)
    start_t = datetime.now()
    query_range = r.polygon(r.point(west, north), r.point(west, south),
                            r.point(east, south), r.point(east, north))
    selection = r.table("streets").get_intersecting(query_range, index="geometry")
    initial_t = (datetime.now() - start_t).total_seconds()
    cursor = selection.map(r.row["geometry"].to_geojson()).run(self.conn)
    size = 0
    for chunk in chunked(cursor, 2000):
        size += len(chunk)
        self.write_event(chunk)
    self.write_event("done")
    total_t = (datetime.now() - start_t).total_seconds()
    print "street query took", initial_t, "s for the first batch",
    print "(", total_t, "s total) and provided", size, "results."
def on_reload() -> None:
    """Render the HTML pages."""
    env = Environment(
        loader=FileSystemLoader('.'),
        autoescape=select_autoescape(['html', 'xml']),
    )
    template = env.get_template('template.html')
    for page, chunk in enumerate(chunked(books, book_per_page), 1):  # noqa: WPS221
        rendered_page = template.render({
            'chunk': chunk,
            'pages': pages,
            'page': page,
        })
        with open(os.path.join('pages', f'index{page}.html'), 'w', encoding='utf8') as html_file:  # noqa: WPS221
            html_file.write(rendered_page)
def find_missing(self, num_workers, date):
    check_crashids_for_date = partial(check_crashids, date=date)

    missing = []
    entropy_chunked = chunked(self.get_entropy(), CHUNK_SIZE)
    if num_workers == 1:
        for result in map(check_crashids_for_date, entropy_chunked):
            missing.extend(result)
    else:
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=num_workers
        ) as executor:
            for result in executor.map(
                check_crashids_for_date, entropy_chunked, timeout=WORKER_TIMEOUT
            ):
                missing.extend(result)

    return list(missing)
def iter_examples(filename, stoi, window_size=5, batch_size=20):
    """ reads from file and generates batched tensor examples """
    # numericalize
    iter_ex = (numericalize_example(e, stoi) for e in iter_data(filename, window_size))
    # fill-value to pad contexts with
    fv = stoi[PAD_TOK]
    for example in mit.chunked(iter_ex, batch_size):
        # de-tuple
        words, contexts = zip(*example)
        # pad contexts
        contexts = list(zip(*it.zip_longest(*contexts, fillvalue=fv)))
        # create tensors
        word_tensor = torch.tensor(words)
        context_tensor = torch.tensor(contexts)
        yield word_tensor, context_tensor
def create(cls, name, netspace):
    """create network and store ips(int) under this network in redis"""
    try:
        n = cls(name, netspace)
        db.session.add(n)
        db.session.commit()

        # create sub IPs
        network = n.network
        base = int(network.network_address)
        # write them 500 at a time
        for ipnums in more_itertools.chunked(xrange(base+1, base+network.num_addresses), 500):
            rds.sadd(n.storekey, *ipnums)
        return n
    except sqlalchemy.exc.IntegrityError:
        db.session.rollback()
        return None
def load_test(ske, block, id_seperate, batch_size):
    '''
    To load and shuffle the test set.
    '''
    test_block = block[id_seperate == 3]
    test_ske = ske[id_seperate == 3]
    np.random.seed(52)
    state = np.random.get_state()
    np.random.shuffle(test_block)
    np.random.set_state(state)
    np.random.shuffle(test_ske)
    test_ske = test_ske.flatten()
    test_block = test_block.reshape(-1, 3)
    test_ske = list(chunked(test_ske, batch_size))
    return (test_ske, test_block)
def apply_parallel(data: List[Any], func: Callable) -> List[Any]:
    """
    Apply function to list of elements.

    Automatically determines the chunk size.
    """
    cpu_cores = cpu_count()

    try:
        chunk_size = ceil(len(data) / cpu_cores)
        pool = Pool(cpu_cores)
        transformed_data = pool.map(func, chunked(data, chunk_size), chunksize=1)
    finally:
        pool.close()
        pool.join()
        return transformed_data
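# Hedged usage sketch (assumption, not from the source above): because apply_parallel
# hands whole chunks to the pool, `func` must accept a list of elements, and results
# come back as one value per chunk. This shows the chunking it performs.
from math import ceil
from more_itertools import chunked

data = list(range(10))
cpu_cores = 4
chunk_size = ceil(len(data) / cpu_cores)          # ceil(10 / 4) == 3
assert list(chunked(data, chunk_size)) == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]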
def __init__(self, ctx, entries, *, per_page=15, title=discord.Embed.Empty, colour=None, **kwargs):
    super().__init__(ctx, **kwargs)
    self._pages = tuple(chunked(entries, per_page))
    self._index = 0

    if colour is None:
        colour = ctx.bot.colour

    # These should probably be removed at some point in the future.
    self.title = title
    self.colour = colour
def make_dataset(video_dir_path, video_phase_annotation_path, phase_list, sample_duration):
    """ Construct dataset of samples from a given video directory path.

    Each sample is a python dictionary containing the video path and indices of 16-frame
    portions from that video file, as well as the associated class label of that portion.
    video_phase_annotation_path file is supposed to contain a frame index and corresponding
    class label (surgical phase) at each of its row.

    :param root_path: Absolute path to the root directory of video and timestamp files.
    :param phase_list: List of all possible phases (classes)
    :param subset: training, validation, or testing.
    :param idx_subset: list of exact video file indices for the chosen subset.
    :param sample_duration: number of frames each sample contains
    :return: list of samples.
    """
    class_to_idx = {phase_list[i]: i for i in range(len(phase_list))}

    dataset = []
    df = pd.read_csv(video_phase_annotation_path, delim_whitespace=True)
    sample = {
        'video': video_dir_path,
        'video_id': os.path.basename(video_dir_path),
    }
    for phase in phase_list:
        df_phase = df.loc[df['Phase'] == phase]
        for group in consecutive_groups(df_phase['Frame'] + 1):
            for chunk in chunked(group, sample_duration):
                sample_j = copy.deepcopy(sample)
                sample_j['frame_indices'] = chunk
                sample_j['label'] = class_to_idx[phase]
                dataset.append(sample_j)

    return dataset
def make_source_data(session: sa_orm.Session, metrics: SourceDataMetrics, available_schemas: list):
    """
    Creates source data based on metrics and available schemas

    :param session: SQLAlchemy session
    :param metrics: determines how many instances of each model to create
    :param available_schemas: list of dictionaries describing form schemas
    """
    # create all the forms from the available schemas
    schema_iterator = factory.Iterator(available_schemas, cycle=True)
    forms = FormFactory.build_batch(metrics.forms, schema=schema_iterator)
    session.add_all(forms)
    session.flush()

    # create all the users
    users = UserFactory.build_batch(metrics.users)
    session.add_all(users)
    session.flush()

    # use the node path map cache to generate submission responses JSON
    get_node_path_map = transformers.get_node_path_map_cache(session)
    _cached_make_response = functools.partial(make_response, get_node_path_map)

    # Create all the submissions
    #
    # To support building a very large number of submissions, we avoid using
    # factory_boy.Factory.build_batch() because it constructs all instances at once
    # in memory as a list
    #
    # Instead, we use a generator and insert 500 instances at a time
    submission_factory = functools.partial(
        SubmissionFactory,
        f_make_response=_cached_make_response,
        form=factory.Iterator(forms, cycle=True),
        user=factory.Iterator(users, cycle=True),
    )
    submissions_generator = (submission_factory() for _ in range(metrics.submissions))
    for chunk in more_itertools.chunked(submissions_generator, 500):
        session.bulk_save_objects(chunk)
        session.flush()
def generate_embeddings_iter(
    model,
    file_path: pathlib.Path,
    batch_size: int,
    device: torch.device,
    seen_set: Set[int],
    min_confidence: float = 0.5,
):
    with h5py.File(str(file_path), "r") as f:
        image_dset = f["image"]
        confidence_dset = f["confidence"]
        external_id_dset = f["external_id"]

        for slicing in chunked(range(len(image_dset)), batch_size):
            slicing = np.array(slicing)
            external_ids = external_id_dset[slicing]
            mask = external_ids == 0

            if np.all(mask):
                break

            mask = (~mask) & (confidence_dset[slicing] >= min_confidence)

            for i, external_id in enumerate(external_ids):
                if int(external_id) in seen_set:
                    mask[i] = 0

            if np.all(~mask):
                continue

            images = image_dset[slicing][mask]
            images = np.moveaxis(images, -1, 1)  # move channel dim to 1st dim

            with torch.no_grad():
                torch_images = torch.tensor(images, dtype=torch.float32, device=device)
                embeddings = model.extract_features(torch_images).cpu().numpy()
                max_embeddings = np.max(embeddings, (-1, -2))

            yield (
                max_embeddings,
                external_ids[mask],
            )
def display_duplicates(duplicates, db, trash="./Trash/"):
    from werkzeug.routing import PathConverter
    import io

    class EverythingConverter(PathConverter):
        regex = '.*?'

    app = Flask(__name__)
    CORS(app)
    app.url_map.converters['everything'] = EverythingConverter

    def render(duplicates, current, total):
        env = Environment(loader=FileSystemLoader('template'))
        template = env.get_template('index.html')
        return template.render(duplicates=duplicates,
                               current=current,
                               total=total)

    with TemporaryDirectory() as folder:
        # Generate all of the HTML files
        chunk_size = 25
        for i, dups in enumerate(chunked(duplicates, chunk_size)):
            with open('{}/{}.html'.format(folder, i), 'w') as f:
                f.write(render(dups,
                               current=i,
                               total=math.ceil(len(duplicates) / chunk_size)))

        webbrowser.open("file://{}/{}".format(folder, '0.html'))

        @app.route('/picture/<everything:file_name>', methods=['DELETE'])
        def delete_picture_(file_name, trash=trash):
            return str(delete_picture(file_name, db, trash))

        @app.route('/heic-transform/<everything:file_name>', methods=['GET'])
        def transcode_heic_(file_name):
            heif_image = pyheif.read_heif(open(file_name, 'rb'))
            image = Image.frombytes(
                mode=heif_image.mode, size=heif_image.size, data=heif_image.data)

            encoded = io.BytesIO()
            image.save(encoded, format='JPEG')

            return Response(encoded.getvalue(), mimetype='image/jpeg')

        app.run()
def upload_s3(s_bucket, s_key):
    clnt = boto3.client('s3', region_name=DEFAULT_REGION)
    doc_serv = boto3.client('cloudsearchdomain',
                            region_name=DEFAULT_REGION,
                            endpoint_url=ENDPOINT_URL)
    response = clnt.get_object(Bucket=s_bucket, Key=s_key)
    raw_data_gz = response.get('Body').read()
    raw_data = zlib.decompress(raw_data_gz, 16 + zlib.MAX_WBITS)

    for raw_data_line in raw_data.splitlines():
        json_data = json.loads(raw_data_line.decode('utf-8'))
        for big_chunk in more_itertools.chunked(json_data['Records'], CHUNK_SIZE):
            for json_event in big_chunk:
                doc = {}
                doc_id = json_event['eventID']

                def search(obj, pattern):
                    cur_obj = obj
                    for item in pattern.split('.'):
                        if not isinstance(cur_obj, dict):
                            return None
                        cur_obj = cur_obj.get(item, None)
                    return cur_obj

                for cs_name, ct_name in MAPPING.items():
                    val = search(json_event, ct_name)
                    if val != None:
                        application.logger.debug(
                            "docId[%s] Adding field CloudSearch ID: %s = %s",
                            doc_id, cs_name, val)
                        doc[cs_name] = val

                doc['raw'] = json.dumps(json_event)
                inbytes = dict_to_binary(doc, doc_id)
                try:
                    response = doc_serv.upload_documents(
                        contentType='application/json',
                        documents=inbytes)
                except Exception as exception:
                    e = exception
                    string = json.dumps(doc)
                    # use named arguments so the {t}/{s} placeholders are actually filled
                    string = '[{{"type":"add", "id":"{t}","fields":{s}}}]'.format(
                        t=doc_id, s=string)
                    application.logger.error(str(e.args) + "JSON: " + string)

                application.logger.debug('Inserting docId: %s', doc_id)
                application.logger.info('CloudSearch commit ok')
def sample_model(model, n, batch_size=256, smiles_column='branch_smiles'):
    n_loops = int(np.ceil(n / batch_size))
    smiles_list, mofs, props = [], [], []
    for chunk in tqdm(chunked(range(n), batch_size), total=n_loops, desc='Samples'):
        z = model.sample_z_prior(len(chunk))
        outs = model.z_to_outputs(z)
        smiles_list.extend(outs['x'])
        mofs.extend(outs['mof'])
        props.extend(outs['y'])
    props = np.stack(props)

    gen_df = pd.DataFrame(smiles_list, columns=[smiles_column])
    gen_df['valid'] = gen_df[smiles_column].apply(valid_smiles)
    for index, label in enumerate(model.vocab_mof.categories):
        gen_df[label] = [m[index] for m in mofs]
    for index, label in enumerate(model.vocab_y.labels):
        gen_df[label] = props[:, index]

    return gen_df
def parseOcrFile(self, filePath):
    """parseOcrFile takes an argument that specifies file's name and path.
    It opens that file and parses out numerals algorithmically using
    OcrNumeralParser.parseOcrLines().

    returns a list of OcrNumeral Lists, one sublist per OCR account number."""
    conditionedFileList = []
    f = open(filePath, 'r')
    ocrNumeralsListofLists = []

    # First strip newlines from each line of file
    for line in f:
        conditionedFileList.extend([line.rstrip("\n")])

    ocrSeqLists = list(chunked(conditionedFileList, 4))

    for ocrSeq in ocrSeqLists:
        resultList = self.parseOcrLines(ocrSeq)
        ocrNumeralsListofLists.extend([resultList])

    return ocrNumeralsListofLists
def get_aws_batch_job_infos(all_job_ids, boto_config=None, missing_ok=False):
    if boto_config is None:
        boto_config = BOTO_CONFIG

    # ensure that the list of job ids is unique
    assert len(all_job_ids) == len(set(all_job_ids))
    batch_client = boto3.client(service_name="batch", config=boto_config)

    returned_jobs = []
    for batch_job_ids in more_itertools.chunked(all_job_ids, 50):
        batch_returned_jobs = _get_aws_batch_job_infos_for_batch(
            batch_job_ids, batch_client, missing_ok=missing_ok
        )
        returned_jobs.extend(batch_returned_jobs)

    returned_ids = [job["jobId"] for job in returned_jobs]
    if not missing_ok:
        assert sorted(returned_ids) == sorted(all_job_ids), str(set(returned_ids) - set(all_job_ids)) + str(
            set(all_job_ids) - set(returned_ids)
        )

    return returned_jobs
async def load_data(override=False):
    redis = await aioredis.create_redis_pool('redis://localhost', password="******")
    await redis.flushall()
    values = await redis.scard('dict:all')
    if values < 1000000 or override:
        all_english_words = load_words(
            path.join(getcwd(), "./scraper/utils/data/english_dictionary/wlist_match2.txt")).union(
            load_words(path.join(getcwd(), "./scraper/utils/data/english_dictionary/wlist_match1.txt")).union(
                load_words(path.join(getcwd(), "./scraper/utils/data/english_dictionary/personal_whitelist.txt"))))
        chunks = list(chunked(all_english_words, 10000))
        for chunk in chunks:
            await redis.sadd('dict:all', *chunk)
        new_len = await redis.scard('dict:all')
        print(f'Database seeded with {new_len} values')
    else:
        print(f'data already loaded with {values} values')
    return
def batch_multiprocess_with_return(
    function_list, pool_results=None, n_cores=mp.cpu_count(), show_progress=True,
    tqdm_desc=None
):
    """
    Run a list of functions on `n_cores` (default: all CPU cores), with the
    option to show a progress bar using tqdm (default: shown).
    """
    iterator = [*chunked(function_list, n_cores)]
    pool_results = pool_results if pool_results else []
    pool = Pool(processes=n_cores)
    if show_progress:
        iterator = tqdm(iterator, desc=tqdm_desc)
    for func_batch in iterator:
        procs = []
        for f in func_batch:
            pool.apply_async(func=f, callback=pool_results.append)
    pool.close()
    pool.join()
    return pool_results
def main() -> None:
    args = parse_arguments()
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    log.info(args)
    tf_counter: TCounter[str] = Counter()
    df_counter: TCounter[str] = Counter()

    with mp.Pool(args.workers, initializer=init, initargs=(args.arch, )) as pool:
        results = pool.imap(analyze, chunked(tqdmf(args.tsv), args.batch_size))
        for tf, df in results:
            tf_counter += tf
            df_counter += df

    vocab = sorted(tf_counter.keys(), key=lambda x: tf_counter[x], reverse=True)
    for word in vocab:
        print(f"{word}\t{tf_counter[word]}\t{df_counter[word]}")
def main():
    env = Environment(loader=FileSystemLoader('.'),
                      autoescape=select_autoescape(['html']))

    with open('static/data.json', 'r', encoding='utf8') as f:
        books_json = f.read()

    books = json.loads(books_json)
    parts = list(chunked(books, 10))
    quantity_pages = len(parts)
    os.makedirs('pages', exist_ok=True)
    template = env.get_template('template.html')
    pages = writer_pages(parts, template, quantity_pages)
    delete_unnecessary_files(pages)

    server = HTTPServer(('0.0.0.0', 8000), SimpleHTTPRequestHandler)
    server.serve_forever()
def fisher(hits, cluster_sizes, test_args, cores):
    raise NotImplementedError(fisher_exact_package_error_msg)
    slices = [
        slice(l[0], l[-1] + 1)
        for l in more_itertools.chunked(np.arange(hits.shape[1]), cores)
    ]
    # print("Starting fisher test")
    t1 = time()
    pvalues_partial_dfs = Parallel(cores)(
        delayed(_run_fisher_exact_test_in_parallel_loop)(
            df=hits.iloc[:, curr_slice],
            cluster_sizes=cluster_sizes,
            test_args=test_args,
        )
        for curr_slice in slices
    )
    # print("Took ", (time() - t1) / 60, " min")
    pvalues = pd.concat(pvalues_partial_dfs, axis=0).sort_index()
    return pvalues
def apply_parallel(func: Callable, data: List[Any], cpu_cores: int = None) -> List[Any]:
    if not cpu_cores:
        cpu_cores = cpu_count()

    try:
        chunk_size = ceil(len(data) / cpu_cores)
        print(chunk_size)
        # pool = Pool(cpu_cores)
        # print("pool")
        chunks = chunked(data, chunk_size)
        # print("was chunked")
        transformed_data = map(func, data)  # chunks, chunksize=1)
        print(type(transformed_data))
        print("data was transformed")
    finally:
        # pool.close()
        # pool.join()
        return list(transformed_data)
def check_if_spelled_right_test():
    seed_redis()
    if redis_client.scard('mispelledwords') < 35000:
        wrong_words = load_words(
            path.join(
                getcwd(),
                "./backend/scraper/scraper_lib/test_scraper_lib/some_incorrect_words.txt"
            ))
        chunks = list(chunked(wrong_words, 10000))
        print('adding words in')
        for chunk in chunks:
            redis_client.sadd('mispelledwords', *chunk)

    # len = redis_client.sdiff('dict:all', 'mispelledwords')
    # wrong_words = [word.lower() for word in wrong_words]
    # missed = []
    # for word in wrong_words:
    #     if redis_client.sismember("dict:all", word) and re.match('^[a-z]*$', word):
    #         missed.append(word)
    assert len(redis_client.sinter('dict:all', 'mispelledwords')) == 0
def run_sending(groups, reserve_time=None):
    # Start sms sending group by group
    for group_name, group_mobiles in groups:
        sms_text = TEXTS[group_name]
        kind = 'timed(%s)' % reserve_time if reserve_time else 'instant'
        print 'Start sending %s sms to group(%s)(%s total)' % (
            kind, group_name, len(group_mobiles))
        for seq, mobiles in enumerate(chunked(group_mobiles, 100)):
            phones = [str(m) for m in mobiles if m]
            print '**The %dth group(expect %s, actual %s)' % (
                seq, len(mobiles), len(phones))
            if not phones:
                continue
            if reserve_time:
                send_reserved_sms_via_yimei(phones, sms_text, reserve_time)
            else:
                send_instant_sms_via_yimei(phones, sms_text)
def from_iterator(
    cls,
    name: str,
    iterator: Iterable[str],
    batch_size: int = 64,
    overwrite: bool = False,
) -> Dataset:
    dataset = cls(name, overwrite=overwrite)
    dataset.data["raw"] = Raw.from_dask_array(
        common.PROJDIR / name / (name + ".raw.zarr.zip"),
        da.concatenate([
            da.from_array(np.array(chunk, dtype=np.bytes_))
            for chunk in chunked(iterator, batch_size)
        ]),
        overwrite=overwrite,
    )
    dataset.save()
    return dataset
def generate(out):
    lst = ['DRS_fail'] * 128
    lst[ord('n')] = 'DRS_null'
    lst[ord('t')] = 'DRS_true'
    lst[ord('f')] = 'DRS_false'
    lst[ord('I')] = 'DRS_inf'
    lst[ord('N')] = 'DRS_nan'
    lst[ord('"')] = 'DRS_string'
    lst[ord("'")] = 'DRS_string'
    lst[ord('{')] = 'DRS_recursive'
    lst[ord('[')] = 'DRS_recursive'
    for c in '+-.0123456789':
        lst[ord(c)] = 'DRS_number'

    print('#ifndef JSON5EncoderCpp_decoder_recursive_select', file=out)
    print('#define JSON5EncoderCpp_decoder_recursive_select', file=out)
    print(file=out)
    print('// GENERATED FILE', file=out)
    print('// All changes will be lost.', file=out)
    print(file=out)
    print('#include <cstdint>', file=out)
    print(file=out)
    print('namespace JSON5EncoderCpp {', file=out)
    print('inline namespace {', file=out)
    print(file=out)
    print('enum DrsKind : std::uint8_t {', file=out)
    print('    DRS_fail, DRS_null, DRS_true, DRS_false, DRS_inf, DRS_nan, DRS_string, DRS_number, DRS_recursive', file=out)
    print('};', file=out)
    print(file=out)
    print('static const DrsKind drs_lookup[128] = {', file=out)
    for chunk in chunked(lst, 8):
        print('   ', end='', file=out)
        for t in chunk:
            print(' ', t, ',', sep='', end='', file=out)
        print(file=out)
    print('};', file=out)
    print(file=out)
    print('}  // anonymous inline namespace', sep='', file=out)
    print('}  // namespace JSON5EncoderCpp', sep='', file=out)
    print(file=out)
    print('#endif', sep='', file=out)
def polishReads(MISMATCH_RESULT, NANOPORE_READ, TEMP_DIR, FINAL_DIR, POLISHED_READ,
                THREADS, PENALTY_PATH, minimapPath, poaPath, raconPath, seqkitPath):
    if os.path.exists(TEMP_DIR):
        logger.warning(f"{TEMP_DIR} existed!!")
    else:
        os.mkdir(TEMP_DIR)
    if os.path.exists(FINAL_DIR):
        logger.warning(f"{FINAL_DIR} existed!!")
    else:
        os.mkdir(FINAL_DIR)

    logger.info('read mismatch results')
    mismatchResult = pd.read_feather(MISMATCH_RESULT)

    logger.info('prepare for polish')
    mismatchResult["readStrand"] = (mismatchResult["readStrand"] ^ mismatchResult["umiStrand"])
    mismatchResult.drop("umiStrand", axis=1, inplace=True)
    mismatchResult["readStrand"] = mismatchResult["readStrand"].astype(str)
    mismatchResult["temp"] = mismatchResult["name"] + "_" + mismatchResult["readStrand"]
    sameUmiReadDt = mismatchResult.groupby("qseqid")["temp"].agg(lambda x: list(x))
    sameUmiReadDc = {i: [[k] for k in j] for i, j in sameUmiReadDt.items()}

    logger.info('start polish')
    umiReadDcIter = chunked(sameUmiReadDc.items(), 100)
    i = 0
    allResults = []
    with ProcessPoolExecutor(THREADS) as multiP:
        for umiReadDtChunk in umiReadDcIter:
            i += 1
            allResults.append(
                multiP.submit(chunkPolishSeq, umiReadDtChunk, NANOPORE_READ,
                              TEMP_DIR, FINAL_DIR, PENALTY_PATH, i, minimapPath,
                              poaPath, raconPath))
    [x.result() for x in allResults]

    logger.info('merge all polished reads')
    time.sleep(10)
    os.system(f"""
        cat {FINAL_DIR}* | {seqkitPath} seq -rp > {POLISHED_READ} && sleep 15 &&\
        rm -rf {FINAL_DIR} &&\
        rm -rf {TEMP_DIR}
        """)
def split_vcf(vcf_path: Path, chunk_size: int) -> Iterable[Path]:
    """
    A simple utility for splitting a VCF file into chunk-sized VCF files.
    Note: all splits keep the original header.

    :param vcf_path: input vcf file.
    :param chunk_size: max number of records in each file.
    :return: paths of tmp split files.
    """
    with vcf_path.open() as vcf_io:
        records, header = partition(lambda line: line.startswith('#'), vcf_io)
        for n, chunk in enumerate(chunked(records, chunk_size)):
            header = header if isinstance(header, tuple) else tuple(header)
            with NamedTemporaryFile(mode='w', suffix='.vcf', delete=False) as out_io:
                out_io.writelines(header)
                out_io.writelines(chunk)
            yield Path(out_io.name)
def read_openpose_json(filename: str) -> List[JointDescriptor]:
    """
    Reads json files generated by OpenPose's predictor.

    Args:
        filename: String; Full path of the file to read.

    Output:
        Outputs of OpenPose usually contain 25 joints with their confidence.
        ```
        [
            JointDescriptor(
                x=1027.69, y=221.108, confidence=0.90927, joint=<OpenPoseJoints.Nose: 0>
            )
        ]
        ```
        `JointDescriptor` and `OpenPoseJoints` are both classes in this library.
    """
    with open(filename, "rb") as f:
        keypoints_list = []
        keypoints = json.load(f)
        assert (len(keypoints["people"]) == 1
                ), "In all pictures, we should have only one person!"
        points_2d = keypoints["people"][0]["pose_keypoints_2d"]
        assert (len(points_2d) == 25 * 3), "We have 25 points with (x, y, c); where c is confidence."

        for point_index, (x, y, confidence) in enumerate(chunked(points_2d, 3)):
            assert x is not None, "x should be defined"
            assert y is not None, "y should be defined"
            assert confidence is not None, "confidence should be defined"
            keypoints_list.append(
                JointDescriptor(x=x, y=y, confidence=confidence,
                                joint=OpenPoseJoints(point_index)))

        return keypoints_list
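# Hedged illustration (assumption, not from the source above): chunked(points, 3) is what
# regroups OpenPose's flat [x1, y1, c1, x2, y2, c2, ...] list into (x, y, confidence) triples.
from more_itertools import chunked

flat = [1027.69, 221.108, 0.90927, 30.0, 40.0, 0.8]
assert list(chunked(flat, 3)) == [[1027.69, 221.108, 0.90927], [30.0, 40.0, 0.8]]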
def add_data(self, values):
    logger.info("Adding data to job...")
    chunks = more_itertools.chunked(iterable=values, n=ClassificationJob.MAX_PAGE_SIZE)
    for index, chunk in enumerate(chunks):
        page_number = index + 1
        rows = self._to_labeled_rows(chunk)
        is_success = self._client.request(
            api="Classifications",
            method="PopulateImport",
            data={
                "job_id": self.id,
                "page": page_number,
                "rows": rows
            }
        )
        logger.info("is_success: {}".format(is_success))
        assert is_success, "Failed to add data."
def get_health_checks(self) -> Mapping[str, HealthCheck]:
    paginator = self.route53.get_paginator('list_health_checks')
    hcs = chain.from_iterable(page['HealthChecks'] for page in paginator.paginate())
    batch_size = 10  # Route53 lets us get tags for at most ten resources at a time
    hc_batches = more_itertools.chunked(hcs, batch_size)
    dcp_hcs = {}
    for hc_batch in hc_batches:
        hc_batch = {hc['Id']: hc for hc in hc_batch}
        response = self.route53.list_tags_for_resources(ResourceType='healthcheck',
                                                        ResourceIds=list(hc_batch.keys()))
        for tag_set in response['ResourceTagSets']:
            assert tag_set['ResourceType'] == 'healthcheck'
            for tag in tag_set['Tags']:
                if tag['Key'] == 'Name':
                    hc_name = tag['Value']
                    hc_id = tag_set['ResourceId']
                    dcp_hcs[hc_name] = hc_batch[hc_id]
    return dcp_hcs
def upload_pics(session, pics):
    # Google Photos only accepts up to 50 uploads per batch, so split into smaller groups.
    # more_itertools.chunked conveniently splits an iterable into groups of the given size.
    group_by = 40
    pics_splitted = chunked(pics, group_by)
    for pics_elem in pics_splitted:
        successed_uploads = []
        # Upload each file to obtain an upload token.
        # Tokens reportedly expire after one day, so this flow may need rethinking
        # if uploading takes on the order of days.
        for pict_bin, file_name, description in pics_elem:
            session.headers["Content-type"] = "application/octet-stream"
            session.headers["X-Goog-Upload-Protocol"] = "raw"
            session.headers["X-Goog-Upload-File-Name"] = file_name

            upload_token = session.post(
                'https://photoslibrary.googleapis.com/v1/uploads', pict_bin)
            if upload_token.status_code == 200:
                successed_uploads.append([upload_token, description])
            else:
                print(
                    f'An error occured while uploading file "{file_name}". Response: {upload_token}'
                )

        # Build the request body for the batch call (mediaItems:batchCreate).
        batch_request_body = {"newMediaItems": []}
        for upload_token, description in successed_uploads:
            batch_request_body['newMediaItems'].append({
                "description": description,
                "simpleMediaItem": {
                    "uploadToken": upload_token.content.decode()
                }
            })
        batch_request_json = json.dumps(batch_request_body)
        result = session.post(
            'https://photoslibrary.googleapis.com/v1/mediaItems:batchCreate',
            batch_request_json)
        if result.status_code != 200:
            print(
                f'An error occured while batch creating. \nStatus code:{result.status_code} Reason: {result.reason}'
            )
def import_insights(
    predictions: Iterable[Prediction],
    server_domain: str,
    batch_size: int = 1024,
) -> int:
    product_store = get_product_store()
    imported: int = 0

    prediction_batch: List[Prediction]
    for prediction_batch in chunked(predictions, batch_size):
        with db.atomic():
            imported += import_insights_(
                prediction_batch,
                server_domain,
                automatic=False,
                product_store=product_store,
            )

    return imported
def handle_sentry(self, *args, **kwargs):
    existing_jobs = set(scheduled_jobs())
    target = set(UniqueFeed.objects.filter(muted=False).values_list(
        'url', flat=True))

    to_delete = existing_jobs - target
    if to_delete:
        logger.info(
            "Deleting {0} jobs from the scheduler".format(len(to_delete)))
        for job_id in to_delete:
            delete_job(job_id)

    to_add = target - existing_jobs
    if to_add:
        logger.info("Adding {0} jobs to the scheduler".format(len(to_add)))
        for chunk in chunked(to_add, 10000):
            uniques = UniqueFeed.objects.filter(url__in=chunk)
            for unique in uniques:
                unique.schedule()
def handle_sentry(self, *args, **kwargs):
    connection = get_redis_connection()
    existing_jobs = set(scheduled_jobs(connection=connection))
    target = set(UniqueFeed.objects.filter(muted=False).values_list(
        'url', flat=True))

    to_delete = existing_jobs - target
    if to_delete:
        logger.info("deleting jobs from the scheduler",
                    count=len(to_delete))
        for job_id in to_delete:
            delete_job(job_id, connection=connection)

    to_add = target - existing_jobs
    if to_add:
        logger.info("adding jobs to the scheduler", count=len(to_add))
        for chunk in chunked(to_add, 10000):
            uniques = UniqueFeed.objects.filter(url__in=chunk)
            for unique in uniques:
                unique.schedule()