async def download_file(_url, folder, file_id=None) -> None:
    create_directory(folder)
    path = Path(f"{folder}/{file_id}_{_url}")
    # Skip files that are already present and non-empty.
    if not path.exists() or getsize(path)['raw'] == 0:
        async with semaphore, aiohttp.ClientSession(
                json_serialize=ujson.dumps,
                headers={'Connection': 'keep-alive'}) as session:
            async with session.get(f"https://telegra.ph/file/{_url}") as response:
                assert response.status == 200
                if parser.parse_args().compress:
                    loop = asyncio.get_running_loop()
                    executor = ProcessPoolExecutor()
                    destination = append_extension(str(path), "webp")
                    if not Path(destination).exists() or getsize(destination)['raw'] == 0:
                        if is_image_by_url(_url):
                            # Offload CPU-bound compression to a worker process.
                            await loop.run_in_executor(
                                executor, compress_image,
                                await response.read(), Path(destination))
                        else:
                            await write_file(await response.read(), path)
                            log(f"[download] {file_id}_{_url} — {getsize(path)['formatted']}")
                else:
                    await write_file(await response.read(), path)
                    log(f"[download] {file_id}_{_url} — {getsize(path)['formatted']}")
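# A minimal, hypothetical sketch of the getsize() helper the download code above
# relies on: it is assumed to return a dict with the raw byte count under 'raw'
# and a human-readable string under 'formatted', for both files and folders.
# The project's real helper (and its convert_bytes formatting) may differ.
import os

def getsize_sketch(path):
    path = str(path)
    if os.path.isdir(path):
        raw = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _dirs, names in os.walk(path)
            for name in names
        )
    else:
        raw = os.path.getsize(path)
    size, unit = float(raw), 'B'
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024 or unit == 'TB':
            break
        size /= 1024
    return {'raw': raw, 'formatted': f'{size:.1f} {unit}'}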
def _createfile(self, srcpath, srcsize, srchash):
    status, message = (600, "MESSAGE-createfile")
    fullpath = self.fullpath(srcpath)
    # Skip the transfer if the file already exists and its hash matches.
    if os.path.exists(fullpath):
        if utils.filehash(fullpath) == srchash:
            utils.log("[server] file hash matches: %s" % srcpath)
            status, message = (605, "file exists, hash matches")
        else:
            status, message = (604, "file exists, hash differs")
    else:
        status, message = (603, "file not present on server")
    # Send handshake message 2.
    self.sw.send({'status': status, 'message': message}, encode='JSON')
    if status == 603 or status == 604:
        seesize = utils.getsize(srcsize)
        utils.log("[server] transferring file: [%s]%s" % (seesize, srcpath))
        self.sw.recvfile(fullpath, srcsize)
        utils.log("[server] file transfer complete: [%s]%s" % (seesize, srcpath))
        # conn.sendall(b'SUCCESS')
    return True
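# A hypothetical sketch of the utils.filehash() helper assumed above: a chunked
# digest of the file contents so the server can compare its copy against the
# hash sent by the client without re-transferring the file. The real utility
# may use a different algorithm or chunk size.
import hashlib

def filehash_sketch(path, algo='md5', chunk_size=64 * 1024):
    h = hashlib.new(algo)
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()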
async def main():
    async with aiohttp.ClientSession(json_serialize=ujson.dumps,
                                     headers={'Connection': 'keep-alive'}) as session:
        async with session.get(
            f"https://api.telegra.ph/getPage/{parser.parse_args().link.removeprefix('https://telegra.ph/')}",
            params={'return_content': 'true'}
        ) as response:
            response = await response.json()

            old_size = getsize(parser.parse_args().folder)['raw']
            start_time = datetime.now()
            log([
                f"[info] Started at: {datetime.now()}",
                f"[download] {response['result']['title']}",
            ])

            # Walk the page content tree and collect the sources of all images and videos.
            queue = response['result']['content']
            files = []
            while queue:
                curr = queue.pop()
                if not isinstance(curr, dict):
                    continue  # plain text nodes carry no media
                if (nexts := curr.get("children")) and isinstance(nexts, list):
                    queue.extend(nexts)
                if curr.get("tag") in ("img", "video"):
                    files.append(curr['attrs']['src'])

            urls = [filename.split('/')[-1] for filename in files[::-1]]
            log(f"[info] Files in telegraph page: {len(urls)}")
            await asyncio.gather(*[
                download_file(url, parser.parse_args().folder, file_id)
                for file_id, url in enumerate(urls)
            ])

            size = convert_bytes(getsize(parser.parse_args().folder)['raw'] - old_size)
            log([
                f"[download] Saved {size} to \"{parser.parse_args().folder}\"",
                f"[info] Time elapsed: {datetime.now() - start_time}"
            ])
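# A hypothetical sketch of the argparse setup and entry point main() relies on.
# The option names mirror the attributes accessed above (link, folder, compress),
# but the actual script may define them differently (e.g. as flags with defaults).
import argparse
import asyncio

parser = argparse.ArgumentParser(description='Download media from a telegra.ph page')
parser.add_argument('link', help='URL of the telegra.ph page')
parser.add_argument('--folder', default='downloads', help='output directory')
parser.add_argument('--compress', action='store_true', help='re-encode images as webp')

if __name__ == '__main__':
    asyncio.run(main())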
def bench(fn, df, writer, reader, desc=''):
    twrite = Timer(start=1)
    res = writer(df, fn)
    if res is INFEASIBLE:
        # print('INFEASIBLE')
        return nan, nan, nan
    twrite.end()
    # print('Written with', writer)
    tread = Timer(start=1)
    rdf = reader(fn)
    tread.end()
    assert df.shape == rdf.shape, '{} != {}'.format(df.shape, rdf.shape)
    # assert (df.dtypes == rdf.dtypes).all(), '{}\n\n != \n\n{}'.format(df.dtypes, rdf.dtypes)
    return twrite.time, tread.time, getsize(fn) / 10**6
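# A hypothetical usage sketch of bench(): timing a CSV round-trip with pandas.
# It assumes bench(), Timer, INFEASIBLE, and getsize come from the same module;
# the writer/reader pair and the sample frame are illustrative only.
import pandas as pd

sample = pd.DataFrame({'a': range(1000), 'b': [0.5] * 1000})
w_time, r_time, size_mb = bench(
    'sample.csv',
    sample,
    writer=lambda df, fn: df.to_csv(fn, index=False),
    reader=pd.read_csv,
)
print(f'write {w_time:.3f}s, read {r_time:.3f}s, {size_mb:.2f} MB on disk')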
def build_route_flights(session, weeks, aircraft, routes, meal_types, verbose=0):
    aircraft_by_airport_pair = get_aircraft_by_airport_pair(aircraft, routes)
    passengers_by_airport_pair = collections.defaultdict(list)

    # Iterate week by week, going back `weeks` weeks from the most recent Sunday.
    now = datetime.datetime.now()
    sunday = now - datetime.timedelta(days=(now.weekday() + 1))
    week_start = sunday - datetime.timedelta(days=7 * weeks)
    while week_start < sunday:
        if verbose:
            print(f'Working on {week_start} ... ', end='', flush=True)
        # Build the flights for every route in this week.
        build_route_flight_week(
            session,
            week_start,
            routes,
            aircraft_by_airport_pair,
            passengers_by_airport_pair,
            meal_types,
            verbose,
        )
        week_start += datetime.timedelta(days=7)
        if verbose:
            print('done.', flush=True)

    if verbose > 1:
        size = getsize(passengers_by_airport_pair)
        print(f'Size of passengers_by_airport_pair: {size} bytes', flush=True)
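# Here getsize() is applied to an in-memory container rather than a file, so it is
# presumably a recursive sys.getsizeof()-style helper. A minimal sketch under that
# assumption, with the optional convert_to='Mb' conversion used by the training
# code further below; the project's real utility may differ.
import sys
from collections.abc import Mapping

def getsize_obj_sketch(obj, convert_to=None, _seen=None):
    _seen = set() if _seen is None else _seen
    if id(obj) in _seen:
        return 0
    _seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, Mapping):
        size += sum(
            getsize_obj_sketch(k, None, _seen) + getsize_obj_sketch(v, None, _seen)
            for k, v in obj.items()
        )
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(getsize_obj_sketch(item, None, _seen) for item in obj)
    if convert_to == 'Mb':
        return size / 2**20
    return size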
def run_benchmark(parameters):
    check_support(parameters, unsupported_params=["dfiles_num", "gpu_memory"])

    parameters["data_file"] = parameters["data_file"].replace("'", "")
    parameters["optimizer"] = parameters["optimizer"] or "intel"
    parameters["no_ml"] = parameters["no_ml"] or False

    # ML specific
    N_RUNS = 50
    TEST_SIZE = 0.1
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ",
        "PERNUM", "PERWT", "SEX", "AGE", "EDUC", "EDUCD", "INCTOT",
        "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2",
        "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2",
        "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", "EDUC_SP", "EDUC_MOM2", "EDUC_POP2",
        "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2", "EDUCD_POP2",
        "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2",
    ]
    columns_types = [
        "int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64",
        "int64", "int64", "int64", "int64", "int64", "int64", "int64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
    ]
    etl_keys = ["t_readcsv", "t_etl", "t_connect"]
    ml_keys = ["t_train_test_split", "t_ml", "t_train", "t_inference"]
    ml_score_keys = ["mse_mean", "cod_mean", "mse_dev", "cod_dev"]

    if not parameters["no_pandas"]:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    etl_times_ibis = None
    ml_times_ibis = None
    etl_times = None
    ml_times = None

    if parameters["validation"] and parameters["import_mode"] != "pandas":
        print(
            "WARNING: validation cannot be performed, it works only for 'pandas' "
            f"import mode, '{parameters['import_mode']}' passed"
        )

    if parameters["data_file"].endswith(".csv"):
        csv_size = getsize(parameters["data_file"])
    else:
        print("WARNING: uncompressed datafile not found, default value for dataset_size is set")
        # Default csv_size value (unit: MB) obtained by calling getsize
        # on the ipums_education2income_1970-2010.csv file
        # (the default Census benchmark data file).
        csv_size = 2100.0

    if not parameters["no_ibis"]:
        df_ibis, X_ibis, y_ibis, etl_times_ibis = etl_ibis(
            filename=parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types,
            database_name=parameters["database_name"],
            table_name=parameters["table"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            create_new_table=not parameters["dni"],
            ipc_connection=parameters["ipc_connection"],
            validation=parameters["validation"],
            etl_keys=etl_keys,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
        )

        print_results(results=etl_times_ibis, backend="Ibis", unit="s")
        etl_times_ibis["Backend"] = "Ibis"
        etl_times_ibis["dataset_size"] = csv_size

        if not parameters["no_ml"]:
            ml_scores_ibis, ml_times_ibis = ml(
                X=X_ibis,
                y=y_ibis,
                random_state=RANDOM_STATE,
                n_runs=N_RUNS,
                test_size=TEST_SIZE,
                optimizer=parameters["optimizer"],
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times_ibis, backend="Ibis", unit="s")
            ml_times_ibis["Backend"] = "Ibis"
            print_results(results=ml_scores_ibis, backend="Ibis")
            ml_scores_ibis["Backend"] = "Ibis"

    if not parameters["no_pandas"]:
        df, X, y, etl_times = etl_pandas(
            parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types,
            etl_keys=etl_keys,
            pandas_mode=parameters["pandas_mode"],
        )

        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="s")
        etl_times["Backend"] = parameters["pandas_mode"]
        etl_times["dataset_size"] = csv_size

        if not parameters["no_ml"]:
            ml_scores, ml_times = ml(
                X=X,
                y=y,
                random_state=RANDOM_STATE,
                n_runs=N_RUNS,
                test_size=TEST_SIZE,
                optimizer=parameters["optimizer"],
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="s")
            ml_times["Backend"] = parameters["pandas_mode"]
            print_results(results=ml_scores, backend=parameters["pandas_mode"])
            ml_scores["Backend"] = parameters["pandas_mode"]

    if parameters["validation"] and parameters["import_mode"] == "pandas":
        # This should work only for pandas import mode.
        compare_dataframes(ibis_dfs=(X_ibis, y_ibis), pandas_dfs=(X, y))

    return {
        "ETL": [etl_times_ibis, etl_times],
        "ML": [ml_times_ibis, ml_times],
    }
def check(self, force, statfile):
    ldir = [x[0] for x in os.walk(self.indir)]
    if not ut.testeos(eostest, eostest_size):
        print('eos seems to have problems, should check, will exit')
        sys.exit(3)

    for process in ldir:
        uid = process.replace(self.indir, "")
        if uid == "":
            continue
        if self.process != '' and uid != self.process:
            continue

        print('%s/%s/check' % (self.yamldir, uid))
        if not ut.file_exist('%s/%s/check' % (self.yamldir, uid)) and not force:
            continue

        # Skip analysis ("ana") directories.
        psplit = process.split('/')
        isana = False
        for a in psplit:
            if a == "ana":
                isana = True
        if isana:
            print('is ana')
            print(process)
            continue

        All_files = glob.glob("%s/output_*.root" % (process))
        if len(All_files) == 0:
            continue

        print('--------------------- ', uid)
        print('number of files ', len(All_files))
        print('process from the input directory ', uid)

        outdir = self.makeyamldir(self.yamldir + uid)
        hasbeenchecked = False
        nevents_tot = 0
        njobsdone_tot = 0
        njobsbad_tot = 0
        for f in All_files:
            self.count = 0
            if not os.path.isfile(f):
                print('file does not exist... %s' % f)
                continue

            jobid = f.split('_')[-1]
            jobid = jobid.replace(self.fext, '')
            userid = ut.find_owner(f)
            outfile = '%soutput_%s.yaml' % (outdir, jobid)

            # Skip files that already have a non-trivial yaml report, unless forced.
            if ut.file_exist(outfile) and ut.getsize(outfile) > 100 and not force:
                continue

            hasbeenchecked = True
            print('-----------checking root file ', f)

            if '.root' in self.fext:
                nevts, check = self.checkFile_root(f, treename)
                status = 'DONE'
                if not check:
                    status = 'BAD'

                if status == 'DONE':
                    nevents_tot += nevts
                    njobsdone_tot += 1
                else:
                    njobsbad_tot += 1

                dic = {
                    'processing': {
                        'process': uid,
                        'jobid': jobid,
                        'nevents': nevts,
                        'status': status,
                        'out': f,
                        'size': os.path.getsize(f),
                        'user': userid
                    }
                }
                print('-----------writing yaml file ', outfile)
                with open(outfile, 'w') as outyaml:
                    yaml.dump(dic, outyaml, default_flow_style=False)
                continue
            else:
                print('not correct file extension %s' % self.fext)

        if hasbeenchecked:
            cmdp = '<pre>date=%s <span class="espace"/> time=%s <span class="espace"/> njobs=%i <span class="espace"/> nevents=%i <span class="espace"/> njobbad=%i <span class="espace"/> process=%s </pre>\n' % (
                ut.getdate_str(), ut.gettime_str(), njobsdone_tot, nevents_tot,
                njobsbad_tot, uid)
            stat_exist = ut.file_exist(statfile)
            with open(statfile, "a") as myfile:
                if not stat_exist:
                    myfile.write(
                        '<link href="/afs/cern.ch/user/h/helsens/www/style/txtstyle.css" rel="stylesheet" type="text/css" />\n'
                    )
                    myfile.write(
                        '<style type="text/css"> /*<![CDATA[*/ .espace{ margin-left:3em } .espace2{ margin-top:9em } /*]]>*/ </style>\n'
                    )
                myfile.write(cmdp)
            print('date=%s time=%s njobs=%i nevents=%i njobbad=%i process=%s' % (
                ut.getdate_str(), ut.gettime_str(), njobsdone_tot, nevents_tot,
                njobsbad_tot, uid))
def queries_modin(filename, pandas_mode, extended_functionality):
    data_files_names = files_names_from_pattern(filename)
    data_for_groupby_queries = []
    data_for_join_queries = []
    for f in data_files_names:
        if f.split("/")[-1].startswith("G1"):
            data_for_groupby_queries.append(f)
        elif f.split("/")[-1].startswith("J1"):
            data_for_join_queries.append(f)
        else:
            raise AttributeError(f"Unrecognized file is passed as -data_file flag argument: {f}")

    groupby_queries_files_number = len(data_for_groupby_queries)
    join_queries_files_number = len(data_for_join_queries)
    accepted_number_of_files_for_join_queries = [0, 1, 4]

    if all([groupby_queries_files_number, join_queries_files_number]):
        raise AttributeError(
            "Only one type of queries (groupby or join) can be executed during one run, "
            "but files for both queries are passed with -data_file flag"
        )
    elif groupby_queries_files_number > 1:
        raise AttributeError(
            f"Only one file for one run is accepted for groupby queries, "
            f"actually passed {groupby_queries_files_number}: {data_for_groupby_queries}"
        )
    elif join_queries_files_number not in accepted_number_of_files_for_join_queries:
        raise AttributeError(
            f"Accepted numbers of files for join queries are "
            f"{accepted_number_of_files_for_join_queries}, "
            f"actually passed {join_queries_files_number}: {data_for_join_queries}"
        )
    elif join_queries_files_number and sum("NA" in f for f in data_for_join_queries) != 1:
        raise FileNotFoundError(
            "Data files for join queries should contain file (only one) with NA component in the file name"
        )

    queries_results_fields = ["t_run1", "chk_t_run1", "t_run2", "chk_t_run2"]

    if groupby_queries_files_number:
        print(f"loading dataset {data_for_groupby_queries[0]}")
        t0 = timer()
        x = pd.read_csv(data_for_groupby_queries[0])
        x_data_file_import_time = timer() - t0

        queries = {
            "groupby_query1": groupby_query1_modin,
            "groupby_query2": groupby_query2_modin,
            "groupby_query3": groupby_query3_modin,
            "groupby_query4": groupby_query4_modin,
            "groupby_query5": groupby_query5_modin,
            "groupby_query6": groupby_query6_modin,
            "groupby_query7": groupby_query7_modin,
            "groupby_query8": groupby_query8_modin,
            "groupby_query9": groupby_query9_modin,
            "groupby_query10": groupby_query10_modin,
        }
        if pandas_mode == "Modin_on_omnisci":
            del queries["groupby_query6"]  # NotImplementedError: unsupported aggregate median
            del queries["groupby_query8"]  # query execution in `Modin_on_omnisci` mode is under development
            del queries["groupby_query9"]  # core dumped issue
            del queries["groupby_query10"]  # core dumped issue

        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        x_data_file_size = getsize(data_for_groupby_queries[0])
        query_data_file_sizes = {x: x_data_file_size for x in queries.keys()}
        query_data_file_import_times = {x: x_data_file_import_time for x in queries.keys()}
        queries_parameters = {
            "x": x,
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }

    if join_queries_files_number:
        data_name = next(
            (f for f in data_for_join_queries if "NA" in f), None
        )  # gets the file name with "NA" component
        data_files_paths, data_files_sizes = join_to_tbls(data_name)

        data_files_import_times = {}
        data_df = {}
        print(f"loading dataset {[path for path in data_files_paths.values()]}")
        for data_id, data_path in data_files_paths.items():
            t0 = timer()
            data_df[data_id] = pd.read_csv(data_path)
            data_files_import_times[data_id] = timer() - t0

        print(len(data_df["x"].index), flush=True)
        print(len(data_df["small"].index), flush=True)
        print(len(data_df["medium"].index), flush=True)
        print(len(data_df["big"].index), flush=True)

        queries = {
            "join_query1": join_query1_modin,
            "join_query2": join_query2_modin,
            "join_query3": join_query3_modin,
            "join_query4": join_query4_modin,
            "join_query5": join_query5_modin,
        }
        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        queries_parameters = {
            "x": data_df["x"],
            "ys": [data_df["small"], data_df["medium"], data_df["big"]],
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }
        query_data_file_sizes = {
            "join_query1": data_files_sizes["x"] + data_files_sizes["small"],
            "join_query2": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query3": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query4": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query5": data_files_sizes["x"] + data_files_sizes["big"],
        }
        query_data_file_import_times = {
            "join_query1": data_files_import_times["x"] + data_files_import_times["small"],
            "join_query2": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query3": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query4": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query5": data_files_import_times["x"] + data_files_import_times["big"],
        }

    for query_name, query_func in queries.items():
        query_func(**queries_parameters)
        print(f"{pandas_mode} {query_name} results:")
        print_results(results=queries_results[query_name], unit="s")
        queries_results[query_name]["Backend"] = pandas_mode
        queries_results[query_name]["t_readcsv"] = query_data_file_import_times[query_name]
        queries_results[query_name]["dataset_size"] = query_data_file_sizes[query_name]

    return queries_results
def train(self):
    tStep = tf.Variable(0, trainable=False, dtype=tf.int64)
    tWriteSummary = tf.Variable(True, trainable=False, dtype=tf.bool)
    train_start_time = time.time()

    for res in tqdm(range(self.start_resolution_log2, self.resolution_log2 + 1),
                    desc='Training res'):
        logging.info('Training %dx%d model...' % (2**res, 2**res))
        res_start_time = time.time()

        if self.reset_opt_state_for_new_lod:
            self.reset_optimizers_state()
        self.adjust_optimizers_learning_rate(res)

        images_generator = iter(self.images_generators[res])
        logging.info('Images generator size: {:.2f} Mbs'.format(
            getsize(images_generator, convert_to='Mb')))

        batch_size = self.batch_sizes[str(res)]
        summary_writer = self.summary_writers[res]
        fade_in_steps = self.get_n_steps(res, FADE_IN_MODE)
        stabilization_steps = self.get_n_steps(res, STABILIZATION_MODE)

        with summary_writer.as_default():
            # The first resolution doesn't use alpha parameter,
            # but has usual number of steps for stabilization phase
            if res > self.start_resolution_log2:
                # Fading in stage
                fade_in_stage_start_time = time.time()

                if self.clear_session_for_new_model:
                    logging.info('Clearing session...')
                    tf.keras.backend.clear_session()

                self.D_model, self.G_model, self.Gs_model = self.create_models(
                    res, mode=FADE_IN_MODE)

                tStep.assign(0)
                tWriteSummary.assign(True)
                desc = f'Res {res}, fade-in steps'
                for step in tqdm(range(fade_in_steps), desc=desc):
                    last_step_cond = step == (fade_in_steps - 1)

                    alpha = compute_alpha(step, fade_in_steps)
                    self.G_model = update_wsum_alpha(self.G_model, alpha)
                    self.D_model = update_wsum_alpha(self.D_model, alpha)
                    if self.use_Gs:
                        self.Gs_model = update_wsum_alpha(self.Gs_model, alpha)

                    write_summary = step % self.summary_every == 0 or last_step_cond
                    tWriteSummary.assign(write_summary)
                    if write_summary:
                        tStep.assign(step)
                        tf.summary.scalar('alpha', alpha, step=step)

                    G_latents = self.generate_latents(batch_size)
                    D_latents = self.generate_latents(batch_size)
                    batch_images = next(images_generator)

                    self.train_step(G_model=self.G_model,
                                    D_model=self.D_model,
                                    G_latents=G_latents,
                                    D_latents=D_latents,
                                    images=batch_images,
                                    write_summary=tWriteSummary,
                                    step=tStep)

                    if self.use_Gs:
                        self.smooth_net_weights(Gs_model=self.Gs_model,
                                                G_model=self.G_model,
                                                beta=self.smoothed_beta)

                    if write_summary:
                        summary_writer.flush()

                    if step % self.save_model_every == 0 or last_step_cond:
                        self.save_models(res=res, mode=FADE_IN_MODE, step=step)
                    if step % self.save_images_every == 0 or last_step_cond:
                        self.save_valid_images(res, step, stage=FADE_IN_MODE)
                        if self.use_Gs:
                            self.save_valid_images(res, step, stage=FADE_IN_MODE,
                                                   smoothed=True)

                self.update_models_weights()
                remove_old_models(self.model_name, res, stage=FADE_IN_MODE,
                                  max_models_to_keep=self.max_models_to_keep,
                                  storage_path=self.storage_path)

                fade_in_stage_total_time = time.time() - fade_in_stage_start_time
                logging.info(f'Fade-in stage took {fade_in_stage_total_time:.3f} seconds')

            # Stabilization stage
            stabilization_stage_start_time = time.time()

            if self.clear_session_for_new_model:
                logging.info('Clearing session...')
                tf.keras.backend.clear_session()

            self.D_model, self.G_model, self.Gs_model = \
                self.create_models(res, mode=STABILIZATION_MODE)

            tStep.assign(fade_in_steps)
            tWriteSummary.assign(True)
            desc = f'Res {res}, stabilization steps'
            for step in tqdm(range(stabilization_steps), desc=desc):
                last_step_cond = step == (stabilization_steps - 1)

                write_summary = step % self.summary_every == 0 or last_step_cond
                tWriteSummary.assign(write_summary)
                if write_summary:
                    tStep.assign(step + fade_in_steps)

                G_latents = self.generate_latents(batch_size)
                D_latents = self.generate_latents(batch_size)
                batch_images = next(images_generator)

                self.train_step(G_model=self.G_model,
                                D_model=self.D_model,
                                G_latents=G_latents,
                                D_latents=D_latents,
                                images=batch_images,
                                write_summary=tWriteSummary,
                                step=tStep)

                if self.use_Gs:
                    self.smooth_net_weights(Gs_model=self.Gs_model,
                                            G_model=self.G_model,
                                            beta=self.smoothed_beta)

                if write_summary:
                    summary_writer.flush()

                if step % self.save_model_every == 0 or last_step_cond:
                    self.save_models(res=res, mode=STABILIZATION_MODE, step=step)
                if step % self.save_images_every == 0 or last_step_cond:
                    self.save_valid_images(res, step, STABILIZATION_MODE)
                    if self.use_Gs:
                        self.save_valid_images(res, step, STABILIZATION_MODE,
                                               smoothed=True)

            self.update_models_weights()
            remove_old_models(self.model_name, res, stage=STABILIZATION_MODE,
                              max_models_to_keep=self.max_models_to_keep,
                              storage_path=self.storage_path)

            stabilization_stage_total_time = time.time() - stabilization_stage_start_time
            logging.info(f'Stabilization stage took {stabilization_stage_total_time:.3f} seconds')

        res_total_time = time.time() - res_start_time
        logging.info(f'Training model of resolution {res} took {res_total_time:.3f} seconds\n\n')

    train_total_time = time.time() - train_start_time
    logging.info(f'Training finished in {train_total_time:.3f} seconds!')