def main():
    filenames = sorted([
        f for f in os.listdir(os.path.join(BASE_PATH, RECORDER, SEGMENT_DIR))
        if ".wav" in f
    ])
    p_map(save_features, filenames[100:900], num_cpus=6)

def process_tomo(self, averages, maxiter, iswap4=False, iswap2=False, test=False, iswap=False):
    # start only after run_iswap
    density_matrix_after_gate = []
    # for ind, rho_initial in enumerate(self._rho_prepared_states):
    #     density_matrix_after_gate.append(self._find_rho_iswap(3, self._results_iswap[ind]))
    # can be parallelized
    find_rho_partial = partial(Tomography._find_rho_iswap, self, averages)
    if iswap4:
        density_matrix_after_gate = p_map(find_rho_partial, self._results_4iswap, num_cpus=24)
    elif iswap2:
        density_matrix_after_gate = p_map(find_rho_partial, self._results_2iswap, num_cpus=24)
    elif test:
        density_matrix_after_gate_3dim = self._results_iswap_test
        for dm in density_matrix_after_gate_3dim:
            density_matrix_after_gate.append(Tomography.rho3dim_to_rho(dm))
    elif iswap:
        density_matrix_after_gate = p_map(find_rho_partial, self._results_iswap, num_cpus=24)

    density_matrix_after_gate_full = []
    for dm in density_matrix_after_gate:
        density_matrix_after_gate_full.append(dm.full())

    x0 = random.rand(256)
    for n in tqdm_notebook(range(averages), desc='Tomography iswap_operator: Likelihood minimization', ncols=700):
        new = minimize(self._likelihood_iswap, x0, args=(density_matrix_after_gate_full,),
                       method='BFGS', tol=1e-09, options={'gtol': 1e-05, 'maxiter': maxiter})
        if n == 0:
            best = new
        elif new.fun < best.fun:
            best = new
        x0 = best.x + 0.1 * (random.rand(256) * 2 - 1)
    print('error is', best.fun)
    return self.x_to_chi(best.x)

def _extract_dataset_frames(video_files, extracted_image_dirs, n_frame_files, n_workers):
    p_map(_extract_video_frames, video_files, extracted_image_dirs, n_frame_files, num_cpus=n_workers)

def find_appropriate_phase(self):
    phi1s = linspace(0, 2*pi, 40)
    phi2s = linspace(0, 2*pi, 40)
    errors_1_array = p_map(Tomography.error_function_1, [self]*len(phi1s), phi1s, [0]*len(phi1s), num_cpus=20)
    errors_2_array = p_map(Tomography.error_function_2, [self]*len(phi2s), [0]*len(phi2s), phi2s, num_cpus=20)
    X = errors_1_array
    Y = errors_2_array
    Z = np.empty((len(Y), len(X)))
    for i in range(len(Y)):
        for j in range(len(X)):
            Z[i, j] = Y[i]**2 + X[j]**2
    f = interp2d(phi1s, phi2s, Z)

    def g(phi_array):
        return f(phi_array[0], phi_array[1])[0]

    result = minimize(g, array([1, 1]))
    phi1 = result.x[0]
    phi2 = result.x[1]
    return phi1, phi2

def scrape(query, directory, max_downloads, all_pages):
    """
    Scrapes audio files that match the user query.

    Arguments:
        query (str): Xeno-canto search query.
        directory (str): Directory to save audio files to.
        max_downloads (int): The maximum number of downloads per response page.
        all_pages (bool): Whether to download more than one response page.

    Returns:
        None: Does not return anything; writes audio files to the specified directory.
    """
    current_page = 1
    query, species = remove_species(query)
    response_dict = post_query(query, current_page)
    total_length = len(response_dict['recordings'])
    response_pages = [response_dict]

    if all_pages:
        while len(response_dict['recordings']) == 500:
            current_page += 1
            response_dict = post_query(query, current_page)
            total_length += len(response_dict['recordings'])
            response_pages.append(response_dict)

    print('A total of {} files have been found.'.format(total_length))
    print('Commencing downloads.')

    for response_dict in tqdm(response_pages):
        recordings_to_fetch = list(response_dict['recordings'])[:max_downloads]
        species_list = [species] * len(recordings_to_fetch)
        directory_list = [directory] * len(recordings_to_fetch)
        p_map(fetch_single, recordings_to_fetch, species_list, directory_list)

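# A minimal usage sketch for scrape() above. The query string and output
# directory are hypothetical examples, not values taken from the source.
if __name__ == "__main__":
    scrape(
        query="common chiffchaff cnt:netherlands",  # hypothetical xeno-canto query
        directory="./downloads",                    # hypothetical output directory
        max_downloads=50,
        all_pages=False,
    )
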
def dicom_to_png_matlab(dicom_paths, image_paths, selection_criteria, skip_existing=True):
    """Converts a dicom image to a grayscale 16-bit png image using matlab.

    NOTE: Must be run from oncodata/dicom_to_png directory so that Matlab can
    find the dicomToPng.m conversion script.

    Arguments:
        dicom_paths(list[str]): A list of paths to dicom files.
        image_paths(list[str]): A list of paths where the images will be saved.
        selection_criteria: Criteria passed to is_selected_dicom to decide which dicoms are converted.
        skip_existing(bool): True to skip images which already exist.
    """

    if len(dicom_paths) != len(image_paths):
        print('Error: DICOM paths and image paths must be the same length.')
        exit()

    dicom_paths = np.array(dicom_paths)
    image_paths = np.array(image_paths)

    if skip_existing:
        print('Checking for existing images')
        keep = p_map(lambda image_path: not os.path.exists(image_path), image_paths)
        keep_indices = np.where(keep)
        dicom_paths = dicom_paths[keep_indices]
        image_paths = image_paths[keep_indices]

    # Ensure that dicoms meet selection criteria and only have one slice
    print('Checking for invalid dicoms')
    keep = p_map(
        lambda dicom_path: is_selected_dicom(dicom_path, selection_criteria) and has_one_slice(dicom_path),
        dicom_paths)
    keep_indices = np.where(keep)
    dicom_paths = dicom_paths[keep_indices]
    image_paths = image_paths[keep_indices]

    if len(dicom_paths) == 0:
        return

    # Create directory for images if necessary
    print('Creating directories for images')
    p_umap(create_directory_if_necessary, image_paths)

    # Save paths to temporary files which will be loaded by matlab
    with NamedTemporaryFile(suffix='.txt') as dicoms_file:
        with NamedTemporaryFile(suffix='.txt') as images_file:
            np.savetxt(dicoms_file.name, dicom_paths, fmt='%s')
            np.savetxt(images_file.name, image_paths, fmt='%s')

            # Convert DICOM to PNG using matlab
            print('Converting with matlab')
            Popen([
                'matlab', '-nodisplay', '-nodesktop', '-nojvm', '-r',
                "dicomToPng('%s', '%s'); exit;" % (dicoms_file.name, images_file.name)
            ]).wait()

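# A hedged usage sketch for dicom_to_png_matlab() above. The paths are
# hypothetical placeholders and the selection_criteria value is an assumption;
# its exact shape depends on how is_selected_dicom interprets it in this codebase.
if __name__ == "__main__":
    dicom_to_png_matlab(
        dicom_paths=['/data/dicoms/case_001.dcm'],  # hypothetical input path
        image_paths=['/data/pngs/case_001.png'],    # hypothetical output path
        selection_criteria=None,                    # hypothetical: accept all dicoms
        skip_existing=True,
    )
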
def prep_graph_BP(self, out=True):
    print('Preparing B', file=sys.stdout)
    Bs = p_map(HINProcess._prep_graph_B, self.infos, num_cpus=self.nproc)
    print('Preparing P', file=sys.stdout)
    Ps = p_map(HINProcess._prep_graph_P, self.infos, num_cpus=self.nproc)
    if out:
        HINProcess._save_interim_BP(Bs, Ps, self.csvs, self.nproc)
    return Bs, Ps

def create_all_images(
    config: ImageConfig,
    output_path: Path,
    snapshot_path: Path,
    catalogue_path: Path,
    parallel: bool = False,
    debug: bool = False,
):
    """
    Create all images, given a config and a set of snapshots and catalogues.

    Parameters
    ----------
    config: ImageConfig
        Complete image configuration object, containing importantly the
        ``raw_images`` list.

    output_path: Path, str
        Output path to save images to. Inside this path, there will be a
        number of directories created (one per halo). This path must
        already exist.

    snapshot_path: Path, str
        Path to the snapshot (``/path/to/output_0000.hdf5``).

    catalogue_path: Path, str
        Path to the catalogue (``/path/to/halo_0000.properties``).

    parallel: bool, optional
        Whether or not to create all images in parallel with each other
        (uses p_tqdm).

    debug: bool, optional
        Whether or not to print out the progress of the image creation.
    """

    haloes = haloes_to_visualise(config=config, catalogue_path=catalogue_path)

    def packed_vis(halo):
        visualise_halo(
            output_path=output_path,
            snapshot_path=snapshot_path,
            config=config,
            halo=halo,
        )

    if parallel:
        p_map(packed_vis, haloes, disable=not debug)
    else:
        list(map(packed_vis, tqdm(haloes, disable=not debug)))

    build_webpage(config=config, haloes=haloes, output_path=output_path)

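# A minimal invocation sketch for create_all_images() above. All paths are
# hypothetical placeholders, and the ImageConfig construction shown is an
# assumption about the surrounding codebase, not its confirmed API.
config = ImageConfig.from_file("image_config.yml")  # assumed constructor
create_all_images(
    config=config,
    output_path=Path("./images"),
    snapshot_path=Path("/path/to/output_0000.hdf5"),
    catalogue_path=Path("/path/to/halo_0000.properties"),
    parallel=True,
    debug=False,
)
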
def Main():
    parser = argparse.ArgumentParser()
    # Positional arguments are required by default; passing required=True to a
    # positional argument makes argparse raise a TypeError.
    parser.add_argument("m1", type=int, help='Get Previous Issues for this Movement Type')
    parser.add_argument("m2", type=int, help='Get Returns for this Movement Type')
    args = parser.parse_args()

    global m1
    global m2
    m1 = args.m1
    m2 = args.m2

    movement_type_subset = read_in_data()

    start = time.time()
    print("start chunking")
    gb = movement_type_subset.groupby(['material_number'])
    n_items = list(gb.groups.items())
    n_items = dict(n_items)
    list_groups = [gb.get_group(x) for x in tqdm(n_items, ascii=True)]
    end = time.time()
    print("chunking done in " + str(end - start))

    start = time.time()
    print("start looping")
    NUM_CORES = 28
    # NOTE: p_map manages its own worker processes, so the Pool below is not
    # what actually runs process_df_function.
    with multiprocessing.Pool(NUM_CORES) as pool:
        if m1 == 201 and m2 == 202:
            netted_results_cost = pd.concat(p_map(process_df_function, list_groups), axis=0, ignore_index=True)
            cleaning_tools.save_parquet(netted_results_cost, 'netted_results_cost.parquet.gzip', index=False, save_dir=primary_dir)
        elif m1 == 261 and m2 == 262:
            netted_results_orders = pd.concat(p_map(process_df_function, list_groups), axis=0, ignore_index=True)
            cleaning_tools.save_parquet(netted_results_orders, 'netted_results_orders.parquet.gzip', index=False, save_dir=primary_dir)
        else:
            netted_results_project = pd.concat(p_map(process_df_function, list_groups), axis=0, ignore_index=True)
            cleaning_tools.save_parquet(netted_results_project, 'netted_results_project.parquet.gzip', index=False, save_dir=primary_dir)

def prep_graph_BP(self):
    print('Preparing B', file=sys.stdout)
    Bs = p_map(HINProcess._prep_graph_B, self.infos, self.csvs, num_cpus=self.nproc)
    print('Preparing P', file=sys.stdout)
    Ps = p_map(HINProcess._prep_graph_P, self.infos, self.csvs, num_cpus=self.nproc)
    return Bs, Ps

def create_states_map_with_id(colors_replacement_dict, provinces_image, water_color, font_name):
    state_pixels = defaultdict(list)
    water_color = (water_color[0], water_color[1], water_color[2])
    pixels = provinces_image.load()
    print("Coloring pixels...")
    for i, j in tqdm(itertools.product(range(provinces_image.size[0]), range(provinces_image.size[1])),
                     total=provinces_image.size[0] * provinces_image.size[1]):
        if pixels[i, j] in colors_replacement_dict:
            res = colors_replacement_dict[pixels[i, j]]
            pixels[i, j] = res[0]
            state_pixels[res[1]].append((i, j))
        else:
            pixels[i, j] = water_color

    draw = ImageDraw.Draw(provinces_image)
    try:
        font = ImageFont.truetype(font_name, 10)
    except:
        print("Font " + font_name + " not found, using system default. This probably won't look good.")
        font = ImageFont.load_default()

    size = provinces_image.size
    for key, value in state_pixels.items():
        state_pixels[key] = (value, font.getsize(str(key)))

    print("Generating ID positions...")
    positions = p_tqdm.p_map(find_id_position, list(state_pixels.items()), size)
    for pos, state in positions:
        draw.text(pos, str(state), fill="black", font=font)

def remove_intra_class_duplicates(folder):
    # find intra-class image paths
    image_pathes = find_pathes(folder)
    # compute hashes
    hashes = compute_hashes(image_pathes)
    # compute similarities
    sims = compute_similarity(hashes)
    # find duplicates
    duplicates = find_duplicates(similarity=sims, image_pathes=image_pathes)
    # remove duplicates
    p_map(os.remove, duplicates)

def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    # NOTE: p_map spawns its own worker processes, so this explicit Pool is not
    # what runs func; it is created and torn down without being used.
    pool = Pool(num_of_processes)
    data = pd.concat(p_map(func, data_split))
    pool.close()
    pool.join()
    return data

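# A usage sketch for the parallelize() helper above: apply a per-chunk function
# to a DataFrame split across processes. The DataFrame and add_totals function
# below are hypothetical illustrations.
def add_totals(chunk):
    # operate on one chunk of rows and return it with a derived column
    chunk = chunk.copy()
    chunk["total"] = chunk["price"] * chunk["quantity"]
    return chunk

df = pd.DataFrame({"price": [1.0, 2.0, 3.0], "quantity": [4, 5, 6]})
df = parallelize(df, add_totals, num_of_processes=2)
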
def get_camera_stats_per_file(parallel=False, plot=False):
    if os.path.exists(POSE_STATS_PATH):
        params = load_from_pickle(POSE_STATS_PATH)
    else:
        dataset = SUNRGBDWorld(192, 240, 'all', return_semantics=False)

        def get_single_element(i):
            elem = dataset.__getitem__(i)
            simplified_path = elem['path'].replace(dataset.base_path + '/SUNRGBD/', '')
            return [simplified_path, elem['params']]

        if parallel:
            from p_tqdm import p_map
            params = p_map(get_single_element, list(range(len(dataset))), num_cpus=32)
        else:
            params = []
            for i in tqdm(range(len(dataset))):
                params.append(get_single_element(i))
        params = dict(params)
        dump_to_pickle(POSE_STATS_PATH, params)

    all_params = np.concatenate([k[None, :] for k in params.values()])
    if plot:
        visdom_histogram(all_params[:, 0], title='SUNRGBD_fov_x_deg')
        visdom_histogram(all_params[:, 1], title='SUNRGBD_height')
        visdom_histogram(all_params[:, 2], title='SUNRGBD_pitch_deg')
        visdom_histogram(all_params[:, 3], title='SUNRGBD_roll_deg')
    return params

def calculate_all(self):
    """
    The top level.
    """
    print("Green's function Calculation started.")
    npole = len(self.contour.path)
    if self.np == 1:
        results = map(self.get_AijR_rhoR, tqdm(self.contour.path, total=npole))
    else:
        # pool = ProcessPool(nodes=self.np)
        # results = pool.map(self.get_AijR_rhoR, self.contour.path)
        results = p_map(self.get_AijR_rhoR, self.contour.path, num_cpus=self.np)
    for i, result in enumerate(results):
        rup, rdn, Jorb_list, JJ_list = result
        self.rho_up_list.append(rup)
        self.rho_dn_list.append(rdn)
        for iR, R in enumerate(self.R_ijatom_dict):
            for (iatom, jatom) in self.R_ijatom_dict[R]:
                key = (R, iatom, jatom)
                self.Jorb_list[key].append(Jorb_list[key])
                self.JJ_list[key].append(JJ_list[key])
    if self.np > 1:
        pass
        # pool.close()
        # pool.join()
        # pool.clear()
    self.integrate()
    self.get_rho_atom()
    self.A_to_Jtensor()

def parallel_solve(self,
                   instances,
                   n_jobs=4,
                   label="Solve",
                   collect_training_data=True,
                   ):
    self.internal_solver = None
    SOLVER[0] = self
    INSTANCES[0] = instances
    p_map_results = p_map(_parallel_solve, list(range(len(instances))), num_cpus=n_jobs, desc=label)
    results = [p["Results"] for p in p_map_results]
    for (idx, r) in enumerate(p_map_results):
        instances[idx].solution = r["Solution"]
        instances[idx].lp_solution = r["LP solution"]
        instances[idx].lp_value = r["LP value"]
        instances[idx].lower_bound = r["Lower bound"]
        instances[idx].upper_bound = r["Upper bound"]
        instances[idx].found_violations = r["Violations"]
    return results

def parallel_download_images(output_dir, download_data, num_cpus=10, max_num_images=None):
    # iterate over data and attempt downloads in a parallel manner
    download_data_items = download_data.items()
    image_name = list(download_data.keys())
    image_data = list(download_data.values())
    if max_num_images is not None:
        image_name = image_name[:max_num_images]
        image_data = image_data[:max_num_images]
    print("Potentially downloading {} images.".format(len(image_name)))
    image_url = [i_d["url"] for i_d in image_data]
    image_filename = [os.path.join(output_dir, i_n) for i_n in image_name]

    success_count = 0
    success = p_map(download_image, image_filename, image_url, num_cpus=num_cpus)
    downloaded_data = {}
    failed_data = {}
    for i, s in enumerate(success):
        if s:
            downloaded_data[image_name[i]] = image_data[i]
            success_count += 1
        else:
            failed_data[image_name[i]] = image_data[i]
    print("Downloaded {} images.".format(success_count))
    return downloaded_data, failed_data

def parse(self, n_jobs=12):
    if n_jobs > len(self.DOIs):
        n_jobs = len(self.DOIs)
    # with Pool(processes=n_jobs) as pool:
    #     data = list(tqdm(
    #         pool.imap(
    #             MedBioRxivScraper.parse_article,
    #             self.DOIs,
    #             chunksize=len(self.DOIs)//n_jobs
    #         ),
    #         total=len(self.DOIs)
    #     ))
    data = p_map(MedBioRxivScraper.parse_article, self.DOIs, num_cpus=n_jobs)
    self.data = pd.DataFrame(data, columns=[
        'authors', 'affiliations', 'title', 'pub_date', 'abstract', 'doi'
    ])
    self.data.pub_date = pd.to_datetime(self.data.pub_date)

def scrape_reviews(url):
    totalReviewers = []
    totalRatings = []
    totalReviewDescriptions = []
    totalReviewTitles = []

    totalPages, pageTitle, totalReviews = extractTotalPages(url)

    print(f"[scrape-amazon] - {pageTitle}")
    print(f"[scrape-amazon] Total Pages - {totalPages}")
    print(f"[scrape-amazon] Total Reviews - {totalReviews}\n")

    urlsToFetch = []
    for page in range(1, totalPages + 1):
        urlToFetch = url + f"?pageNumber={page}"
        urlsToFetch.append(urlToFetch)

    results = p_map(extractPage, urlsToFetch)

    # merge the per-page dictionaries of lists into a single dictionary
    # (loop variable renamed from `list` to avoid shadowing the builtin)
    res = {}
    for k in results:
        for key in k:
            if key in res:
                res[key] += k[key]
            else:
                res[key] = k[key]

    productReviewsData = pd.DataFrame()

    # Adding Information
    productReviewsData["Reviewer"] = res['reviewers']
    productReviewsData["Rating"] = res['ratings']
    productReviewsData["Title"] = res['reviewTitles']
    productReviewsData["Description"] = res['reviewDescriptions']
    # productReviewsData["link"] = url
    # productReviewsData["Product Title"] = pageTitle

    return productReviewsData

def find_rho_all_parallel(self, averages, x_start=random.rand(16)):
    # NOTE: the default for x_start is evaluated once at definition time and is
    # currently unused in the body below.
    # with Pool(4) as p:
    #     self.rho_tomo = []
    find_rho_avg = partial(Tomography.find_rho, self, averages)
    # for item in tqdm(p.imap(find_rho_avg, range(len(self._2q_rotations)))):
    #     self.rho_tomo.append(item)
    self.rho_tomo = p_map(find_rho_avg, range(len(self._2q_rotations)), num_cpus=20)

def get_annotation_path_list(self, multiprocess=False, sort=True):
    annot_path_list = [(root, file_name)
                       for root, dirs, files in tqdm(os.walk(self.config['dataset_root']),
                                                     desc="Searching annotation .json files ...")
                       if len(files) > 1
                       for file_name in files
                       if file_name.endswith("json")]
    print("Annotation list: {}".format(len(annot_path_list)))

    seperator = "\\" if platform.system().find("Windows") >= 0 else "/"
    file_checker = lambda x: x if (
        os.path.isfile(f"{x[0]}{seperator}{x[1]}")
        and (os.path.isfile(f"{x[0]}{seperator}{x[1][:-5]}.JPG")
             or os.path.isfile(f"{x[0]}{seperator}{x[1][:-5]}.jpg"))
    ) else None

    if multiprocess:
        annot_path_list = p_map(file_checker, annot_path_list, desc="Double-Check File List ...")
        annot_path_list = [x for x in annot_path_list if x is not None]
    else:
        annot_path_list = [
            (root, file_name)
            for root, file_name in tqdm(annot_path_list, desc="Double-Check File List ...")
            if os.path.isfile(f"{root}{seperator}{file_name}")
            and (os.path.isfile(f"{root}{seperator}{file_name[:-5]}.JPG")
                 or os.path.isfile(f"{root}{seperator}{file_name[:-5]}.jpg"))
        ]

    if sort:
        annot_path_list.sort()
    print("Annotation list: {}".format(len(annot_path_list)))
    return annot_path_list

def _process(self, embeddings_path):
    global prior_data
    prior_data = collect_prior_data(self.metadata["output_dir"])
    # print("collected prior data", len(prior_data))

    global metadata
    metadata = self.metadata

    global global_analyzer
    global_analyzer = self.ldt_analyzer

    filename = self.get_fname_for_embedding(embeddings_path)
    neighbor_file_path = os.path.join(
        self.output_dir.replace("neighbors_annotated", "neighbors"),
        filename + ".tsv")
    print("\nAnnotating " + neighbor_file_path)
    self.metadata["out_path"] = os.path.join(self.output_dir, filename + ".tsv")

    input_df = pd.read_csv(neighbor_file_path, header=0, sep="\t")
    self.metadata["total_pairs"] += len(input_df)
    dicts = input_df.to_dict(orient="records")

    if metadata["multiprocessing"] == 1:
        print("\nMultiprocessing: 1 core")
        newdicts = []
        for d in tqdm(dicts):
            newdicts.append(_process_one_dict(d))
            # newdicts.append(self._process_one_dict_meth(d))
        dicts = newdicts
        # dicts = [_process_one_dict(x) for x in dicts]
        # self.save_results(dicts)
    else:
        print("\nMultiprocessing:", metadata["multiprocessing"], "cores")
        # python multiprocessing library
        # pool = Pool(metadata["multiprocessing"], initializer=initializer(global_analyzer))
        # dicts = pool.map(_process_one_dict, dicts)
        # pathos.multiprocessing
        # pool = ProcessingPool(nodes=metadata["multiprocessing"])
        # dicts = pool.map(_process_one_dict, dicts)
        # try with method
        # t_dicts = []
        # for d in dicts:
        #     t_dicts.append((d,))
        # pool = ProcessingPool(nodes=metadata["multiprocessing"])
        # dicts = pool.map(self._process_one_dict_meth, dicts)
        dicts = p_map(_process_one_dict, dicts, num_cpus=metadata["multiprocessing"])
        # self.save_results(dicts)
        # pool = MyPool(metadata["multiprocessing"])
        # dicts = pool.map(_process_one_dict, dicts)
        # pool.close()
        # pool.join()

    dicts = self.add_distr_data(dicts)
    self.save_results(dicts, overwrite=True)

def process_center_availabilities_once(center_data_links, notify=False):
    proc_units = min((cpu_count() - 1), len(center_data_links))
    res = None
    # NOTE: p_map manages its own worker processes; this Pool context is effectively unused.
    with Pool(proc_units):
        res = p_map(retrieve_center_data, center_data_links)

    centers_with_slots = 0
    total_slots_found = 0
    for r in res:
        if len(r) > 0 and int(r[0]) > 0:
            centers_with_slots += 1
            total_slots_found += int(r[0])
            print("ALERT:")
            print("    {} slots available at {} ({})".format(r[0], r[2], r[1]))
            print("    -> {}".format(r[3]))
            if args.auto_browse == "Brave":
                options = Options()
                options.binary_location = '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'
                driver_path = '/usr/local/bin/chromedriver'
                drvr = webdriver.Chrome(options=options, executable_path=driver_path)
                drvr.get(r[3])

    if args.notify:
        os_notify(title='Chronoslots scraper alert',
                  subtitle='{} slots found on {} centers'.format(total_slots_found, centers_with_slots),
                  message='Check your terminal to click on links!')

def eval_predictions(pred_dir, anno_dir, width=30, unofficial=True, sequential=False):
    """Evaluates the predictions in pred_dir and returns CULane's metrics (precision, recall, F1 and its components)"""
    print(f'Loading annotation data ({anno_dir})...')
    annotations, label_paths = load_labels(anno_dir)
    print(f'Loading prediction data ({pred_dir})...')
    predictions = load_prediction_list(label_paths, pred_dir)
    print('Calculating metric {}...'.format('sequentially' if sequential else 'in parallel'))
    if sequential:
        results = t_map(partial(culane_metric, width=width, unofficial=unofficial, img_shape=LLAMAS_IMG_RES),
                        predictions, annotations)
    else:
        results = p_map(partial(culane_metric, width=width, unofficial=unofficial, img_shape=LLAMAS_IMG_RES),
                        predictions, annotations)
    total_tp = sum(tp for tp, _, _ in results)
    total_fp = sum(fp for _, fp, _ in results)
    total_fn = sum(fn for _, _, fn in results)
    if total_tp == 0:
        precision = 0
        recall = 0
        f1 = 0
    else:
        precision = float(total_tp) / (total_tp + total_fp)
        recall = float(total_tp) / (total_tp + total_fn)
        f1 = 2 * precision * recall / (precision + recall)

    return {'TP': total_tp, 'FP': total_fp, 'FN': total_fn, 'Precision': precision, 'Recall': recall, 'F1': f1}

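# A hedged usage sketch for eval_predictions() above. The directories are
# hypothetical placeholders for LLAMAS-style prediction and annotation folders.
metrics = eval_predictions(
    pred_dir="./predictions",   # hypothetical
    anno_dir="./labels/valid",  # hypothetical
    width=30,
    unofficial=True,
    sequential=False,
)
print(metrics["Precision"], metrics["Recall"], metrics["F1"])
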
def resample(input_path, output_path, sample_rate, num_workers=1,
             audio_extensions=['.wav', '.mp3', '.aac']):
    """
    Resamples a folder of audio files into a copy of the same folder with the
    same structure but with every audio file replaced with a resampled version
    of that audio file. Relative paths to the audio file from the root of the
    folder will be the same.

    Args:
        input_path (str): Root of folder where all audio files will be resampled.
        output_path (str): Root of folder where all resampled files will be placed.
            Will match the same structure as the input_path folder structure.
        sample_rate (int): Sample rate to resample files to.
        num_workers (int, optional): How many workers to use in parallel to
            resample files. Defaults to 1.
        audio_extensions (list, optional): Audio extensions to look for in the
            input_path. Matching ones will be resampled and placed in the
            output_path at the same relative location.
            Defaults to ['.wav', '.mp3', '.aac'].
    """
    try:
        shutil.copytree(input_path, output_path, ignore=ig_f)
    except:
        pass

    input_audio_files = []
    for ext in audio_extensions:
        input_audio_files += glob.glob(f"{input_path}/**/*{ext}", recursive=True)
    output_audio_files = [x.replace(input_path, output_path) for x in input_audio_files]

    indices = list(range(len(input_audio_files)))
    args = [
        [input_audio_files[i] for i in indices],
        [output_audio_files[i][:-4] + '.wav' for i in indices],
        sample_rate,
        False
    ]
    p_tqdm.p_map(resample_audio_file, *args, num_cpus=num_workers)

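# A minimal usage sketch for resample() above. The folder names and sample rate
# are hypothetical examples.
resample(
    input_path="./audio_raw",   # hypothetical source folder
    output_path="./audio_16k",  # hypothetical destination folder
    sample_rate=16000,
    num_workers=4,
)
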
def __init__(self, csvs, out_dir, nproc=4):
    self.csvs = csvs
    self.out_dir = out_dir
    self.nproc = nproc
    self.packages = [os.path.basename(csv)[:-4] for csv in csvs]
    print('Processing CSVs')
    self.infos = p_map(HINProcess.csv_proc, csvs, num_cpus=nproc)
    self.prep_ids()

def parallel_delete(foldername, max_level_to_parallelize, workers=50):
    # finds folders recursively up to n levels
    def single_delete(folder):
        try:
            rmtree(folder)
        except:
            pass

    for actual_level in range(max_level_to_parallelize, -1, -1):
        print('Listing at level: {}'.format(actual_level))
        level_files = glob.glob(foldername + '/' + '*/' * actual_level)
        # level_dirs = filter(os.path.isdir, level_filter)
        print('{} files found at level: {}'.format(len(level_files), actual_level))
        print('Starting parallel delete at level: {}'.format(actual_level))
        p_map(single_delete, level_files, num_cpus=workers)
        print('End parallel delete at level: {}'.format(actual_level))

def createMandelbrotSet():
    real = np.linspace(minReal, maxReal, resolution, endpoint=False)
    result = p_map(isRowInMandelbrot, real.tolist())
    # np.bool was deprecated and later removed from NumPy; the builtin bool is equivalent here.
    result = np.array(result, dtype=bool)
    return result

def do_work(file_paths):
    """
    Run extract_info over all file paths in parallel.

    :param file_paths: Iterable of file paths to process.
    :return: List of per-file extraction results.
    """
    result = p_map(extract_info, file_paths)
    return result

def get_unique_standardized_smiles(path: str, standardize: bool) -> Set[str]:
    with open(path) as f:
        smiles = [row['smiles'] for row in csv.DictReader(f)]
    if standardize:
        smiles = p_map(standardize_smiles, smiles)
    smiles = set(smiles)
    return smiles