def get_stats():
    files = get_files()
    total_size_bytes = sum(map(lambda x: os.path.getsize(x), files))

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(total=total_size_bytes, dynamic_ncols=True,
                            unit="byte", unit_scale=1)

    # Collect per-file document counts and sizes with the pool
    tasks = [(get_file_stats, (file,)) for file in files]

    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results)

    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)
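
# A minimal sketch of what a worker such as get_file_stats might look like
# (hypothetical; the real implementation is not shown here). The pool appends
# tqdm_func and global_tqdm to each task's stored arguments, and get_stats()
# above expects every result to be a (document_count, size_bytes) pair.
def get_file_stats(file_path, tqdm_func, global_tqdm):
    size_bytes = os.path.getsize(file_path)
    document_count = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        for _ in handle:              # assumption: one document per line
            document_count += 1
    global_tqdm.update(size_bytes)    # advance the byte-based overall bar
    return (document_count, size_bytes)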
def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)

    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", "*.bkt.txt.sorted"))
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(total=len(bucket_file_paths), dynamic_ncols=True,
                           unit="file")
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path,
                os.path.join(output_directory, "info.json"))
def main(process_count, batch_directory):
    # Ensure LSH object containing cassandra connection info exists
    lsh_pickle_path = os.path.join(batch_directory, "lsh.pkl")
    if not os.path.exists(lsh_pickle_path):
        logger.info("Getting cassandra minhash lsh")
        lsh = get_minhash_lsh_cassandra()
        timed_pickle_dump(lsh, lsh_pickle_path, "lsh")

    files = glob.glob(os.path.join(batch_directory, "batch*.pkl"), recursive=True)

    pool = TqdmMultiProcessPool()
    tasks = []

    document_count_path = os.path.join(batch_directory, "document_count.pkl")
    total_documents = pickle.load(open(document_count_path, "rb"))

    for batch_file in files:
        arguments = (batch_file, lsh_pickle_path)
        task = (minhash_lsh_dedupe_cassandra, arguments)
        tasks.append(task)

    on_done = lambda _: logger.info("done")
    on_error = lambda _: logger.info("error")
    with tqdm.tqdm(total=total_documents, dynamic_ncols=True) as progress:
        result = pool.map(process_count, progress, tasks, on_error, on_done)
        logger.info(result)
def generate_minhashes(scrape_directory, process_count):
    files = glob.glob(os.path.join(scrape_directory, "**/*.minscored"), recursive=True)

    total_file_size = reduce(add, map(os.path.getsize, files))
    logger.info(f"Total File Size: {(total_file_size / million):.2f} MB")

    # [(file_name1, [doc0_minhash, doc1_minhash, ...]), (file_name2, [...]), ...]
    with tqdm.tqdm(total=total_file_size, dynamic_ncols=True, unit_scale=1) as progress:
        pool = TqdmMultiProcessPool()
        tasks = []
        for file_path in files:
            task = (process_file, (file_path,))
            tasks.append(task)

        on_done = lambda _: None
        on_error = on_done
        result = pool.map(process_count, progress, tasks, on_error, on_done)

    return result
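
# A minimal sketch of what a worker such as process_file might look like
# (hypothetical; the real implementation is not shown here). It must return
# the (file_name, [minhash, ...]) pair described in the comment above and it
# receives tqdm_func / global_tqdm from the pool. The document reader and the
# MinHash parameters are assumptions.
from datasketch import MinHash

def process_file(file_path, tqdm_func, global_tqdm):
    minhashes = []
    for document in read_documents(file_path):       # hypothetical reader for .minscored files
        minhash = MinHash(num_perm=128)               # assumed permutation count
        for token in document.split():
            minhash.update(token.encode("utf-8"))
        minhashes.append(minhash)
    global_tqdm.update(os.path.getsize(file_path))    # the overall bar is sized in bytes
    return (file_path, minhashes)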
def run_model(num, objective_name, tqdm_func, global_tqdm):
    seed = random.randrange(10000000)
    np.random.seed(seed)
    global num_evaluation
    model = BayesianOptimization(total=num_evaluation, objective_name=objective_name,
                                 progress_bar=global_tqdm, run=num + 1)
    _, Y_best = model.fit()
    return Y_best[:num_evaluation]


process_count = 6
pool = TqdmMultiProcessPool(process_count)
initial_tasks = [(run_model, (i, objective_name)) for i in range(num_run)]
total_iterations = num_run * num_evaluation

start_time = datetime.now()
with tqdm.tqdm(total=total_iterations, dynamic_ncols=True) as global_progress:
    # BayesianOptimization_Y_best = []
    # for i in range(num_run):
    #     BayesianOptimization_Y_best.append(run_model(i, lambda x: x, global_progress))
    BayesianOptimization_Y_best = pool.map(global_progress, initial_tasks,
                                           error_callback, done_callback)
time_of_execution = datetime.now() - start_time
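
# error_callback and done_callback are not defined in this snippet; the pool
# calls them with each task's outcome. A minimal sketch, assuming that
# reporting the outcome is all that is wanted (mirroring the on_done/on_error
# lambdas used in the other examples):
def error_callback(result):
    print(f"task failed: {result}")

def done_callback(result):
    pass  # per-iteration progress is already reported through global_progress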
def main(
    files: str,
    level: str,
    interaction_threshold: float,
    pvalue_threshold: float,
    ncpus: int,
    output: str,
):
    output_path = pathlib.Path(output)
    assert output_path.exists()
    base_dir, glob_pattern = files.split("*", 1)
    glob_pattern = "*" + glob_pattern
    base_path = pathlib.Path(base_dir)
    file_paths = [
        f for f in base_path.glob(glob_pattern)
        if level in f.parent.parent.stem.split("-")
    ]

    # Multiprocessing setup
    pool = TqdmMultiProcessPool(ncpus)
    tasks = [(parse_data, (file_path, interaction_threshold, pvalue_threshold))
             for file_path in file_paths]
    task_count = len(tasks)

    print("Step1: Compiling the X and Y matrices from networks")
    step_1_2_pickle = pathlib.Path("step_1_2.pkl")
    if step_1_2_pickle.exists():
        with open(step_1_2_pickle, "rb") as fid:
            x_df = pickle.load(fid)
            y_df = pickle.load(fid)
    else:
        with tqdm.tqdm(total=task_count, dynamic_ncols=True) as global_progress:
            global_progress.set_description("global")
            # Step1: For each file note down all the `HEADER` categories and get edge list
            # Step2: Compile the header values and edge lists using join
            results = pool.map(global_progress, tasks, error_callback, done_callback)
        print("Step2a: Creating the dataframes using join")
        x_all, y_all = zip(*results)
        x_data = [item for item in x_all if item is not None]
        y_data = [item for item in y_all if item is not None]
        x_df: pd.DataFrame = pd.DataFrame(x_data).set_index("hash")
        y_df = pd.concat(y_data, axis=0, join="outer")
        assert x_df.shape[0] == y_df.shape[0]
        print("Step2b: Saving the data")
        x_df.to_csv(output_path / "x.csv", index=True, sep=",")
        y_df.to_csv(output_path / "y.csv", index=True, sep=",")
        with open(step_1_2_pickle, "wb") as fid:
            pickle.dump(x_df, fid)
            pickle.dump(y_df, fid)
    print(x_df.head())
    print(y_df.head())

    # Step3: Perform PCA on the edge matrix (Y) and then transform Y to PCA coordinates
    print("Step 3: Performing PCA on Y")
    step_3_pickle = pathlib.Path("step_3.pkl")
    if step_3_pickle.exists():
        with open(step_3_pickle, "rb") as fid:
            y_reduced = pickle.load(fid)
            y_reduced2 = pickle.load(fid)
            pca = pickle.load(fid)
    else:
        y_df.fillna(0.0, inplace=True)
        # TODO: Set variance to be 0.95 using n_components=0.95
        pca = PCA()
        pca2 = PCA(n_components=2)
        tsne = TSNE(n_components=2)
        pca.fit(y_df)
        pca2.fit(y_df)
        y_reduced = pd.DataFrame(pca.transform(y_df), index=y_df.index)
        y_reduced2 = pd.DataFrame(pca2.transform(y_df), index=y_df.index)
        y_reduced_tsne = pd.DataFrame(tsne.fit_transform(y_df), index=y_df.index)
        y_reduced.to_csv(output_path / "y_reduced.csv", index=True, sep=",")
        y_reduced2.to_csv(output_path / "y_reduced2.csv", index=True, sep=",")
        y_reduced_tsne.to_csv(output_path / "y_reduced_tsne.csv", index=True, sep=",")
        with open(step_3_pickle, "wb") as fid:
            pickle.dump(y_reduced, fid)
            pickle.dump(y_reduced2, fid)
            pickle.dump(pca, fid)
    print(y_reduced.head())

    # Step4: Perform ANOVA and normalize by total variance
    print("Step 4: Performing ANOVA and calculating total variance")
    step_4_pickle = pathlib.Path("step_4.pkl")
    if step_4_pickle.exists():
        with open(step_4_pickle, "rb") as fid:
            variance_list = pickle.load(fid)
            percentage_variance = pickle.load(fid)
    else:
        anova_dict = perform_anova(x_df, y_reduced)
        variance_list = normalize_anova(anova_dict, pca)
        total_variance = sum(variance_list)
        percentage_variance = total_variance / total_variance.sum() * 100
        percentage_variance.to_csv(output_path / "percentage_variance.csv",
                                   index=True, sep=",")
        with open(step_4_pickle, "wb") as fid:
            pickle.dump(variance_list, fid)
            pickle.dump(percentage_variance, fid)
    print(percentage_variance)
def scrape_urls(urls_directory, scrapes_directory, process_count, request_timeout):
    # Get Total URL count (saved during sqlite extraction)
    url_count_path = os.path.join(urls_directory, "url_count.json")
    total_url_count = json.load(open(url_count_path, "r"))

    # Overall progress bar
    progress = tqdm.tqdm(total=total_url_count, dynamic_ncols=True)
    progress.set_description("Total URLs")

    url_files = glob.glob(os.path.join(urls_directory, "urls_*.jsonl.zst"))
    for url_file_path in url_files:
        # Skip if previously done
        done_file_path = url_file_path + ".done"
        if os.path.exists(done_file_path):
            batch_url_count = json.load(open(done_file_path, "r"))
            progress.update(batch_url_count)
            logger.info(
                f"'{os.path.basename(url_file_path)}' already scraped, skipping."
            )
            continue

        logger.info(f"Scraping URLs from '{os.path.basename(url_file_path)}'.")

        reader = Reader()
        url_data = []
        for url, reddit_meta in reader.read_jsonl(url_file_path, get_meta=True):
            url_data.append((url, reddit_meta))

        timer = Timer().start()

        batch_progress = tqdm.tqdm(total=len(url_data), dynamic_ncols=True)
        batch_progress.set_description(f"{os.path.basename(url_file_path)}")

        # Download and Process With Pool
        pool = TqdmMultiProcessPool()
        tasks = []
        for url_entry in url_data:
            arguments = (url_entry, request_timeout, newspaper_scraper, False)
            task = (download, arguments)
            tasks.append(task)

        # on_done advances the overall bar once per scraped URL, so no extra
        # batch-level progress.update is needed afterwards.
        on_done = lambda _: progress.update()
        on_error = lambda _: None
        results = pool.map(process_count, batch_progress, tasks, on_error, on_done)

        logger.info("Archiving chunk with lm_dataformat...")
        # urls_*.jsonl.zst -> scrapes_*.jsonl.zst
        output_archive_name = os.path.basename(url_file_path).replace("urls", "scrapes")
        output_archive_path = os.path.join(scrapes_directory, output_archive_name)
        archiver = Archive(output_archive_path)

        batch_error_count = 0
        for text, meta, status in results:
            if not status:
                batch_error_count += 1
            else:
                archiver.add_data(text, meta)
        archiver.commit()

        error_percentage = batch_error_count / len(url_data) * 100
        logger.info(
            f"Errors: {batch_error_count} / {len(url_data)} ({error_percentage:0.2f}%)"
        )
        logger.info(f"Batch time: {timer.stop():0.2f} seconds")

        batch_progress.close()
        json.dump(len(url_data), open(done_file_path, "w"))

    progress.close()
    logger.info("Done!")
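
# A minimal sketch of a worker compatible with the tasks built above
# (hypothetical; the real download helper is not shown). The pool appends
# tqdm_func and global_tqdm to the stored arguments, and the loop over
# `results` expects each task to return a (text, meta, status) tuple.
def download(url_entry, timeout, scraper, memoize, tqdm_func, global_tqdm):
    url, reddit_meta = url_entry
    try:
        # Assumption: the scraper callable takes a URL and a timeout and
        # returns (text, meta) for a successfully fetched page; memoize is
        # accepted only to match the stored task arguments.
        text, meta = scraper(url, timeout)
        meta["reddit_meta"] = reddit_meta
        status = True
    except Exception:
        text, meta, status = None, {}, False
    global_tqdm.update()  # one URL finished in this batch
    return text, meta, status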