import os
from functools import reduce

import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool


def get_stats():
    files = get_files()
    total_size_bytes = sum(map(os.path.getsize, files))

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(total=total_size_bytes,
                            dynamic_ncols=True,
                            unit="byte",
                            unit_scale=1)

    # Collect per-file stats with the pool; each task is (function, argument_tuple)
    tasks = [(get_file_stats, (file, )) for file in files]

    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results)

    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)
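
get_files() and get_file_stats() are project helpers that are not shown on this page. As a rough sketch only: the reduce above expects each task result to be a (document_count, size) pair, and the pool passes its progress helpers to the worker as two trailing arguments (compare run_model further down this page), so a compatible worker could look roughly like the following; the one-document-per-line assumption is purely illustrative.

# Illustrative sketch only; the real get_file_stats lives in the source project.
def get_file_stats(file_path, tqdm_func, global_tqdm):
    size_bytes = os.path.getsize(file_path)
    document_count = 0
    with open(file_path, "rb") as handle:
        for line in handle:
            document_count += 1            # assumes one document per line
            global_tqdm.update(len(line))  # advance the byte-based overall bar
    return document_count, size_bytes      # (count, size) pair expected by get_stats()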
Example 2
def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)
    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", "*.bkt.txt.sorted"))
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory,
                               bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(total=len(bucket_file_paths),
                           dynamic_ncols=True,
                           unit="file")
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path,
                os.path.join(output_directory, "info.json"))
Example 3
def main(process_count, batch_directory):

    # Ensure LSH object containing cassandra connection info exists
    lsh_pickle_path = os.path.join(batch_directory, "lsh.pkl")
    if not os.path.exists(lsh_pickle_path):
        logger.info("Getting cassandra minhash lsh")
        lsh = get_minhash_lsh_cassandra()
        timed_pickle_dump(lsh, lsh_pickle_path, "lsh")

    files = glob.glob(os.path.join(batch_directory, "batch*.pkl"),
                      recursive=True)

    pool = TqdmMultiProcessPool()
    tasks = []

    document_count_path = os.path.join(batch_directory, "document_count.pkl")
    total_documents = pickle.load(open(document_count_path, "rb"))

    for batch_file in files:
        arguments = (batch_file, lsh_pickle_path)
        task = (minhash_lsh_dedupe_cassandra, arguments)
        tasks.append(task)

    on_done = lambda _: logger.info("done")
    on_error = lambda _: logger.info("error")
    with tqdm.tqdm(total=total_documents, dynamic_ncols=True) as progress:
        result = pool.map(process_count, progress, tasks, on_error, on_done)
        logger.info(result)
Example 4
def generate_minhashes(scrape_directory, process_count):
    files = glob.glob(os.path.join(scrape_directory, "**/*.minscored"),
                      recursive=True)
    total_file_size = reduce(add, map(os.path.getsize, files))
    logger.info(f"Total File Size: {(total_file_size / million):.2f} MB")

    # [(file_name1, [doc0_minhash, doc1_minhash, ...]), (file_name2, [....]), ....]
    with tqdm.tqdm(total=total_file_size, dynamic_ncols=True,
                   unit_scale=1) as progress:
        pool = TqdmMultiProcessPool()
        tasks = []
        for file_path in files:
            task = (process_file, (file_path, ))
            tasks.append(task)

        on_done = lambda _: None
        on_error = on_done
        result = pool.map(process_count, progress, tasks, on_error, on_done)

    return result
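
Per the structure noted in the comment above, each entry of the returned result pairs a file name with its list of per-document minhashes. A purely illustrative way to consume it (the arguments below are placeholders, and any None entries from failed tasks are skipped):

result = generate_minhashes("/path/to/scrapes", process_count=4)  # hypothetical call
minhashes_by_file = dict(r for r in result if r is not None)      # file name -> [minhashes]
total_documents = sum(len(hashes) for hashes in minhashes_by_file.values())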
Example 5
def run_model(num, objective_name, tqdm_func, global_tqdm):
    seed = random.randrange(10000000)
    np.random.seed(seed)
    global num_evaluation
    model = BayesianOptimization(total=num_evaluation,
                                 objective_name=objective_name,
                                 progress_bar=global_tqdm,
                                 run=num + 1)
    _, Y_best = model.fit()
    return Y_best[:num_evaluation]


process_count = 6
pool = TqdmMultiProcessPool(process_count)

initial_tasks = [(run_model, (i, objective_name)) for i in range(num_run)]
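# Each task tuple carries only (i, objective_name); the pool supplies run_model's
# trailing tqdm_func and global_tqdm arguments when it executes the task.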

total_iterations = num_run * num_evaluation

start_time = datetime.now()

with tqdm.tqdm(total=total_iterations, dynamic_ncols=True) as global_progress:
    # BayesianOptimization_Y_best = []
    # for i in range(num_run):
    #     BayesianOptimization_Y_best.append(run_model(i, lambda x: x, global_progress))
    BayesianOptimization_Y_best = pool.map(global_progress, initial_tasks,
                                           error_callback, done_callback)

time_of_execution = datetime.now() - start_time
Example 6
def main(
    files: str,
    level: str,
    interaction_threshold: float,
    pvalue_threshold: float,
    ncpus: int,
    output: str,
):
    output_path = pathlib.Path(output)
    assert output_path.exists()

    base_dir, glob_pattern = files.split("*", 1)
    glob_pattern = "*" + glob_pattern
    base_path = pathlib.Path(base_dir)
    file_paths = [
        f for f in base_path.glob(glob_pattern)
        if level in f.parent.parent.stem.split("-")
    ]

    # Multiprocessing setup
    pool = TqdmMultiProcessPool(ncpus)
    tasks = [(parse_data, (file_path, interaction_threshold, pvalue_threshold))
             for file_path in file_paths]
    task_count = len(tasks)

    print("Step1: Compiling the X and Y matrices from networks")
    step_1_2_pickle = pathlib.Path("step_1_2.pkl")
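    # Steps 1-2 are checkpointed: reuse the pickle from a previous run if it
    # exists, otherwise run the pool over all input files and save the result.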
    if step_1_2_pickle.exists():
        with open(step_1_2_pickle, "rb") as fid:
            x_df = pickle.load(fid)
            y_df = pickle.load(fid)
    else:
        with tqdm.tqdm(total=task_count,
                       dynamic_ncols=True) as global_progress:
            global_progress.set_description("global")
            # Step1: For each file note down all the `HEADER` categories and get edge list
            # Step2: Compile the header values and edge lists using join
            results = pool.map(global_progress, tasks, error_callback,
                               done_callback)
        print("Step2a: Creating the dataframes using join")
        x_all, y_all = zip(*results)
        x_data = [item for item in x_all if item is not None]
        y_data = [item for item in y_all if item is not None]
        x_df: pd.DataFrame = pd.DataFrame(x_data).set_index("hash")
        y_df = pd.concat(y_data, axis=0, join="outer")
        assert x_df.shape[0] == y_df.shape[0]
        print("Step2b: Saving the data")
        x_df.to_csv(output_path / "x.csv", index=True, sep=",")
        y_df.to_csv(output_path / "y.csv", index=True, sep=",")
        with open(step_1_2_pickle, "wb") as fid:
            pickle.dump(x_df, fid)
            pickle.dump(y_df, fid)

    print(x_df.head())
    print(y_df.head())

    # Step3: Perform PCA on the edge matrix (Y) and then transform Y to PCA coordinates
    print("Step 3: Performing PCA on Y")
    step_3_pickle = pathlib.Path("step_3.pkl")
    if step_3_pickle.exists():
        with open(step_3_pickle, "rb") as fid:
            y_reduced = pickle.load(fid)
            y_reduced2 = pickle.load(fid)
            pca = pickle.load(fid)
    else:
        y_df.fillna(0.0, inplace=True)
        # TODO: Set variance to be 0.95 using n_components=0.95
        pca = PCA()
        pca2 = PCA(n_components=2)
        tsne = TSNE(n_components=2)
        pca.fit(y_df)
        pca2.fit(y_df)
        y_reduced = pd.DataFrame(pca.transform(y_df), index=y_df.index)
        y_reduced2 = pd.DataFrame(pca2.transform(y_df), index=y_df.index)
        y_reduced_tsne = pd.DataFrame(tsne.fit_transform(y_df),
                                      index=y_df.index)
        y_reduced.to_csv(output_path / "y_reduced.csv", index=True, sep=",")
        y_reduced2.to_csv(output_path / "y_reduced2.csv", index=True, sep=",")
        y_reduced_tsne.to_csv(output_path / "y_reduced_tsne.csv",
                              index=True,
                              sep=",")
        with open(step_3_pickle, "wb") as fid:
            pickle.dump(y_reduced, fid)
            pickle.dump(y_reduced2, fid)
            pickle.dump(pca, fid)

    print(y_reduced.head())

    # Step4: Perform ANOVA and normalize by total variance
    print("Step 4: Performing ANOVA and calculating total variance")
    step_4_pickle = pathlib.Path("step_4.pkl")
    if step_4_pickle.exists():
        with open(step_4_pickle, "rb") as fid:
            variance_list = pickle.load(fid)
            percentage_variance = pickle.load(fid)
    else:
        anova_dict = perform_anova(x_df, y_reduced)
        variance_list = normalize_anova(anova_dict, pca)
        total_variance = sum(variance_list)
        percentage_variance = total_variance / total_variance.sum() * 100
        percentage_variance.to_csv(output_path / "percentage_variance.csv",
                                   index=True,
                                   sep=",")
        with open(step_4_pickle, "wb") as fid:
            pickle.dump(variance_list, fid)
            pickle.dump(percentage_variance, fid)

    print(percentage_variance)
Example 7
def scrape_urls(urls_directory, scrapes_directory, process_count,
                request_timeout):

    # Get Total URL count (saved during sqlite extraction)
    url_count_path = os.path.join(urls_directory, "url_count.json")
    total_url_count = json.load(open(url_count_path, "r"))

    # overall progress bar
    progress = tqdm.tqdm(total=total_url_count, dynamic_ncols=True)
    progress.set_description("Total URLs")

    url_files = glob.glob(os.path.join(urls_directory, "urls_*.jsonl.zst"))

    for url_file_path in url_files:
        # Skip if previously done
        done_file_path = url_file_path + ".done"
        if os.path.exists(done_file_path):
            batch_url_count = json.load(open(done_file_path, "r"))
            progress.update(batch_url_count)
            logger.info(
                f"'{os.path.basename(url_file_path)}' already scraped, skipping."
            )
            continue

        logger.info(f"Scraping URLs from '{os.path.basename(url_file_path)}'.")

        reader = Reader()
        url_data = []
        for url, reddit_meta in reader.read_jsonl(url_file_path,
                                                  get_meta=True):
            url_data.append((url, reddit_meta))

        timer = Timer().start()

        batch_progress = tqdm.tqdm(total=len(url_data), dynamic_ncols=True)
        batch_progress.set_description(f"{os.path.basename(url_file_path)}")

        # Download and Process With Pool
        pool = TqdmMultiProcessPool()
        tasks = []
        for url_entry in url_data:
            arguments = (url_entry, request_timeout, newspaper_scraper, False)
            task = (download, arguments)
            tasks.append(task)

        on_done = lambda _: None  # the overall bar is advanced once per batch below
        on_error = lambda _: None
        results = pool.map(process_count, batch_progress, tasks, on_error,
                           on_done)

        logger.info("Archiving chunk with lm_dataformat...")
        # urls_*.jsonl.zst -> scrapes_*.jsonl.zst
        output_archive_name = os.path.basename(url_file_path).replace(
            "urls", "scrapes")
        output_archive_path = os.path.join(scrapes_directory,
                                           output_archive_name)
        archiver = Archive(output_archive_path)
        batch_error_count = 0
        for text, meta, status in results:
            if not status:
                batch_error_count += 1
            else:
                archiver.add_data(text, meta)
        archiver.commit()

        error_percentage = batch_error_count / len(url_data) * 100
        logger.info(
            f"Errors: {batch_error_count} / {len(url_data)} ({error_percentage:0.2f}%)"
        )
        logger.info(f"Batch time: {timer.stop():0.2f} seconds")

        progress.update(len(url_data))
        batch_progress.close()

        json.dump(len(url_data), open(done_file_path, "w"))

    progress.close()
    logger.info("Done!")