Example #1
def pickle_kd_tree(dataset_filename, pmc_pkl_filename, kdtree_pkl_filename):
    """
    Main function.  Times were measured on a "e2-highmem-4" Google
    Compute Engine instance (4 vCPUs, 32 GB memory).
    """

    # Read paper_dataset file: ~3 minutes
    updater_log(f"Reading {dataset_filename} ...")
    df = pd.read_csv(dataset_filename, sep="\t").set_index("document")

    # Create pmc_map.pkl: less than 1 sec
    pmc_list = df.index.tolist()
    pmc_map = dict(enumerate(pmc_list))  # {row_number: PMC ID}
    updater_log("Pickle pmc_map ...")
    with open(pmc_pkl_filename, "wb") as fh:
        pickle.dump(pmc_map, fh)

    # Fit KNN paper_model: ~8 minutes (total memory usage: ~30 GB).
    # Features are the embedding columns; the "journal" column is the label.
    updater_log("Start KNN fitting")
    paper_model = KNeighborsClassifier(n_neighbors=10)
    paper_model.fit(df.drop("journal", axis=1), df.journal)

    # Pickle kd-tree: 1.5 minutes
    updater_log("Pickle kd_tree component ...")
    kd_tree = paper_model._tree
    with open(kdtree_pkl_filename, "wb") as fh:
        pickle.dump(kd_tree, fh)

    updater_log("kd_tree pickled")

    # Set output files read-only
    set_read_only(pmc_pkl_filename)
    set_read_only(kdtree_pkl_filename)
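
For context, here is a minimal sketch of how the two pickled artifacts might be consumed later (the file names and the placeholder query vector are assumptions, not part of this listing): the kd-tree answers nearest-neighbor queries by row index, and pmc_map translates row indices back to PMC IDs.

import pickle
import numpy as np

# Hypothetical file names; the real ones are whatever was passed to pickle_kd_tree().
with open("pmc_map.pkl", "rb") as fh:
    pmc_map = pickle.load(fh)          # {row_number: PMC ID}
with open("kd_tree.pkl", "rb") as fh:
    kd_tree = pickle.load(fh)          # the pickled sklearn tree

# Query the 10 nearest papers for one 300-dimensional document embedding.
query_vector = np.zeros((1, 300))      # placeholder embedding
distances, indices = kd_tree.query(query_vector, k=10)
nearest_pmc_ids = [pmc_map[int(idx)] for idx in indices[0]]
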
def process_tarball(
    tarball_filename,
    prev_pmc_list_filename,
    word_model_vector_filename,
    new_pmc_list_filename,
    new_embeddings_filename,
    new_token_counts_filename,
):
    """
    Search new papers in an input tarball file, and save the new papers
    data on disk.
    """
    updater_log(f"Processing '{tarball_filename}' ...")

    # Load word model vectors from the input pickle file
    with open(word_model_vector_filename, "rb") as fh:
        word_model_wv = pickle.load(fh)

    # Read previously processed PMC IDs into a set
    prev_pmc_list_df = pd.read_csv(prev_pmc_list_filename, sep="\t")
    prev_pmc_ids = set()
    for pmc_path in prev_pmc_list_df.file_path.tolist():
        pmc_id = Path(pmc_path).stem
        prev_pmc_ids.add(pmc_id)

    tarball_basename = Path(tarball_filename).name
    with tarfile.open(tarball_filename, "r:gz") as tar_fh:
        # Write header lines into three output files
        with open(new_pmc_list_filename, 'w', newline='') as pmc_list_fh, \
                open(new_embeddings_filename, 'w', newline='') as embeddings_fh, \
                open(new_token_counts_filename, 'w', newline='') as token_counts_fh:
            pmc_list_writer = csv.DictWriter(
                pmc_list_fh,
                delimiter="\t",
                fieldnames=["tarfile", "file_path"])
            pmc_list_writer.writeheader()

            embeddings_writer = csv.DictWriter(
                embeddings_fh,
                delimiter="\t",
                fieldnames=["journal", "document"] +
                [f"feat_{idx}" for idx in range(300)],
            )
            embeddings_writer.writeheader()

            token_counts_writer = csv.DictWriter(
                token_counts_fh,
                delimiter="\t",
                fieldnames=["document", "lemma", "count"])
            token_counts_writer.writeheader()

            write_data(word_model_wv, prev_pmc_ids, tarball_basename, tar_fh,
                       pmc_list_writer, embeddings_writer, token_counts_writer)

    # Set output files read-only
    set_read_only(new_pmc_list_filename)
    set_read_only(new_embeddings_filename)
    set_read_only(new_token_counts_filename)

    updater_log(f"'{tarball_filename}' is done")
def ftp_download(url, destination):
    """
    Download a single file that's hosted on an FTP server.
    Code is based on: https://stackoverflow.com/questions/11768214/
    """

    updater_log(f"Downloading {url} ...")
    with closing(request.urlopen(url)) as resp, open(destination, 'wb') as fh:
        shutil.copyfileobj(resp, fh)
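
FTP transfers of multi-gigabyte archives can fail midway. A small retry wrapper like the sketch below (not part of the original code; the helper name and defaults are made up) is one way to harden the call:

import time

def ftp_download_with_retries(url, destination, retries=3, wait_seconds=30):
    """Retry ftp_download() a few times before giving up (illustrative helper)."""
    for attempt in range(1, retries + 1):
        try:
            ftp_download(url, destination)
            return
        except OSError as exc:
            updater_log(f"Download attempt {attempt}/{retries} failed: {exc}")
            if attempt == retries:
                raise
            time.sleep(wait_seconds)
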
def generate_vector_counts(word_model_wv, paper_fh):
    """
    Parse a paper file (paper_fh) based on word model vector (word_model_wv).
    """

    tree = ET.parse(paper_fh, parser=parser)

    # Strip the filtered tags from the XML tree (their text is preserved)
    ET.strip_tags(tree, *filter_tags)
    root = tree.getroot()

    # Skip non-research papers
    if root.attrib['article-type'].strip() != 'research-article':
        return [], None

    all_text = root.xpath(xpath)  # a list of 'lxml.etree._Element' instances

    # For each matched element, itertext() yields its text fragments
    # (including '\n'); join them into one string per element.
    all_text = ["".join(element.itertext()) for element in all_text]

    # Combine the per-element strings into one single string.
    all_text = " ".join(all_text)

    # Optimization: remove stop words from `all_text` before feeding it to nlp.
    # This speeds up the data processing by 5%-10% and also reduces memory usage.
    all_text = " ".join(x for x in all_text.split() if x not in stop_words)

    # Set nlp.max_length dynamically
    if nlp.max_length < len(all_text):
        nlp.max_length = len(all_text)
        updater_log(f"set nlp.max_length to: {nlp.max_length}")

    all_tokens = [
        tok.lemma_
        for tok in nlp(all_text)
        if tok.lemma_ in word_model_wv and tok.lemma_ not in stop_words
    ]

    # Skip wonky papers that have fewer than 20 tokens
    if len(all_tokens) < 20:
        return [], None

    word_vectors = [word_model_wv[tok] for tok in all_tokens]

    return np.stack(word_vectors).mean(axis=0), Counter(all_tokens)
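
A usage sketch for this function (the paths below are placeholders; in the pipeline, paper_fh comes from tar_fh.extractfile() and word_model_wv is loaded once per tarball, as in process_tarball() above):

import pickle

with open("word_model.wv.pkl", "rb") as fh:      # placeholder path
    word_model_wv = pickle.load(fh)

with open("PMC0000000.nxml", "rb") as paper_fh:  # placeholder article file
    embedding, token_counts = generate_vector_counts(word_model_wv, paper_fh)

if token_counts is not None:
    print(embedding.shape)              # (300,) mean word vector
    print(token_counts.most_common(5))  # five most frequent lemmas
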
Example #5
def merge_files(
        prev_pmc_list_filename,
        prev_embeddings_filename,
        prev_token_counts_filename,
        new_pmc_list_filename,
        new_embeddings_filename,
        new_token_counts_filename,
        merged_pmc_list_filename,
        merged_embeddings_filename,
        merged_token_counts_filename
):
    """
    Merge new papers data files with the data files in previous run.
    """

    updater_log("Merging pmc_list files ...")

    simple_merge(
        [prev_pmc_list_filename, new_pmc_list_filename],
        merged_pmc_list_filename
    )

    updater_log("Merging embeddings files ...")

    simple_merge(
        [prev_embeddings_filename, new_embeddings_filename],
        merged_embeddings_filename
    )

    updater_log("Merging token_counts files ...")

    simple_merge(
        [prev_token_counts_filename, new_token_counts_filename],
        merged_token_counts_filename
    )

    updater_log("Finished merging files")
Example #6
def update_paper_bins_stats(pmc_tsne_filename,
                            embeddings_filename,
                            token_counts_filename,
                            pca_axes_filename,
                            tmp_json_filename,
                            final_json_filename,
                            cutoff_score=20,
                            debug=False):
    """
    This function performs all the updates necessary for the frontend to
    work. It cycles through each bin and first calculates the cosine
    similarity between the bin and each PC.  Next it calculates the odds
    ratio for each bin given the background.  Lastly, it writes out all
    calculated statistics into a json file for the front end to use.

    Parameters:
      * pmc_tsne_filename: name of file that maps each paper to the bin ID
        it is assigned to;
      * embeddings_filename: name of file that contains papers and their
        respective embeddings;
      * token_counts_filename: name of file that contains global token
        counters;
      * pca_axes_filename: name of file that contains the PC axes;
      * tmp_json_filename: input intermediate json filename;
      * final_json_filename: final output json filename (for front end);
      * cutoff_score: int; threshold to remove tokens;
      * debug: bool; if true, bin_counters pickle file will be saved too.
    """

    # Read input paper landscape file and create a map from paper ID to bin_id
    all_paper_bins = pd.read_csv(pmc_tsne_filename, sep="\t")
    pmc_bin_mapper = dict(
        zip(all_paper_bins["document"].tolist(),
            all_paper_bins["squarebin_id"].tolist()))

    # Read embeddings into dictionary
    updater_log(f"Reading {embeddings_filename} (8~9 minutes) ...")
    bin_centroid = get_bin_centroid(embeddings_filename, pmc_bin_mapper)

    # Prepare one Counter per bin; they accumulate the per-bin word counts
    # read from the global token counts file below.  (squarebin_id values are
    # assumed to be 0-based and contiguous, so they can serve as list indices.)
    bin_counts = [
        Counter() for _ in all_paper_bins["squarebin_id"].unique()
    ]

    updater_log(f"Reading {token_counts_filename} (~2.5 hours) ...")
    with open(token_counts_filename, newline='') as ifh:
        count_reader = csv.DictReader(ifh, delimiter="\t")
        for line in count_reader:
            token_count_entry = {line["lemma"]: int(line["count"])}
            bin_id = pmc_bin_mapper[line["document"]]
            bin_counts[bin_id].update(token_count_entry)

    # Filter each bin in `bin_counts` and calculate `total_counts` by getting
    # the sum of all bins in `bin_counts`.
    updater_log(f"Filtering bins and getting the total counts ...")
    total_counts = Counter()
    for bin_id, bin_data in enumerate(bin_counts):
        # Filter out low count tokens to speed function up
        filtered_bin_data = {
            lemma: counts
            for lemma, counts in bin_data.items() if counts > cutoff_score
        }

        # Keep the unfiltered counts if every token fell below the cutoff
        if len(filtered_bin_data) > 0:
            bin_data = filtered_bin_data
            bin_counts[bin_id] = bin_data

        total_counts += bin_data

    # Get sum of background word counts
    total_sum = sum(total_counts.values())

    # Grab the PCA axes
    pca_axes_df = pd.read_csv(pca_axes_filename, sep="\t")

    # Process all bins
    updater_log("Processing all square bins ...")
    bin_stat_records = list()
    output_dir = Path(final_json_filename).parent
    for bin_id, bin_data in enumerate(bin_counts):
        bin_result = process_bin(bin_id,
                                 bin_data,
                                 bin_centroid[bin_id],
                                 total_counts,
                                 total_sum,
                                 pca_axes_df,
                                 output_dir,
                                 debug=debug)
        bin_stat_records.append(bin_result)

    # Update JSON file and write it to disk
    square_plot_df = pd.read_json(tmp_json_filename)

    square_plot_df.merge(pd.DataFrame.from_records(bin_stat_records),
                         on="bin_id").reset_index(drop=True).to_json(
                             final_json_filename,
                             orient="records",
                             lines=False)

    # Set final output json file read-only
    set_read_only(final_json_filename)
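
process_bin() is not shown in this snippet. The odds-ratio step mentioned in the docstring could look roughly like the sketch below, which compares each lemma's odds inside a bin against its odds in the global background; the helper name and the log2 scale are assumptions, not the original code.

import math

def bin_log_odds_ratios(bin_data, total_counts, total_sum):
    """Log2 odds ratio of each lemma in one bin vs. the background (sketch)."""
    bin_sum = sum(bin_data.values())
    log_odds = {}
    for lemma, count in bin_data.items():
        bin_odds = count / max(bin_sum - count, 1)
        background_count = total_counts[lemma]
        background_odds = background_count / max(total_sum - background_count, 1)
        log_odds[lemma] = math.log2(bin_odds / background_odds)
    return log_odds
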
Example #7


# Test harness
if __name__ == "__main__":
    data_dir = './data/current_run/output/'

    # Input files
    pmc_tsne_filename = data_dir + 'pmc_tsne_square.tsv'
    embeddings_filename = data_dir + 'embeddings_full.tsv'
    token_counts_filename = data_dir + 'global_token_counts.tsv'
    pca_axes_filename = './data/static/pca_components.tsv'
    tmp_json_filename = data_dir + 'pmc_plot_tmp.json'

    # Output file
    final_json_filename = data_dir + 'pmc_plot_final.json'

    updater_log("Start ...")

    update_paper_bins_stats(pmc_tsne_filename,
                            embeddings_filename,
                            token_counts_filename,
                            pca_axes_filename,
                            tmp_json_filename,
                            final_json_filename,
                            debug=True)
Example #8
import sys
from pathlib import Path

from bin_stats_updater import update_paper_bins_stats
from downloader import download_xml_files
from json_minimizer import minimize_json
from journal_centroid import generate_journal_centroid
from kd_tree_creator import pickle_kd_tree
from merger import merge_files
from paper_parser import parse_new_papers
from saucie_coordinates import generate_saucie_coordinates
from utils import updater_log

# Main program
if __name__ == "__main__":
    updater_log("Python auto-updater pipeline started")

    # Get the directory that this script is located in:
    parent_dir = Path(__file__).resolve().parent

    # Static data dir
    static_data_dir = Path(parent_dir, 'data', 'static')

    # Static data files that will be read by each updater run
    pca_axes_filename = Path(static_data_dir, 'pca_components.tsv')
    word_model_vector_filename = Path(static_data_dir, 'word_model.wv.pkl')

    # Input/output data directory for current run
    current_data_dir = Path(parent_dir, 'data', 'current_run')

    # Input dir