def pickle_kd_tree(dataset_filename, pmc_pkl_filename, kdtree_pkl_filename):
    """
    Main function. Times were measured on an "e2-highmem-4" Google Compute
    Engine instance (4 vCPUs, 32 GB memory).
    """
    # Read paper_dataset file: ~3 minutes
    updater_log(f"Reading {dataset_filename} ...")
    df = pd.read_csv(dataset_filename, sep="\t").set_index("document")

    # Create pmc_map.pkl: less than 1 sec
    pmc_list = df.index.tolist()
    pmc_map = {row_num: pmc for row_num, pmc in enumerate(pmc_list)}
    updater_log("Pickle pmc_map ...")
    with open(pmc_pkl_filename, "wb") as fh:
        pickle.dump(pmc_map, fh)

    # Create KNN paper_model: ~8 minutes (total memory usage: ~30 GB)
    updater_log("Start KNN fitting")
    paper_model = KNeighborsClassifier(n_neighbors=10)
    paper_model.fit(df.drop("journal", axis=1), df.journal)

    # Pickle kd-tree: ~1.5 minutes
    updater_log("Pickle kd_tree component ...")
    kd_tree = paper_model._tree
    with open(kdtree_pkl_filename, "wb") as fh:
        pickle.dump(kd_tree, fh)
    updater_log("kd_tree pickled")

    # Set output files read-only
    set_read_only(pmc_pkl_filename)
    set_read_only(kdtree_pkl_filename)
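
# Illustration only: a minimal sketch of how the two pickles produced above
# might be consumed at query time. It assumes the pickled `kd_tree` behaves
# like sklearn's KDTree (i.e. exposes `query`) and that `embedding` is a
# (1, 300) numpy array; the function name and arguments are hypothetical.
def query_pickled_tree(pmc_pkl_filename, kdtree_pkl_filename, embedding, k=10):
    with open(pmc_pkl_filename, "rb") as fh:
        pmc_map = pickle.load(fh)   # tree row number -> PMC ID
    with open(kdtree_pkl_filename, "rb") as fh:
        kd_tree = pickle.load(fh)
    # Find the k nearest training papers and map row indices back to PMC IDs
    distances, indices = kd_tree.query(embedding, k=k)
    return [pmc_map[int(idx)] for idx in indices[0]], distances[0]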
def process_tarball(
    tarball_filename,
    prev_pmc_list_filename,
    word_model_vector_filename,
    new_pmc_list_filename,
    new_embeddings_filename,
    new_token_counts_filename,
):
    """
    Search for new papers in an input tarball file and save the new papers'
    data on disk.
    """
    updater_log(f"Processing '{tarball_filename}' ...")

    # Load the word model vectors from the input pickle file
    with open(word_model_vector_filename, "rb") as wv_fh:
        word_model_wv = pickle.load(wv_fh)

    # Read previously processed PMC IDs into a set
    prev_pmc_list_df = pd.read_csv(prev_pmc_list_filename, sep="\t")
    prev_pmc_ids = set()
    for pmc_path in prev_pmc_list_df.file_path.tolist():
        pmc_id = Path(pmc_path).stem
        prev_pmc_ids.add(pmc_id)

    tarball_basename = Path(tarball_filename).name
    with tarfile.open(tarball_filename, "r:gz") as tar_fh:
        # Write header lines into three output files
        with open(new_pmc_list_filename, 'w', newline='') as pmc_list_fh, \
             open(new_embeddings_filename, 'w', newline='') as embeddings_fh, \
             open(new_token_counts_filename, 'w', newline='') as token_counts_fh:
            pmc_list_writer = csv.DictWriter(
                pmc_list_fh,
                delimiter="\t",
                fieldnames=["tarfile", "file_path"])
            pmc_list_writer.writeheader()

            embeddings_writer = csv.DictWriter(
                embeddings_fh,
                delimiter="\t",
                fieldnames=["journal", "document"] + [f"feat_{idx}" for idx in range(300)],
            )
            embeddings_writer.writeheader()

            token_counts_writer = csv.DictWriter(
                token_counts_fh,
                delimiter="\t",
                fieldnames=["document", "lemma", "count"])
            token_counts_writer.writeheader()

            write_data(word_model_wv, prev_pmc_ids, tarball_basename, tar_fh,
                       pmc_list_writer, embeddings_writer, token_counts_writer)

    # Set output files read-only
    set_read_only(new_pmc_list_filename)
    set_read_only(new_embeddings_filename)
    set_read_only(new_token_counts_filename)

    updater_log(f"'{tarball_filename}' is done")
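
# Illustration only: the ID-matching convention above (tar member path ->
# Path(...).stem -> PMC ID) is what allows previously processed papers to be
# skipped. A minimal sketch of that idea, independent of the project's
# write_data() helper (the function name below is hypothetical):
def list_new_members(tar_fh, prev_pmc_ids):
    """Yield tar members whose PMC ID has not been processed before."""
    for member in tar_fh.getmembers():
        if member.isfile() and Path(member.name).stem not in prev_pmc_ids:
            yield member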
def ftp_download(url, destination):
    """
    Download a single file that's hosted on an FTP server.
    Code is based on: https://stackoverflow.com/questions/11768214/
    """
    updater_log(f"Downloading {url} ...")
    with closing(request.urlopen(url)) as resp, open(destination, 'wb') as fh:
        shutil.copyfileobj(resp, fh)
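
# Illustration only: shutil.copyfileobj streams the response straight to disk,
# so large tarballs never need to fit in memory. A hypothetical standalone call
# (the URL and destination path below are placeholders, not the locations this
# pipeline actually uses):
def _example_download():
    ftp_download(
        "ftp://ftp.example.org/pub/some_bundle.tar.gz",
        "data/input/some_bundle.tar.gz",
    )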
def generate_vector_counts(word_model_wv, paper_fh):
    """
    Parse a paper file (paper_fh) based on the word model vectors (word_model_wv).
    """
    tree = ET.parse(paper_fh, parser=parser)

    # Process xml without specified tags
    ET.strip_tags(tree, *filter_tags)
    root = tree.getroot()

    # Skip non-research papers
    if root.attrib['article-type'].strip() != 'research-article':
        return [], None

    all_text = root.xpath(xpath)  # a list of 'lxml.etree._Element' instances
    all_text = list(map(lambda x: "".join(list(x.itertext())), all_text))
    # all_text[idx].itertext() returns an 'lxml.etree.ElementTextIterator' instance;
    # list(x.itertext()) returns a list of strings (including '\n');
    # "".join(...) combines that list of strings into a single string;
    # map(...) returns an iterable with one string per entry in all_text;
    # list(map(...)) converts that iterable into a list of strings.

    # Combine all the strings into ONE single string.
    all_text = " ".join(all_text)

    # Optimization: remove stop words from `all_text` before feeding it to nlp.
    # This not only speeds up the data processing by 5%-10%, but also
    # minimizes memory usage.
    all_text = [x for x in all_text.split() if x not in stop_words]
    all_text = " ".join(all_text)

    # Set nlp.max_length dynamically
    if nlp.max_length < len(all_text):
        nlp.max_length = len(all_text)
        updater_log(f"set nlp.max_length to: {nlp.max_length}")

    all_tokens = list(
        map(
            lambda x: x.lemma_,
            filter(
                lambda tok: tok.lemma_ in word_model_wv
                and tok.lemma_ not in stop_words,
                nlp(all_text),
            )))

    # Skip wonky papers that have fewer than 20 tokens
    if len(all_tokens) < 20:
        return [], None

    word_vectors = [word_model_wv[tok] for tok in all_tokens]
    return np.stack(word_vectors).mean(axis=0), Counter(all_tokens)
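
# Illustration only: the document embedding returned above is simply the
# element-wise mean of the word vectors of all kept lemmas. A toy example with
# a fake 3-dimensional word-vector mapping (real vectors have 300 dimensions):
def _toy_mean_embedding():
    toy_wv = {
        "cell": np.array([1.0, 0.0, 2.0]),
        "genome": np.array([3.0, 2.0, 0.0]),
    }
    tokens = ["cell", "genome", "cell"]
    word_vectors = [toy_wv[tok] for tok in tokens]
    # Stack into a (3, 3) matrix and average over tokens (axis=0):
    # -> array([1.667, 0.667, 1.333]), Counter({'cell': 2, 'genome': 1})
    return np.stack(word_vectors).mean(axis=0), Counter(tokens)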
def merge_files(
    prev_pmc_list_filename,
    prev_embeddings_filename,
    prev_token_counts_filename,
    new_pmc_list_filename,
    new_embeddings_filename,
    new_token_counts_filename,
    merged_pmc_list_filename,
    merged_embeddings_filename,
    merged_token_counts_filename
):
    """
    Merge the new papers' data files with the data files from the previous run.
    """
    updater_log("Merging pmc_list files ...")
    simple_merge(
        [prev_pmc_list_filename, new_pmc_list_filename],
        merged_pmc_list_filename
    )

    updater_log("Merging embeddings files ...")
    simple_merge(
        [prev_embeddings_filename, new_embeddings_filename],
        merged_embeddings_filename
    )

    updater_log("Merging token_counts files ...")
    simple_merge(
        [prev_token_counts_filename, new_token_counts_filename],
        merged_token_counts_filename
    )

    updater_log("Finished merging files")
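
# Illustration only: simple_merge() is defined elsewhere in this module. A
# minimal sketch of a header-aware TSV concatenation, under the assumption
# that every input file shares the same single header row (the function name
# and exact behavior here are hypothetical, not the project's implementation):
def concat_tsv_files(input_filenames, output_filename):
    with open(output_filename, "w", newline="") as out_fh:
        for file_idx, input_filename in enumerate(input_filenames):
            with open(input_filename, newline="") as in_fh:
                for line_idx, line in enumerate(in_fh):
                    # Keep the header row only from the first input file
                    if line_idx == 0 and file_idx > 0:
                        continue
                    out_fh.write(line)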
def update_paper_bins_stats(pmc_tsne_filename, embeddings_filename,
                            token_counts_filename, pca_axes_filename,
                            tmp_json_filename, final_json_filename,
                            cutoff_score=20, debug=False):
    """
    This function performs all the updates necessary for the frontend to work.
    It cycles through each bin and first calculates the cosine similarity
    between the bin and each PC. Next it calculates the odds ratio for each
    bin given the background. Lastly, it writes all calculated statistics into
    a JSON file for the front end to use.

    Parameters:
    * pmc_tsne_filename: name of the file that maps each paper to the bin ID
      it is assigned to;
    * embeddings_filename: name of the file that contains papers and their
      respective embeddings;
    * token_counts_filename: name of the file that contains global token counts;
    * pca_axes_filename: name of the file that contains the PC axes;
    * tmp_json_filename: input intermediate JSON filename;
    * final_json_filename: final output JSON filename (for the front end);
    * cutoff_score: int; threshold below which tokens are removed;
    * debug: bool; if true, a bin_counters pickle file will be saved too.
    """
    # Read the input paper landscape file and create a map from paper ID to bin ID
    all_paper_bins = pd.read_csv(pmc_tsne_filename, sep="\t")
    pmc_bin_mapper = dict(
        zip(all_paper_bins["document"].tolist(),
            all_paper_bins["squarebin_id"].tolist()))

    # Read embeddings into a dictionary
    updater_log(f"Reading {embeddings_filename} (8~9 minutes) ...")
    bin_centroid = get_bin_centroid(embeddings_filename, pmc_bin_mapper)

    # Read the global token counts file to get word counts in each bin
    bin_counts = [
        Counter() for bin_id in all_paper_bins["squarebin_id"].unique()
    ]

    updater_log(f"Reading {token_counts_filename} (~2.5 hours) ...")
    with open(token_counts_filename, newline='') as ifh:
        count_reader = csv.DictReader(ifh, delimiter="\t")
        for line in count_reader:
            token_count_entry = {line["lemma"]: int(line["count"])}
            bin_id = pmc_bin_mapper[line["document"]]
            bin_counts[bin_id].update(token_count_entry)

    # Filter each bin in `bin_counts` and calculate `total_counts` as the sum
    # of all bins in `bin_counts`.
    updater_log("Filtering bins and getting the total counts ...")
    total_counts = Counter()
    for bin_id, bin_data in enumerate(bin_counts):
        # Filter out low-count tokens to speed this function up
        filtered_bin_data = {
            lemma: counts
            for lemma, counts in bin_data.items()
            if counts > cutoff_score
        }

        if len(filtered_bin_data) > 0:
            bin_data = filtered_bin_data

        bin_counts[bin_id] = bin_data
        total_counts += bin_data

    # Get the sum of background word counts
    total_sum = sum(total_counts.values())

    # Grab the PCA axes
    pca_axes_df = pd.read_csv(pca_axes_filename, sep="\t")

    # Process all bins
    updater_log("Processing all square bins ...")
    bin_stat_records = list()
    output_dir = Path(final_json_filename).parent
    for bin_id, bin_data in enumerate(bin_counts):
        bin_result = process_bin(bin_id, bin_data, bin_centroid[bin_id],
                                 total_counts, total_sum, pca_axes_df,
                                 output_dir, debug=debug)
        bin_stat_records.append(bin_result)

    # Update the JSON file and write it to disk
    square_plot_df = pd.read_json(tmp_json_filename)
    square_plot_df.merge(
        pd.DataFrame.from_records(bin_stat_records),
        on="bin_id").reset_index(drop=True).to_json(
            final_json_filename, orient="records", lines=False)

    # Set the final output JSON file read-only
    set_read_only(final_json_filename)
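
# Illustration only: process_bin() (defined elsewhere) computes, among other
# statistics, an odds ratio per token for a bin against the background. A
# minimal sketch of that statistic with add-one smoothing; the project's exact
# smoothing and implementation may differ, and this function is hypothetical.
def token_odds_ratio(token, bin_counter, bin_total,
                     background_counter, background_total):
    in_bin = bin_counter.get(token, 0) + 1
    out_bin = (bin_total - bin_counter.get(token, 0)) + 1
    in_background = background_counter.get(token, 0) + 1
    out_background = (background_total - background_counter.get(token, 0)) + 1
    return (in_bin / out_bin) / (in_background / out_background)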
# Test harness
if __name__ == "__main__":
    data_dir = './data/current_run/output/'

    # Input files
    pmc_tsne_filename = data_dir + 'pmc_tsne_square.tsv'
    embeddings_filename = data_dir + 'embeddings_full.tsv'
    token_counts_filename = data_dir + 'global_token_counts.tsv'
    pca_axes_filename = './data/static/pca_components.tsv'
    tmp_json_filename = data_dir + 'pmc_plot_tmp.json'

    # Output file
    final_json_filename = data_dir + 'pmc_plot_final.json'

    updater_log("Start ...")
    update_paper_bins_stats(pmc_tsne_filename, embeddings_filename,
                            token_counts_filename, pca_axes_filename,
                            tmp_json_filename, final_json_filename,
                            debug=True)
import sys
from pathlib import Path

from bin_stats_updater import update_paper_bins_stats
from downloader import download_xml_files
from json_minimizer import minimize_json
from journal_centroid import generate_journal_centroid
from kd_tree_creator import pickle_kd_tree
from merger import merge_files
from paper_parser import parse_new_papers
from saucie_coordinates import generate_saucie_coordinates
from utils import updater_log


# Main program
if __name__ == "__main__":
    updater_log("Python auto-updater pipeline started")

    # Get the name of the directory where this script is located
    parent_dir = Path(__file__).resolve().parent

    # Static data dir
    static_data_dir = Path(parent_dir, 'data', 'static')

    # Static data files that will be read by each updater run
    pca_axes_filename = Path(static_data_dir, 'pca_components.tsv')
    word_model_vector_filename = Path(static_data_dir, 'word_model.wv.pkl')

    # Input/output data directory for the current run
    current_data_dir = Path(parent_dir, 'data', 'current_run')

    # Input dir