def compute_similarity_matrix_ngram_parallel(
    *,
    repr_vocab,
    full_vocab,
    processes,
    n,
    ngram_to_index,
) -> np.ndarray:
    """Compute the full/repr n-gram similarity matrix using multiple processes.

    :param repr_vocab: The representative vocabulary
    :param full_vocab: The full vocabulary
    :param processes: The number of worker processes to use
    :param n: The n-gram length
    :param ngram_to_index: A mapping from each n-gram to its index
    :return: A similarity matrix of shape (len(full_vocab), len(repr_vocab))
    """
    from ratvec.similarity import n_gram_sim_list

    secho(f"Splitting data for computing similarities in {processes} processes")
    elements = get_ngram_elements(
        full_vocab=full_vocab,
        repr_vocab=repr_vocab,
        ngram_to_index=ngram_to_index,
        n=n,
    )
    compute_similarities_on_splits = partial(n_gram_sim_list, n_ngram=n)
    return _calculate_similarity_matrix_parallel(
        full_vocab=full_vocab,
        repr_vocab=repr_vocab,
        processes=processes,
        elements=elements,
        compute_similarities_on_splits=compute_similarities_on_splits,
    )
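# Hypothetical usage sketch (not part of the module): the vocabularies, alphabet,
# and bigram index below are made up, only to illustrate the expected return shape
# of |full_vocab| x |repr_vocab|.
#
#   alphabet = {"A", "B", " "}
#   ngram_to_index = {
#       "".join(t): i
#       for i, t in enumerate(itt.product(alphabet, repeat=2))
#   }
#   sim = compute_similarity_matrix_ngram_parallel(
#       repr_vocab=["AB", "BA"],
#       full_vocab=["AAB", "ABB", "BAB"],
#       processes=2,
#       n=2,
#       ngram_to_index=ngram_to_index,
#   )
#   assert sim.shape == (3, 2)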
def _run_evaluation(
    *,
    y,
    save_dataset,
    family_labels,
    n_components,
    n_iterations,
    max_neighbors,
    pool,
    subdirectory,
) -> None:
    kpca = os.path.join(subdirectory, 'kpca.npy')
    secho(f'Loading embeddings file: {kpca}')
    x = np.load(kpca)

    balanced_datasets, counts = make_balanced(x, y)

    if save_dataset:
        family_labels_balanced_path = os.path.join(subdirectory, family_labels.name + "_balanced")
        with open(family_labels_balanced_path, "wb") as file:
            pickle.dump((balanced_datasets, counts), file)

    _sub_run_evaluation(
        balanced_datasets=balanced_datasets,
        counts=counts,
        n_components=n_components,
        n_iterations=n_iterations,
        max_neighbors=max_neighbors,
        pool=pool,
        subdirectory=subdirectory,
    )
def main(
    family_labels,
    directory,
    n_components: int,
    max_neighbors: int,
    n_iterations: int,
    no_save_dataset: bool,
    load_dataset: bool,
) -> None:
    """Evaluate KPCA embeddings."""
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        if load_dataset:
            click.echo('Loading balanced datasets')
            subdirectory = os.path.dirname(family_labels.name)
            with open(family_labels.name + "_balanced", "rb") as file:
                balanced_datasets, counts = pickle.load(file)
            _sub_run_evaluation(
                balanced_datasets=balanced_datasets,
                counts=counts,
                n_components=n_components,
                n_iterations=n_iterations,
                max_neighbors=max_neighbors,
                pool=pool,
                subdirectory=subdirectory,
            )
        else:
            secho(f'Loading family labels file: {family_labels}')
            y = np.array([
                line[:-1]
                for line in family_labels
            ])

            optim_dir = os.path.join(directory, 'optim')
            os.makedirs(optim_dir, exist_ok=True)
            secho(f'Dynamically generating balanced datasets from {optim_dir}')
            for subdirectory_name in os.listdir(optim_dir):
                subdirectory = os.path.join(optim_dir, subdirectory_name)
                if not os.path.isdir(subdirectory):
                    continue
                secho(f'Handling {subdirectory}')
                _run_evaluation(
                    y=y,
                    save_dataset=(not no_save_dataset),
                    family_labels=family_labels,
                    n_components=n_components,
                    n_iterations=n_iterations,
                    max_neighbors=max_neighbors,
                    pool=pool,
                    subdirectory=subdirectory,
                )

    secho(f"done. Enjoy your {make_ratvec(3)}")
def infer(
    full_sim_matrix_file: str,
    repr_sim_matrix_file: str,
    output: str,
    n_components: int,
    sim: str,
    use_gpu: bool,
):
    """Load pre-computed similarity matrices and optimize the KPCA projections."""
    secho(f"Loading the repr similarity matrix from {repr_sim_matrix_file}")
    repr_similarity_matrix = np.load(repr_sim_matrix_file)

    secho(f"Loading the full similarity matrix from {full_sim_matrix_file}")
    full_similarity_matrix = np.load(full_sim_matrix_file)

    optim_folder = os.path.join(output, 'optim')
    os.makedirs(optim_folder, exist_ok=True)

    optimize_projections(
        output=optim_folder,
        repr_similarity_matrix=repr_similarity_matrix,
        full_similarity_matrix=full_similarity_matrix,
        n_components=n_components,
        similarity_type=sim,
        use_gpu=use_gpu,
    )

    if use_gpu:
        # Only shut down after all loops have used this function
        import cudamat as cm
        cm.shutdown()

    secho(f"done. Enjoy your {make_ratvec(3)}")
def _calculate_similarity_matrix_parallel(
    *,
    full_vocab,
    repr_vocab,
    processes,
    elements,
    compute_similarities_on_splits: Callable,
) -> np.ndarray:
    full_vocab_len = len(full_vocab)
    repr_vocab_len = len(repr_vocab)
    split_size = ceil((full_vocab_len * repr_vocab_len) / processes)
    splits: List[List[Any]] = compute_splits(
        elements=elements,
        split_size=split_size,
        processes=processes,
    )
    secho(f'Computing similarities in {processes} processes')
    with multiprocessing.Pool(processes=processes) as pool:
        res = pool.map(compute_similarities_on_splits, splits)
    res = np.hstack(res)
    return res.reshape(full_vocab_len, repr_vocab_len)
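# Illustrative note (an assumption about compute_splits, which is defined elsewhere
# in ratvec): with |full_vocab| = 4, |repr_vocab| = 3, and processes = 2, split_size
# is ceil(12 / 2) = 6, so the 12 pre-computed (full, repr) pair elements are divided
# into two chunks of 6, each chunk is scored by one worker, and np.hstack restores
# the flat order before the final reshape to (4, 3).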
def main(directory: str, force: bool):
    """Generate the protein vocabularies."""
    # Ensure data directory exists
    make_data_directory()

    sequences_path = os.path.join(directory, PROTEIN_FAMILY_SEQUENCES)
    metadata_path = os.path.join(directory, PROTEIN_FAMILY_METADATA)

    if not force and os.path.isfile(sequences_path) and os.path.isfile(metadata_path):
        secho(f"Files already exist in {directory}. Use --force to re-compute.")
        sys.exit(0)

    secho("Downloading files from the internet. Please be patient.")

    # Download the protein files
    download_protein_files(directory)

    generate_protein_vocabularies(directory, directory)
    secho(f"done. Enjoy your {make_ratvec(3)}")
def main(
    directory,
    n_components: int,
    max_neighbors: int,
    n_iterations: int,
) -> None:
    """Evaluate KPCA embeddings."""
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        optim_dir = os.path.join(directory, 'optim')
        os.makedirs(optim_dir, exist_ok=True)
        for subdirectory_name in os.listdir(optim_dir):
            subdirectory = os.path.join(optim_dir, subdirectory_name)
            if not os.path.isdir(subdirectory):
                continue
            secho(f'Handling {subdirectory}')

            kpca = os.path.join(subdirectory, 'kpca.npy')
            secho(f'Loading embeddings file: {kpca}')
            X = np.load(kpca)

            # The embeddings are assumed to contain the positive sequences followed
            # by an equal number of negative sequences
            n_pos_seqs = int(X.shape[0] / 2)
            n_neg_seqs = n_pos_seqs
            y = np.array(n_pos_seqs * [True] + n_neg_seqs * [False])

            balanced_datasets = [(X, y)]
            counts = [len(y)]

            _sub_run_evaluation(
                balanced_datasets=balanced_datasets,
                counts=counts,
                n_components=n_components,
                n_iterations=n_iterations,
                max_neighbors=max_neighbors,
                pool=pool,
                subdirectory=subdirectory,
            )

    secho(f"done. Enjoy your {make_ratvec(3)}")
def generate_protein_vocabularies(source_directory: str, output_directory: str) -> None:
    """Use the data in the source directory to pre-compute files for RatVec."""
    metadata_path = os.path.join(source_directory, PROTEIN_FAMILY_METADATA)
    seq_path = os.path.join(source_directory, PROTEIN_FAMILY_SEQUENCES)
    vocab_file_path = os.path.join(source_directory, "X.txt")
    labels_file_path = os.path.join(source_directory, "Y.txt")

    secho(f'Reading labels from {metadata_path}', fg="cyan")
    with codecs.open(metadata_path) as file:
        _ = next(file)  # skip the header
        # Parse each line to get the protein name
        protein_names = np.array([line[:-1].split("\t")[-2] for line in file])

    number_of_proteins = len(protein_names)
    # Ensure the expected number of protein sequences
    assert number_of_proteins == 324018, 'Wrong number of protein sequences'

    # Get a list from 0 to number of proteins and shuffle it
    idx = list(range(number_of_proteins))
    shuffle(idx)
    protein_names = protein_names[idx]

    assert len(set(protein_names)) == 7027, 'Wrong number of protein families'

    secho(f'Reading sequences from {seq_path}', fg="cyan")
    with codecs.open(seq_path) as file:
        _ = next(file)  # Skip the header
        seqs = np.array([line[:-1] for line in file])

    seqs = seqs[idx]

    # Use a dictionary to remove duplicated sequences
    secho('Removing duplicates', fg="cyan")
    secho(f'Number of sequences before removing duplicates: {number_of_proteins}', fg="cyan")
    dataset = {seqs[i]: protein_names[i] for i in idx}
    secho(f'Number of sequences after removing duplicates: {len(dataset)}', fg="cyan")

    # Free up memory
    del seqs
    del protein_names

    # Get the keys and values of the cleaned up dictionary with no duplicates
    x, y = np.array(list(dataset.keys())), np.array(list(dataset.values()))

    secho(f'Saving vocabulary to {vocab_file_path}', fg="cyan")
    with codecs.open(vocab_file_path, "w") as file:
        # Store the sequences into X.txt
        file.write("\n".join(x))

    secho(f'Saving labels to {labels_file_path}', fg="cyan")
    with codecs.open(labels_file_path, "w") as file:
        file.write("\n".join(y))

    # Make a counter to get the representative vocabularies
    label_counter = Counter(y)
    d = np.array([(key, label_counter[key]) for key in label_counter])
    d_sorted = sorted(d, key=lambda tup: float(tup[1]))

    preset_lengths = 100, 200, 500, 1000, 2000, 3000, 4000
    length_to_subdirectory = {
        length: os.path.join(output_directory, str(length))
        for length in preset_lengths
    }
    length_to_subdirectory[len(set(y))] = os.path.join(output_directory, 'full')

    with open(os.path.join(output_directory, 'manifest.json'), 'w') as file:
        json.dump(
            [
                dict(length=length, subdirectory=subdirectory)
                for length, subdirectory in length_to_subdirectory.items()
            ],
            file,
            indent=2,
        )

    secho(f'Processing for lengths: {", ".join(map(str, sorted(length_to_subdirectory)))}')
    for length, subdirectory in length_to_subdirectory.items():
        os.makedirs(subdirectory, exist_ok=True)
        secho(f'Processing top {length} in {subdirectory}', fg="cyan")

        top_labels = [t[0] for t in d_sorted[-length:]]
        idx_top = [label in top_labels for label in y]
        y_top = y[idx_top]
        x_top = x[idx_top]

        top_n_labels_path = os.path.join(subdirectory, "labels.txt")
        with codecs.open(top_n_labels_path, "w") as file:
            file.write("\n".join(y_top))

        top_n_vocab_path = os.path.join(subdirectory, "full_vocab.txt")
        with codecs.open(top_n_vocab_path, "w") as file:
            file.write("\n".join(x_top))

        single_family_representatives = []
        for family in top_labels:
            family_idx = np.where(y == family)
            # FIXME why does this look for the index, then just get the value?
            #  Why not just do min(x[family_idx], key=len)?
            repr_idx = np.argmin([len(s) for s in x[family_idx]])
            single_family_representatives.append(x[family_idx][repr_idx])

        repr_top_n_path = os.path.join(subdirectory, "repr_vocab.txt")
        with codecs.open(repr_top_n_path, "w") as file:
            file.write("\n".join(single_family_representatives))
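# A minimal sketch of the simplification suggested in the FIXME above: picking the
# shortest sequence of each family directly, which is equivalent to the argmin over
# lengths followed by indexing.
#
#   single_family_representatives = [
#       min(x[np.where(y == family)], key=len)
#       for family in top_labels
#   ]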
def optimize_projections(
    *,
    output: str,
    repr_similarity_matrix,
    full_similarity_matrix,
    n_components: int,
    similarity_type: str,
    use_gpu: bool,
) -> None:
    """Optimize the KPCA projections over all kernel/hyper-parameter pairs.

    :param output: The output folder
    :param repr_similarity_matrix: A square matrix with dimensions |repr| x |repr|
    :param full_similarity_matrix: A rectangular matrix with dimensions |full| x |repr|
    :param n_components: The number of principal components to keep
    :param similarity_type: The name of the similarity function that was used
    :param use_gpu: If true, project the full vocabulary on the GPU with cudamat
    """
    khc = (
        (kernel_name, KERNEL_TO_PROJECTION[kernel_name], hyperparam)
        for kernel_name, hyperparams in kernels.items()
        for hyperparam in hyperparams
    )
    for kernel_name, project_with_kernel, hyperparam in khc:
        # Make output folder for the optimization with this kernel/hyper-parameter pair
        param_folder = os.path.join(output, f'{kernel_name}_{hyperparam}')
        os.makedirs(param_folder, exist_ok=True)

        secho(f"({kernel_name}/{hyperparam}) calculating normalized/symmetric kernel matrix")
        repr_kernel_matrix = project_with_kernel(repr_similarity_matrix, hyperparam)
        repr_kernel_matrix_normalized = normalize_kernel_matrix(repr_kernel_matrix)

        secho(f"({kernel_name}/{hyperparam}) solving eigenvector/eigenvalues problem")
        eigenvalues, eigenvectors = eigh(repr_kernel_matrix_normalized)

        # Calculate alphas
        repr_alphas = np.column_stack([
            eigenvectors[:, -i]
            for i in range(1, n_components + 1)
        ])

        # Save alphas
        _alphas_path = os.path.join(param_folder, "alphas.p")
        secho(f"({kernel_name}/{hyperparam}) outputting alphas to {_alphas_path}")
        with open(_alphas_path, "wb") as file:
            pickle.dump(repr_alphas, file)

        # Calculate lambdas
        repr_lambdas = [eigenvalues[-i] for i in range(1, n_components + 1)]

        # Save lambdas
        _lambdas_path = os.path.join(param_folder, "lambdas.p")
        secho(f"({kernel_name}/{hyperparam}) outputting lambdas to {_lambdas_path}")
        with open(_lambdas_path, 'wb') as file:
            pickle.dump(repr_lambdas, file)

        secho(f"({kernel_name}/{hyperparam}) projecting known vocabulary to KPCA embeddings")
        repr_projection_matrix = repr_alphas / repr_lambdas

        # Calculate KPCA matrix
        if similarity_type == "ngram_intersec":
            # There is no additional kernel function on top of the similarity function
            kpca_matrix = project_full_vocab_linear(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
            )
        elif use_gpu:
            kpca_matrix = project_words_gpu(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
                kernel_name=kernel_name,
                hyperparam=hyperparam,
            )
        else:
            kpca_matrix = project_similarity_matrix(
                projection_matrix=repr_projection_matrix,
                similarity_matrix=full_similarity_matrix,
                kernel_name=kernel_name,
                hyperparam=hyperparam,
            )

        # Save KPCA matrix
        _kpca_path = os.path.join(param_folder, "kpca.npy")
        secho(f"({kernel_name}/{hyperparam}) outputting KPCA matrix to {_kpca_path}")
        np.save(_kpca_path, kpca_matrix)
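# NumPy-only sketch of the projection above, under two assumptions: the toy symmetric
# matrix stands in for the normalized repr kernel, and the linear/ngram_intersec path
# amounts to multiplying the full-vs-repr kernel by (alphas / lambdas) (the non-linear
# paths apply the kernel function to the full similarity matrix first). Variable
# names are illustrative, not the module's.
#
#   import numpy as np
#   rng = np.random.default_rng(0)
#   m, n_components = 5, 2
#   k = rng.random((m, m))
#   repr_kernel = (k + k.T) / 2                        # toy normalized repr kernel, m x m
#   eigenvalues, eigenvectors = np.linalg.eigh(repr_kernel)
#   alphas = eigenvectors[:, :-(n_components + 1):-1]  # top eigenvectors, descending
#   lambdas = eigenvalues[:-(n_components + 1):-1]     # matching eigenvalues
#   full_kernel = rng.random((10, m))                  # toy |full| x |repr| kernel matrix
#   kpca_matrix = full_kernel @ (alphas / lambdas)     # shape (10, n_components)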
def main(
    full_vocab_file: str,
    repr_vocab_file: str,
    output: str,
    n_components: int,
    sim: str,
    sim_alignment_matrix: str,
    n_ngram: int,
    use_gpu: bool,
    processes: int,
) -> None:
    """Compute KPCA embeddings on a given data set."""
    n = n_ngram  # shorter alias for the n-gram length

    output = os.path.abspath(output)
    os.makedirs(output, exist_ok=True)

    full_vocab = _preprocess_vocab_file(full_vocab_file)
    if repr_vocab_file is None:
        repr_vocab = full_vocab
    else:
        repr_vocab = _preprocess_vocab_file(repr_vocab_file)

    params_path = os.path.join(output, 'training_manifest.json')
    secho(f'Outputting training information to {params_path}')
    manifest = dict(
        sim=sim,
        n=n,
        len_full_vocab=len(full_vocab),
        len_repr_vocab=len(repr_vocab),
        kernels=kernels,
    )
    with open(params_path, 'w') as file:
        json.dump(manifest, file, sort_keys=True, indent=2)

    if use_gpu:
        import cudamat as cm
        cm.cublas_init()

    if sim == 'global-alignment':
        secho(f'Computing global alignment similarities with {sim_alignment_matrix}')
        repr_similarity_matrix = calculate_global_alignment_similarity_matrix(
            full_vocab=repr_vocab,
            repr_vocab=repr_vocab,
            processes=processes,
            matrix=sim_alignment_matrix,
            tqdm_desc=f'{EMOJI} Computing self-similarity matrix for '
                      f'repr vocab with global alignment ({sim_alignment_matrix})',
        )
        full_similarity_matrix = calculate_global_alignment_similarity_matrix(
            full_vocab=full_vocab,
            repr_vocab=repr_vocab,
            processes=processes,
            matrix=sim_alignment_matrix,
            tqdm_desc=f'{EMOJI} Computing similarity matrix between '
                      f'full/repr vocab with global alignment ({sim_alignment_matrix})',
        )
    else:
        alphabet = set(itt.chain.from_iterable(repr_vocab))
        alphabet.add(" ")
        ngram_to_index = {
            ngram: i
            for i, ngram in enumerate(["".join(t) for t in itt.product(alphabet, repeat=n)])
        }

        if sim == "ngram_intersec":
            secho(f'Computing n-gram sparse similarities with {sim}')
            repr_similarity_matrix = compute_similarity_matrix_ngram_sparse(
                full_vocab=repr_vocab,
                repr_vocab=repr_vocab,
                ngram_to_index=ngram_to_index,
                n=n,
            )
            full_similarity_matrix = compute_similarity_matrix_ngram_sparse(
                full_vocab=full_vocab,
                repr_vocab=repr_vocab,
                ngram_to_index=ngram_to_index,
                n=n,
            )
        else:  # sim == 'ngram_sim'
            secho(f'Computing n-gram similarities with {sim}')
            repr_similarity_matrix = compute_similarity_matrix_ngram_parallel(
                full_vocab=repr_vocab,
                repr_vocab=repr_vocab,
                n=n,
                ngram_to_index=ngram_to_index,
                processes=processes,  # Extra because this gets multi-processed
            )
            full_similarity_matrix = compute_similarity_matrix_ngram_parallel(
                full_vocab=full_vocab,
                repr_vocab=repr_vocab,
                n=n,
                ngram_to_index=ngram_to_index,
                processes=processes,  # Extra because this gets multi-processed
            )

    repr_similarity_matrix_path = os.path.join(output, "repr_similarity_matrix.npy")
    secho(f"Saving the repr similarity matrix to {repr_similarity_matrix_path}")
    np.save(repr_similarity_matrix_path, repr_similarity_matrix, allow_pickle=False)

    full_similarity_matrix_path = os.path.join(output, "full_similarity_matrix.npy")
    secho(f"Saving the full similarity matrix to {full_similarity_matrix_path}")
    np.save(full_similarity_matrix_path, full_similarity_matrix, allow_pickle=False)

    optim_folder = os.path.join(output, 'optim')
    os.makedirs(optim_folder, exist_ok=True)

    if n_components is None:
        n_components = int(0.5 + len(repr_vocab) * 2 / 3)

    optimize_projections(
        output=optim_folder,
        repr_similarity_matrix=repr_similarity_matrix,
        full_similarity_matrix=full_similarity_matrix,
        n_components=n_components,
        similarity_type=sim,
        use_gpu=use_gpu,
    )

    if use_gpu:
        # Only shut down after all loops have used this function
        import cudamat as cm
        cm.shutdown()

    secho(f"done. Enjoy your {make_ratvec(3)}")
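# Note on the default above: when n_components is not given, it falls back to roughly
# two thirds of the representative vocabulary size, rounded to the nearest integer,
# e.g. with len(repr_vocab) == 100, int(0.5 + 100 * 2 / 3) == 67.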
def _sub_run_evaluation(
    *,
    balanced_datasets,
    counts,
    n_components,
    n_iterations,
    max_neighbors,
    pool,
    subdirectory,
):
    with open(os.path.join(subdirectory, 'evaluation_params.json'), 'w') as file:
        json.dump(
            dict(
                components=n_components,
                iterations=n_iterations,
                max_neighbors=max_neighbors,
            ),
            file,
            indent=2,
        )

    filt_counts = [family_size for family_size in counts if family_size >= 10]

    secho("Exploring different number of components")
    number_components_grid_search_results = {}
    number_components_low = 1
    number_components_high = int(n_components)
    it = tqdm(
        range(
            number_components_low,
            number_components_high,
            max(1, int(np.floor((number_components_high - number_components_low) / n_iterations))),
        ),
        desc=f'{EMOJI} Optimizing number of components',
    )
    it.write('Number Components\tMean CV Score')
    for reduced_n_components in it:
        n_neighbors = 1
        partial_eval_function = partial(
            score_overview,
            reduced_n_components,
            n_neighbors,
        )
        best_mean_score, _, _ = np.array(pool.starmap(partial_eval_function, balanced_datasets))[0]
        it.write(f"{reduced_n_components}\t{best_mean_score:.3f}")
        number_components_grid_search_results[reduced_n_components] = best_mean_score

    best_number_components = max(
        number_components_grid_search_results,
        key=number_components_grid_search_results.get,
    )
    best_result1 = number_components_grid_search_results[best_number_components]
    secho(f"Best at components={best_number_components}, score={best_result1:.3f}")

    secho("Exploring different number of neighbors")
    number_neighbors_grid_search_results = {}
    it = tqdm(range(1, max_neighbors), desc=f'{EMOJI} Optimizing number of neighbors')
    for n_neighbors in it:
        partial_eval_function = partial(
            score_overview,
            best_number_components,
            n_neighbors,
        )
        best_mean_score, _, _ = np.array(pool.starmap(partial_eval_function, balanced_datasets))[0]
        it.write(f"{n_neighbors}\t{best_mean_score:.3f}")
        number_neighbors_grid_search_results[n_neighbors] = best_mean_score

    best_number_neighbors = max(
        number_neighbors_grid_search_results,
        key=number_neighbors_grid_search_results.get,
    )
    best_result2 = number_neighbors_grid_search_results[best_number_neighbors]
    secho(f"Best at neighbors={best_number_neighbors}, score={best_result2:.3f}")

    mean_score, pos_score, neg_score = score_overview(
        best_number_components,
        best_number_neighbors,
        balanced_datasets[0][0],
        balanced_datasets[0][1],
    )
    secho(f"10-fold-crossvalidation accuracy on positive examples={pos_score:.3f}")
    secho(f"10-fold-crossvalidation accuracy on negative examples={neg_score:.3f}")
    secho(f"Overall 10-fold-crossvalidation accuracy {mean_score:.3f}")

    with open(os.path.join(subdirectory, 'evaluation_results.json'), 'w') as file:
        json.dump(
            {
                'number_components_grid_search': {
                    'best_number_components': best_number_components,
                    'results': number_components_grid_search_results,
                },
                'number_neighbors_grid_search': {
                    'best_number_neighbors': best_number_neighbors,
                    'results': number_neighbors_grid_search_results,
                },
            },
            file,
            indent=2,
        )
def _sub_run_evaluation(
    *,
    balanced_datasets,
    counts,
    n_components,
    n_iterations,
    max_neighbors,
    pool,
    subdirectory,
):
    with open(os.path.join(subdirectory, 'evaluation_params.json'), 'w') as file:
        json.dump(
            dict(
                components=n_components,
                iterations=n_iterations,
                max_neighbors=max_neighbors,
            ),
            file,
            indent=2,
        )

    filt_counts = [family_size for family_size in counts if family_size >= 10]

    secho("Exploring different number of components")
    number_components_grid_search_results = {}
    number_components_low = 1
    number_components_high = int(n_components)
    it = tqdm(
        range(
            number_components_low,
            number_components_high,
            max(1, int(np.floor((number_components_high - number_components_low) / n_iterations))),
        ),
        desc=f'{EMOJI} Optimizing number of components',
    )
    it.write('Number Components\tMean CV Score')
    for reduced_n_components in it:
        n_neighbors = 1
        partial_eval_function = partial(
            plos_cross_val_score,
            reduced_n_components,
            n_neighbors,
        )
        plos_scores = np.array(pool.starmap(partial_eval_function, balanced_datasets))
        weighted_score = np.dot(plos_scores, filt_counts) / np.sum(filt_counts)
        it.write(f"{reduced_n_components}\t{weighted_score:.3f}")
        number_components_grid_search_results[reduced_n_components] = weighted_score

    best_number_components = max(
        number_components_grid_search_results,
        key=number_components_grid_search_results.get,
    )
    best_result1 = number_components_grid_search_results[best_number_components]
    secho(f"Best at components={best_number_components}, score={best_result1:.3f}")

    secho("Exploring different number of neighbors")
    number_neighbors_grid_search_results = {}
    it = tqdm(range(1, max_neighbors), desc=f'{EMOJI} Optimizing number of neighbors')
    for n_neighbors in it:
        partial_eval_function = partial(
            plos_cross_val_score,
            best_number_components,
            n_neighbors,
        )
        plos_scores = np.array(pool.starmap(partial_eval_function, balanced_datasets))
        weighted_score = np.dot(plos_scores, filt_counts) / np.sum(filt_counts)
        it.write(f"{n_neighbors}\t{weighted_score:.3f}")
        number_neighbors_grid_search_results[n_neighbors] = weighted_score

    best_number_neighbors = max(
        number_neighbors_grid_search_results,
        key=number_neighbors_grid_search_results.get,
    )
    best_result2 = number_neighbors_grid_search_results[best_number_neighbors]
    secho(f"Best at neighbors={best_number_neighbors}, score={best_result2:.3f}")

    with open(os.path.join(subdirectory, 'evaluation_results.json'), 'w') as file:
        json.dump(
            {
                'number_components_grid_search': {
                    'best_number_components': best_number_components,
                    'results': number_components_grid_search_results,
                },
                'number_neighbors_grid_search': {
                    'best_number_neighbors': best_number_neighbors,
                    'results': number_neighbors_grid_search_results,
                },
            },
            file,
            indent=2,
        )
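# Worked example of the weighting above: each balanced dataset is scored with
# plos_cross_val_score and the scores are combined as a family-size-weighted mean,
# so larger families count proportionally more. For instance, with
# plos_scores = [0.9, 0.5] and filt_counts = [30, 10]:
#
#   np.dot([0.9, 0.5], [30, 10]) / np.sum([30, 10])  # == (27 + 5) / 40 == 0.8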