def get_ppi_results(ppi_graph_path: str,
                    dataset: str,
                    evaluation: str = 'cv',
                    assoc_path: str = None,
                    phewas: str = None) -> pd.DataFrame:
    """Rank targets on a PPI network built from one DGE dataset.

    :param ppi_graph_path: Path handed to ``generate_ppi_network`` as the PPI graph.
    :param dataset: DGE dataset identifier; picks the header names and the
        DGE / targets files.
    :param evaluation: Evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: Path of an association score file. It is parsed only
        when truthy; otherwise the value itself is passed on unchanged.
    :param phewas: Forwarded to ``generate_ppi_network`` as
        ``disease_associations_path``.
    :return: Frame holding the ``auc`` and ``aps`` columns of the ranking
        metrics plus constant ``eval`` and ``dge`` columns.
    """
    # AD datasets use a different set of DGE column headers.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    differential_genes = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=headers['id'],
        log2_fold_change_header=headers['l2f'],
        adj_p_header=headers['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=headers['mean'],
        csv_separator=';',
    )

    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=differential_genes,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas,
    )
    logger.info(f'Nodes {len(network.graph.vs)}')

    targets_path = targets_file(dataset_to_disease_abv(dataset))
    targets = parse_gene_list(targets_path, network)

    # Keep a falsy ``assoc_path`` as-is; parse the file only when one is given.
    assoc_score = assoc_path
    if assoc_path:
        assoc_score = parse_association_scores(assoc_path)

    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
        assoc_score=assoc_score,
    )

    metrics, _probabilities = rank_targets(
        directory=g2v_path,
        network=network,
        evaluation=evaluation,
    )

    summary = pd.DataFrame()
    for metric in ('auc', 'aps'):
        summary[metric] = metrics[metric]
    summary['eval'] = evaluation
    summary['dge'] = dataset
    return summary
def get_bel_results(dataset: str) -> pd.DataFrame:
    """Rank targets on the BEL network built for one DGE dataset.

    :param dataset: DGE dataset identifier; picks the header names, the DGE
        file and the targets directory.
    :return: A single frame holding the ``auc`` and ``aps`` columns of the
        ranking metrics, a ``ds`` column fixed to ``'bel'`` and a ``dge``
        column holding ``dataset``.
    """
    # AD datasets use a different set of DGE column headers.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis

    gene_list = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=dge_params['id'],
        log2_fold_change_header=dge_params['l2f'],
        adj_p_header=dge_params['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=dge_params['mean'],
        csv_separator=';',
    )

    network = generate_bel_network(
        bel_graph_path=bel_files,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
    )
    logger.info(f'Nodes {len(network.graph.nodes)}')

    targets = parse_gene_list(
        os.path.join(targets_base_path, dataset_to_disease_abv(dataset), 'ot_symbol.txt'),
        network,
    )
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
    )

    # Embedding parameters are fixed for the BEL network.
    metrics_df, _ = rank_targets(
        directory=g2v_path,
        network=network,
        num_walks=30,
        walk_length=4,
        dimension=256,
        window_size=10,
    )

    df = pd.DataFrame()
    df['auc'] = metrics_df['auc']
    df['aps'] = metrics_df['aps']
    df['ds'] = 'bel'
    df['dge'] = dataset
    return df
def get_ppi_results(ppi_graph_path: str, dataset: str) -> pd.DataFrame:
    """Rank targets on one PPI graph for one DGE dataset.

    :param ppi_graph_path: Path handed to ``generate_ppi_network``; its base
        name is also the key looked up in ``network_alias``.
    :param dataset: DGE dataset identifier; picks the header names and the
        DGE / targets files.
    :return: Frame holding the ``auc`` and ``aps`` columns of the ranking
        metrics, a ``ds`` column with the network alias and a ``dge`` column
        with ``dataset``.
    """
    # AD datasets use a different set of DGE column headers.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    differential_genes = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=headers['id'],
        log2_fold_change_header=headers['l2f'],
        adj_p_header=headers['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=headers['mean'],
    )

    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=differential_genes,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
    )
    logger.info(f'Nodes {len(network.graph.vs)}')
    logger.info(f'Edges {len(network.graph.es)}')

    targets_path = targets_file(dataset_to_disease_abv(dataset))
    targets = parse_gene_list(targets_path, network)
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
    )

    metrics, _probabilities = rank_targets(directory=g2v_path, network=network)

    summary = pd.DataFrame()
    for metric in ('auc', 'aps'):
        summary[metric] = metrics[metric]
    # The alias is keyed by the graph's file name, not its full path.
    graph_file_name = os.path.basename(ppi_graph_path)
    summary['ds'] = network_alias[graph_file_name]
    summary['dge'] = dataset
    return summary
def run(targets: List[str], ppi_graph_path: str, dge_json: List[Dict]):
    """Run the GuiltyTargets pipeline.

    :param targets: Target identifiers, used for the label mappings and
        passed to ``rank_targets``.
    :param ppi_graph_path: Path of the PPI graph file; it must exist.
    :param dge_json: Differential gene expression records, loaded through
        ``Gene.schema()``.
    :return: Dictionary with the keys ``adjacency_list``,
        ``attribute_adjacency_list``, ``label_mappings``, ``auc``, ``probs``
        (both JSON strings) and ``time`` (elapsed seconds).
    """
    started_at = time.time()

    # NOTE(review): an assert is skipped under ``python -O``; kept because
    # callers may rely on AssertionError here.
    assert os.path.exists(ppi_graph_path), f'ppi graph file does not exist: {ppi_graph_path}'

    genes = Gene.schema().load(dge_json, many=True)

    # Load the PPI network with fixed significance and fold-change thresholds.
    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=genes,
        max_adj_p=0.05,
        min_log2_fold_change=1.0,
        max_log2_fold_change=-1.0,
        ppi_edge_min_confidence=0.0,
    )

    labeled_view = LabeledNetwork(network)
    attribute_view = AttributeNetwork(network)

    adjacency = network.get_adjlist()
    attribute_mappings = attribute_view.get_attribute_mappings()
    label_mappings = labeled_view.get_index_labels(targets)

    # The ranking only needs scratch space, so a throwaway directory is used.
    with tempfile.TemporaryDirectory() as scratch_dir:
        auc_frame, probs_frame = rank_targets(
            network=network,
            targets=targets,
            directory=scratch_dir,
        )

    return {
        'adjacency_list': adjacency,
        'attribute_adjacency_list': attribute_mappings,
        'label_mappings': label_mappings,
        'auc': auc_frame.to_json(),
        'probs': probs_frame.to_json(),
        'time': time.time() - started_at,
    }
def main():
    """Compare unweighted and association-weighted rankings for the BM DGE sets.

    For each of the ``BM10``, ``BM22``, ``BM36`` and ``BM44`` DGE files one
    figure is drawn with one boxplot panel per entry of ``graph_paths``. Each
    panel shows the ``auc`` values returned by ``rank_targets`` for four runs:
    the default call and the balanced SVM evaluation, each on gat2vec input
    files written without (``rr``, ``bsvm``) and with (``wrr``, ``wbsvm``)
    association scores. Every figure is saved as
    ``comparison_humanbase(<dge>).png`` and then closed.
    """
    # The fold-change thresholds depend only on the module-level cutoff.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, lfc_cutoff * -1

    for dge in ['BM10', 'BM22', 'BM36', 'BM44']:
        gene_list = parse_dge(
            dge_path=dge_base_path % dge,
            entrez_id_header=entrez_id_name,
            log2_fold_change_header=log_fold_change_name,
            adj_p_header=adjusted_p_value_name,
            entrez_delimiter=split_char,
            base_mean_header=base_mean_name,
            csv_separator=';',
        )

        # squeeze=False keeps ``axs`` two-dimensional even for a single graph.
        fig, axs = plt.subplots(ncols=len(graph_paths), sharey='all', squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {dge}')

        for axs_ind, ppi_graph_path in enumerate(graph_paths):
            network = generate_ppi_network(
                ppi_graph_path=os.path.join(ppi_base_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)
            assoc_score = assoc_file and parse_association_scores(assoc_file)

            # A fresh frame per graph keeps this panel's columns from being
            # index-aligned against the previous graph's results.
            df = pd.DataFrame()

            # File with no weights
            write_gat2vec_input_files(network=network, targets=targets, home_dir=g2v_path)
            auc_df, _ = rank_targets(directory=g2v_path, network=network)
            df['rr'] = auc_df['auc']
            auc_df, _ = rank_targets(directory=g2v_path, network=network,
                                     evaluation='svm', class_weights='balanced')
            df['bsvm'] = auc_df['auc']

            # File with weights
            write_gat2vec_input_files(network=network, targets=targets,
                                      home_dir=g2v_path, assoc_score=assoc_score)
            auc_df, _ = rank_targets(directory=g2v_path, network=network)
            df['wrr'] = auc_df['auc']
            auc_df, _ = rank_targets(directory=g2v_path, network=network,
                                     evaluation='svm', class_weights='balanced')
            df['wbsvm'] = auc_df['auc']

            ax = axs[0][axs_ind]
            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=ax)
            ax.set_title(f'PPI {ppi_graph_path}')

        fig.savefig(f'comparison_humanbase({dge}).png')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def main():
    """Compare unweighted and association-weighted rankings per disease.

    For every entry of ``DISEASE_ABBREVIATIONS`` one figure is drawn with one
    boxplot panel per (PPI graph, edge-confidence cutoff) combination. Each
    panel shows the ``auc`` values returned by ``rank_targets`` for four runs:
    the default call and the balanced SVM evaluation, each on gat2vec input
    files written without (``rr``, ``bsvm``) and with (``wrr``, ``wbsvm``)
    association scores. Every figure is saved as
    ``comparison_humanbase-<disease>.png`` and then closed.
    """
    # The fold-change thresholds depend only on the module-level cutoff.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, lfc_cutoff * -1

    for disease in DISEASE_ABBREVIATIONS:
        targets_file = targets_file_mask % disease
        assoc_file = assoc_file_mask % disease
        g2v_path = g2v_path_mask % disease
        dge_path = dge_path_mask % disease
        graph_paths = graphs_for_diseases[disease]

        gene_list = parse_dge(
            dge_path=dge_path,
            entrez_id_header=entrez_id_name,
            log2_fold_change_header=log_fold_change_name,
            adj_p_header=adjusted_p_value_name,
            entrez_delimiter=split_char,
            base_mean_header=base_mean_name,
            csv_separator=';',
        )

        # One panel is needed per (graph, cutoff) pair, not per graph;
        # otherwise the panel index runs past the last column as soon as
        # more than one cutoff is configured.
        combinations = list(itertools.product(graph_paths, confidence_cutoffs))
        fig, axs = plt.subplots(ncols=len(combinations), sharey='all', squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {disease}')

        for axs_ind, (ppi_graph_path, ppi_edge_min_confidence) in enumerate(combinations):
            network = generate_ppi_network(
                ppi_graph_path=os.path.join(base_ppi_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)
            assoc_score = assoc_file and parse_association_scores(assoc_file)

            # A fresh frame per panel keeps its columns from being
            # index-aligned against the previous combination's results.
            df = pd.DataFrame()

            # File with no weights
            write_gat2vec_input_files(network=network, targets=targets, home_dir=g2v_path)
            auc_df, _ = rank_targets(directory=g2v_path, network=network)
            df['rr'] = auc_df['auc']
            logger.debug(f'types for {disease}')
            logger.debug(type(df['rr']))
            logger.debug(type(df))
            auc_df, _ = rank_targets(directory=g2v_path, network=network,
                                     evaluation='svm', class_weights='balanced')
            df['bsvm'] = auc_df['auc']
            logger.debug(type(df['bsvm']))
            logger.debug(type(df))

            # File with weights
            write_gat2vec_input_files(network=network, targets=targets,
                                      home_dir=g2v_path, assoc_score=assoc_score)
            auc_df, _ = rank_targets(directory=g2v_path, network=network)
            df['wrr'] = auc_df['auc']
            logger.debug(type(df['wrr']))
            logger.debug(type(df))
            auc_df, _ = rank_targets(directory=g2v_path, network=network,
                                     evaluation='svm', class_weights='balanced')
            df['wbsvm'] = auc_df['auc']
            logger.debug(type(df['wbsvm']))
            logger.debug(type(df))

            # Draw into this combination's own Axes; ``axs`` itself is a 2-D
            # array and cannot be used as a plotting target.
            ax = axs[0][axs_ind]
            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=ax)
            ax.set_title(
                f'PPI {ppi_graph_path}, cutoff {ppi_edge_min_confidence}')

        fig.savefig(f'comparison_humanbase-{disease}.png')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def optimize_g2v_parameters(
        ppi_graph_path: str,
        dataset: str,
        evaluation: str = 'cv',
        assoc_path: str = None
) -> pd.DataFrame:
    """Sweep the embedding parameters of ``rank_targets`` for one dataset.

    The PPI network and the gat2vec input files are built once. The ranking
    is then repeated while varying one parameter at a time: the dimension
    first, then the number of walks, the walk length and the window size
    (these three with the dimension pinned to 256).

    :param ppi_graph_path: Path handed to ``generate_ppi_network`` as the PPI graph.
    :param dataset: DGE dataset identifier; picks the header names and the
        DGE / targets files.
    :param evaluation: Evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: Path of an association score file; parsed only when truthy.
    :return: The frames produced by ``assemble_results_df`` for every run,
        concatenated in sweep order with a fresh integer index.
    """
    # AD datasets use a different set of DGE column headers.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis

    gene_list = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=dge_params['id'],
        log2_fold_change_header=dge_params['l2f'],
        adj_p_header=dge_params['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=dge_params['mean'],
        csv_separator=';'
    )

    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas_path,
    )
    logger.info(f'Nodes {len(network.graph.vs)}')

    targets = parse_gene_list(targets_file(dataset_to_disease_abv(dataset)), network)
    assoc_score = assoc_path and parse_association_scores(assoc_path)
    logger.debug(f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
        assoc_score=assoc_score
    )

    # (result label, ``rank_targets`` keyword, values to try).
    # Author's note on the parameters (presumably the ``rank_targets``
    # defaults; verify): num_walks=10, walk_length=80, dimension=128,
    # window_size=5.
    sweeps = [
        ('Dimension', 'dimension', [32, 64, 128, 256]),
        ('Num Walks', 'num_walks', [6, 10, 20, 40, 80]),
        ('Walk Length', 'walk_length', [20, 40, 80, 120, 160]),
        ('Window Size', 'window_size', [3, 5, 7, 10, 20, 40]),
    ]

    # Frames are collected and concatenated once; ``DataFrame.append`` no
    # longer exists in pandas 2.0 and copied the accumulated frame every run.
    frames = []
    for label, keyword, values in sweeps:
        for value in values:
            # Every sweep but the dimension one pins the dimension to 256;
            # in the dimension sweep the swept value overrides that entry.
            params = {'dimension': 256, keyword: value}
            metrics_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
                evaluation=evaluation,
                **params
            )
            frames.append(assemble_results_df(metrics_df, dataset, label, value))

    results = pd.concat(frames, ignore_index=True)

    logger.debug('results')
    logger.debug(results)
    return results