def compute_pairwise_information(cls, data, method, kwargs=None):
    logger = get_logger(__name__)

    if method == 'mutual-information':
        minet = importr('minet')
        pandas2ri.activate()

        if kwargs is None:
            kwargs = {}

        estimator = kwargs.pop('estimator', 'mi.shrink')
        disc = kwargs.pop('disc', 'equalwidth')
        nbins = kwargs.pop('nbins', np.sqrt(len(data.columns)))

        logger.debug('Running minet.build_mim(estimator={!r}, '
                     'disc={!r}, nbins={!r})'.format(estimator, disc, nbins))

        r_info = minet.build_mim(data.T,
                                 estimator=estimator,
                                 disc=disc,
                                 nbins=nbins,
                                 **kwargs)

        info = np.asarray(r_info)

        del r_info, minet
        gc.collect()
        pandas2ri.deactivate()
    else:
        raise ValueError(
            'Unsupported information method: {!r}'.format(method))

    info = pd.DataFrame(info, index=data.index, columns=data.index)
    return info
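
# Illustrative sketch only (not called by the pipeline): how this classmethod might be
# invoked, assuming it lives on the `MinetNetwork` class used by `_precompute_distances`
# below. Requires R plus the Bioconductor `minet` package to be importable via rpy2.
# The toy data frame is made up; rows play the role of genes, columns of pull-downs.
def _example_compute_mutual_information():
    rng = np.random.RandomState(0)
    toy = pd.DataFrame(rng.normal(size=(10, 6)),
                       index=['gene_{}'.format(i) for i in range(10)])
    # Returns a symmetric gene-by-gene DataFrame of mutual information estimates.
    return MinetNetwork.compute_pairwise_information(
        toy, method='mutual-information', kwargs={'estimator': 'mi.mm'})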
def gephi_forceatlas2_layout(network, desired_basename, outdir, **kwargs):
    """
    Generates a ForceAtlas2 layout using gephi.

    Uses `gephi-toolkit-forceatlas2-standalone` behind the scenes.
    See https://github.com/lukauskas/gephi-toolkit-forceatlas2-standalone

    Produces a gexf file and a pdf.

    :param network: network to lay out
    :param desired_basename: basename to give to the gexf and pdf files
    :param outdir: output directory to store pdf and gexf
    :param kwargs: kwargs passed to gephi-toolkit-forceatlas2-standalone, see its README
    :return:
    """
    logger = get_logger('gephi_forceatlas2_layout')

    tmp_dir = tempfile.mkdtemp()
    try:
        tmp_file = os.path.join(tmp_dir, desired_basename + '.gexf')
        nx.write_gexf(network, tmp_file)

        logger.info('Gephi processing {}'.format(desired_basename))
        _gephi_forceatlas2(tmp_file, outdir=outdir, **kwargs)
        logger.info('Gephi processing for {} done'.format(desired_basename))
    finally:
        try:
            shutil.rmtree(tmp_dir)
            del tmp_dir
        except FileNotFoundError:
            pass
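
# Illustrative sketch only: laying out a small random graph with the helper above.
# Assumes the gephi-toolkit-forceatlas2-standalone jar is set up as the docstring
# describes; the gravity/scale kwargs mirror the values used for the real networks.
def _example_gephi_layout(outdir):
    toy = nx.gnm_random_graph(30, 60, seed=0)
    gephi_forceatlas2_layout(toy, 'toy-network', outdir,
                             gravity=40, scale=4, duration=15, plot='true')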
def load_matrix(min_significant=0):
    logger = get_logger(__name__)

    enrichment_data = pd.read_hdf(ENRICHMENT_DATASET, 'enrichment_data')

    n_significant = enrichment_data['significant'].fillna(False).groupby(
        level='Gene label').sum().astype(int)
    n_significant.name = 'n_significant'

    within_thr = (n_significant >= min_significant).sum()
    total = len(n_significant)
    logger.info('{:,}/{:,} genes ({:.2%}) '
                'have n_significant >= {}'.format(within_thr, total,
                                                  within_thr / total,
                                                  min_significant))

    _n_significant = n_significant.reindex(enrichment_data.index, level=0)
    enrichment_data = enrichment_data[_n_significant >= min_significant]

    COLUMNS = [
        'Ratio H/L normalized (log2) (adjusted, imputed, forward)',
        'Ratio H/L normalized (log2) (adjusted, imputed, reverse)'
    ]

    data = enrichment_data[COLUMNS].copy()
    # This is only to make results easier to interpret visually:
    data['Ratio H/L normalized (log2) (adjusted, imputed, reverse)'] *= -1.0

    matrix = data.unstack('Pull-Down ID')

    n_nonzero = (matrix != 0).sum(axis=1)
    n_nonzero.name = 'n_nonzero'

    return remove_all_zero_rows(matrix), n_significant, n_nonzero
def evaluate(self, scores, ignore_na=False):
    logger = get_logger(__name__)

    true = self.interaction_metadata_['interaction_exists']

    scores = self.remove_misleading_edges(scores)
    scores = scores.replace([np.inf, -np.inf], np.nan)

    if scores.isnull().any():
        if scores.isnull().all():
            raise ValueError('All scores are null')

        if not ignore_na:
            raise ValueError('Some scores are null or infinite')
        else:
            logger.warning(
                "Some scores are NaN or infinity. Assuming they're the lowest scores")
            scores = scores.fillna(scores.min() - 1e-6)

    if not scores.index.equals(true.index):
        raise Exception('Score index does not match true interactions index')

    return EvaluationResult(true, scores)
def remove_all_zero_rows(matrix):
    logger = get_logger(__name__)

    ans = matrix[(matrix != 0).any(axis=1)]

    logger.info('Removed {:,} ({:.2%}) rows '
                'due to them being all-zero'.format(
                    len(matrix) - len(ans),
                    (len(matrix) - len(ans)) / len(matrix)))
    return ans
def train_estimators(matrix):
    logger = get_logger(__name__)
    logger.info('Matrix shape provided to train_estimators: {}'.format(
        matrix.shape))

    # Training
    estimators = deepcopy(ESTIMATORS)
    distances = _precompute_distances(matrix)
    estimators = _do_training(matrix, distances, estimators)

    return estimators
def merge_complexes_according_to_rules(full_complexes):
    logger = get_logger('curated_complexes.merge_complexes_according_to_rules')

    forced_merges = set()
    for complexes_to_merge in _FORCED_MERGES:
        for a, b in itertools.permutations(complexes_to_merge, 2):
            forced_merges.add((a, b))

    seen_complexes = set()
    seen_complexes.update(_TO_REMOVE)

    full_complexes_merged = []
    for complex_a, subdf_a in tqdm(full_complexes.groupby('Complex')):
        if complex_a in seen_complexes:
            continue
        seen_complexes.add(complex_a)

        to_merge = [subdf_a]
        for complex_b, subdf_b in full_complexes.groupby('Complex'):
            if complex_a == complex_b or complex_b in seen_complexes:
                continue

            equal = set(subdf_a['Gene label'].dropna()) == set(
                subdf_b['Gene label'].dropna())

            if equal or (complex_a, complex_b) in forced_merges:
                seen_complexes.add(complex_b)
                to_merge.append(subdf_b)

        if len(to_merge) == 1:
            full_complexes_merged.append(to_merge[0])
        else:
            _joint = pd.concat(to_merge, ignore_index=True)
            new_complex_name = '|'.join(sorted(_joint['Complex'].unique()))
            _grouped = merge_complex(_joint, new_name=new_complex_name)
            full_complexes_merged.append(_grouped)

    full_complexes_merged = pd.concat(full_complexes_merged,
                                      ignore_index=True)

    logger.info(
        'Total number of complexes after merging EpiFactors and EBI: {:,}'.format(
            full_complexes_merged['Complex'].nunique()))

    return full_complexes_merged
def _do_training(matrix, distances, estimators):
    logger = get_logger(__name__)

    for label, estimator in tqdm(estimators.items(),
                                 desc='Training estimators'):
        with timed_segment('Training {}'.format(label), logger):
            if label.endswith('MI'):
                estimator.fit(distances['mutual-information'])
            else:
                estimator.fit(matrix)

    return estimators
def _precompute_distances(matrix):
    logger = get_logger(__name__)

    distances = {}
    for method in tqdm(['mutual-information'],
                       desc='Precomputing distance matrices'):
        with timed_segment('Precomputing: {}'.format(method), logger):
            kwargs = None
            minet_method = method

            if method == 'mutual-information':
                kwargs = {'estimator': 'mi.mm'}

            distances[method] = MinetNetwork.compute_pairwise_information(
                matrix, method=minet_method, kwargs=kwargs)

    return distances
def pick_best_estimator(pretrained_estimators, standard, criterion, output_csv):
    logger = get_logger("pick_best_estimator")

    # Evaluate each estimator against the standard using ROC/PRC curves.
    results = OrderedDict()
    for label, estimator in tqdm(pretrained_estimators.items(),
                                 desc='Evaluating estimators'):
        results[label] = standard.evaluate(estimator.adjacency_)

    results_summary = summarise_results(results, criterion=criterion)
    results_summary.to_csv(output_csv)

    best = results_summary.index[0]
    logger.info('Best estimator: {}'.format(best))

    return best, results, results_summary
def process_complexes():
    logger = get_logger('curated_complexes')

    bar = tqdm(total=3, desc='Compiling complexes list: EBI')
    ebi_complex_memberships = _load_ebi_complexes()
    bar.update()

    bar.set_description('Compiling complexes list: Epifactors')
    epifactors_complex_memberships = _load_epifactors_complexes()
    manual_memberships, manual_removals = _load_manual_complexes()
    bar.update()

    bar.set_description('Compiling complexes list: Merging')
    full_complexes = pd.concat(
        [ebi_complex_memberships, epifactors_complex_memberships],
        ignore_index=True)
    full_complexes = merge_complexes_according_to_rules(full_complexes)
    full_complexes['Complex'] = full_complexes['Complex'].map(
        lambda x: _RENAMING_MAP.get(x, x))

    complete_complexes = merge_with_manual(full_complexes, manual_memberships,
                                           manual_removals)

    logger.info('Number of complexes (including manual): {:,}'.format(
        complete_complexes['Complex'].nunique()))

    complete_complexes = complete_complexes.sort_values(
        by=['Complex', 'Gene label'])

    complex_memberships_str = complete_complexes.groupby(
        ['Gene label'])['Complex'].apply(
            lambda x: ';'.join(sorted(x.dropna().unique())))

    with pd.HDFStore(OUTPUT_FILE, 'w', complevel=9, complib='lzo') as store:
        store['curated_complexes'] = complete_complexes
        store['complex_memberships_str'] = complex_memberships_str

    complete_complexes.to_excel(OUTPUT_EXCEL_FILE, index=False)

    bar.update()
    bar.set_description('Compiling complexes list: Done')
def extract_ebi():
    logger = get_logger('ebi_complexes')

    ebi_complexes = pd.read_csv(_EBI_INPUT_FILE, sep='\t')
    ebi_complexes = ebi_complexes.rename(columns={
        '#Complex ac': 'Complex accession'
    }).set_index('Complex accession')

    ebi_memberships = ebi_complexes[
        'Identifiers (and stoichiometry) of molecules in complex'].str.split(
            '|', expand=True).stack()

    # Remove stoichiometry information
    ebi_memberships = ebi_memberships.str.replace(r'\(\d+\)', '', regex=True)

    ebi_memberships.index.names = [ebi_memberships.index.names[0], 'Subunit']
    ebi_memberships.name = 'UniProt ID'

    # Remove identifiers that have ':' in them, such as 'CHEBI:smth'
    ebi_memberships = ebi_memberships[~ebi_memberships.str.contains(':')]

    # Remove isoform-specific identifiers, such as "O00159-3"
    ebi_memberships = ebi_memberships.str.split('-').str[0]

    # Keep only complexes that have more than one subunit
    more_than_one_subunit = ebi_memberships.groupby(
        level='Complex accession').size() > 1
    more_than_one_subunit = more_than_one_subunit[more_than_one_subunit].index
    ebi_memberships = ebi_memberships.loc[list(more_than_one_subunit)]

    # Merge EBI complexes with our identifiers
    protein_id_map = get_protein_id_map()

    ebi_merged = pd.merge(ebi_memberships.reset_index(),
                          protein_id_map,
                          left_on='UniProt ID',
                          right_on='Protein ID',
                          how='left')

    # Add complex name to the output
    ebi_merged = ebi_merged.join(ebi_complexes['Recommended name'],
                                 on='Complex accession')

    return ebi_merged
def fetch_gene_meta(protein_id_map, sep=';'):
    logger = get_logger(__name__)

    unique_ids = protein_id_map['Protein ID'].unique()
    mg_response = query_mygene_from_cache(unique_ids)

    df = pd.merge(protein_id_map, mg_response,
                  left_on='Protein ID', right_index=True)

    columns_to_merge = [
        x for x in MYGENE_FIELDS
        if x not in ['alias', 'ensembl.protein', 'interpro']
    ]
    long_form_metadata_fields = ['interpro']

    new_df = []
    new_ix = []
    long_form_datasets = {}

    for protein_id_group, subdata in df.groupby(INDEX_COL):
        new_ix.append(protein_id_group)
        row = []

        other_names = set()
        for aliases in subdata['alias'].dropna():
            if isinstance(aliases, str):
                other_names.add(aliases)
            else:
                other_names.update(aliases)
        row.append(other_names)

        ensembl_protein_ids = set()
        for ensembl_data in subdata['ensembl'].dropna():
            if isinstance(ensembl_data, dict):
                # Sometimes they return a list of dicts, sometimes just a dict...
                ensembl_data = [ensembl_data]

            for ensembl_subdata in ensembl_data:
                ensembl_subdata = ensembl_subdata['protein']
                if isinstance(ensembl_subdata, str):
                    # Sometimes they return just a string...
                    ensembl_protein_ids.add(ensembl_subdata)
                else:
                    ensembl_protein_ids.update(ensembl_subdata)
        row.append(ensembl_protein_ids)

        for col in columns_to_merge:
            series = subdata[col].dropna()
            if col == 'entrezgene':
                # Convert to int, then to str;
                # this has to be done after NAs are removed though.
                series = series.astype(int).astype(str)
            unique = series.unique()
            row.append(list(unique))

        new_df.append(row)

        for col in long_form_metadata_fields:
            if col not in long_form_datasets:
                long_form_datasets[col] = []

            dict_dataset = subdata[col].dropna()
            for dicts in dict_dataset.values:
                if isinstance(dicts, dict):
                    dicts = [dicts]
                for d in dicts:
                    d = pd.Series(d)
                    d[INDEX_COL] = protein_id_group
                    long_form_datasets[col].append(d)

    for col in long_form_datasets:
        long_form_datasets[col] = pd.DataFrame(
            long_form_datasets[col]).drop_duplicates()

        if col == 'interpro':
            long_form_datasets[col] = long_form_datasets[col].set_index(
                [INDEX_COL, 'id'])
        else:
            raise NotImplementedError(
                'Specify index column for {!r} please'.format(col))

    new_df = pd.DataFrame(new_df,
                          columns=['alias'] + ['ensembl_protein_id'] +
                                  columns_to_merge,
                          index=pd.Index(new_ix, name=INDEX_COL))

    # Sort aliases & ensembl protein IDs
    new_df['alias'] = new_df['alias'].apply(lambda x: sep.join(sorted(x)))
    new_df['ensembl_protein_id'] = new_df['ensembl_protein_id'].apply(
        lambda x: sep.join(sorted(x)))

    def _join(x):
        try:
            return sep.join(x)
        except TypeError:
            logger.debug('Error joining {!r}'.format(x))
            raise

    # Preserve order for other names
    for col in columns_to_merge:
        new_df[col] = new_df[col].apply(_join)

    return new_df, long_form_datasets, mg_response
def main():
    if not os.path.isdir(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    import random
    random.seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)

    logger = get_logger(__name__)
    logger.info('Random state: {}'.format(RANDOM_STATE))

    # Load dataset
    matrix, n_significant, n_nonzero = load_matrix(
        min_significant=MIN_SIGNIFICANT)

    # Train the estimators
    estimators = train_estimators(matrix)

    # Get the standard that we will use for evaluation
    standard = load_standard(matrix)
    nx.write_gexf(standard.to_network(), OUTPUT_STANDARD_NETWORK_FILE)

    # Decide which estimator is the best.
    best, training_results, training_results_summary = pick_best_estimator(
        estimators, standard, ESTIMATOR_SELECTION_CRITERION, OUTPUT_CSV_FILE)
    best_estimator = estimators[best]

    # Gather q-values from estimators
    edge_statistics, score_thresholds, evaluation_q_value = (
        extract_edge_statistics_and_thresholds(best_estimator, standard))
    score_thresholds.to_csv(OUTPUT_SCORE_THRESHOLDS, sep='\t')

    # Some extra plots
    with timed_segment('Plotting ROC/PRC curves', logger=logger):
        plot_roc_prc(training_results_summary.index,
                     best=best,
                     thresholds=score_thresholds,
                     colors=estimator_color_palette(),
                     results=training_results,
                     results_summary=training_results_summary)
        plot_score_breakdown_per_metadata(edge_statistics['score'], standard)

    # Plot the fit of the q-value function
    with timed_segment('Plotting score distributions', logger):
        plot_score_distributions(edge_statistics,
                                 best_estimator.nonzero_score_density_function,
                                 OUTPUT_SCORE_DISTRIBUTIONS_FILE)

    # Get subset for further analyses (n_nonzero >= threshold, and not an orphan)
    subset_for_networks = n_nonzero[
        n_nonzero >= N_NONZERO_THRESHOLD_FOR_NETWORK_DRAWING].index

    network = evaluation_q_value.to_network(
        -np.log10(SELECTED_FDR_THRESHOLD),
        node_subset=subset_for_networks,
        add_unpredicted_edges=False,
        remove_orphan_nodes=True)

    drawable_subset = sorted(set(network.nodes))
    orphans = sorted(set(subset_for_networks) - set(drawable_subset))

    logger.info('{:,} nodes at FDR={} are orphans'.format(
        len(orphans), SELECTED_FDR_THRESHOLD))

    with open(os.path.join(OUTPUT_DIRECTORY, 'orphan-nodes.txt'), 'w') as f:
        f.write('\n'.join(orphans))

    _fmt = ('For graph visualisations we only keep non-orphan nodes with '
            'n_nonzero >= {:,}. This leaves {:,}/{:,} ({:.2%}) nodes')
    logger.info(
        _fmt.format(N_NONZERO_THRESHOLD_FOR_NETWORK_DRAWING,
                    len(drawable_subset), len(n_nonzero),
                    len(drawable_subset) / len(n_nonzero)))

    # For the web
    significant_edges = edge_statistics.query(
        'q_value <= @SELECTED_FDR_THRESHOLD')

    with pd.HDFStore(OUTPUT_FILE, 'w', complevel=9,
                     complib='lzo') as output_store:
        output_store['input/matrix'] = matrix
        output_store['input/n_significant'] = n_significant
        output_store['input/n_nonzero'] = n_nonzero
        output_store['input/random_state'] = pd.Series(RANDOM_STATE)

        output_store['statistics/best_estimator'] = pd.Series(best)

        output_store['output/edge_statistics'] = edge_statistics
        output_store['output/significant_edges'] = significant_edges
        output_store['output/score_thresholds'] = score_thresholds

        output_store['output/subset_for_networks'] = pd.Series(
            subset_for_networks)
        output_store['output/drawable_subset'] = pd.Series(drawable_subset)
def extract_edge_statistics_and_thresholds(best_estimator, standard):
    """
    Converts estimator scores to q-values.
    Creates a conversion table between scores, q-values, precision and recall.

    :param best_estimator: trained estimator deemed to be the best.
                           Has to support the `p_values` function.
    :param standard: reference standard for evaluation
    :return:
    """
    logger = get_logger("extract_edge_statistics_and_thresholds")

    adjacencies = best_estimator.adjacency_
    p_values = best_estimator.p_values()

    correction_method = 'fdr_bh'
    logger.info(
        f'Converting network p values to q values using {correction_method}')

    # Convert p values into FDR q-values using 'fdr_bh':
    __, q_values, __, __ = multipletests(p_values, method=correction_method)
    q_values = pd.Series(q_values, index=p_values.index, name='q_value')

    edge_statistics = pd.DataFrame({
        'score': adjacencies,
        'p_value': p_values,
        'q_value': q_values
    })
    edge_statistics = edge_statistics.join(standard.interaction_metadata_)

    # For q-value based curves we have to re-evaluate the standard using neg-log10-q
    edge_statistics['neg_log10_q'] = -np.log10(edge_statistics['q_value'])
    evaluation_q_value = standard.evaluate(edge_statistics['neg_log10_q'])
    prc_curve_thresholds = evaluation_q_value.prc_curve_thresholds

    high_confidence_threshold = find_threshold_for_precision_target(
        prc_curve_thresholds,
        precision_target=HIGH_CONFIDENCE_PRECISION_TARGET,
        plot_output_file=HIGH_CONFIDENCE_PLOT)

    thresholds = THRESHOLDS

    score_thresholds = []
    for threshold_name in thresholds:
        if threshold_name == 'high-confidence':
            threshold = np.power(10, -high_confidence_threshold)
        else:
            threshold = float(threshold_name)

        neg_log10_threshold = -np.log10(threshold)

        sub = edge_statistics.query('q_value <= @threshold')
        sub_prc = prc_curve_thresholds.query(
            'threshold >= @neg_log10_threshold')

        precision = sub_prc['precision'].min()
        recall = sub_prc['recall'].max()

        n_edges = len(sub)
        score = sub['score'].min()

        star = '* ' if threshold == SELECTED_FDR_THRESHOLD else ''
        logger.info(
            f'{star}q={threshold} -> score={score:.4f} edges={n_edges:,} '
            f'precision={precision:.2%} recall={recall:.2%}')

        score_thresholds.append([
            threshold_name, threshold, neg_log10_threshold, score, precision,
            recall, n_edges
        ])

    score_thresholds = pd.DataFrame(score_thresholds,
                                    columns=[
                                        'threshold_name', 'threshold',
                                        'neg_log10_threshold', 'score',
                                        'precision', 'recall', 'n_edges'
                                    ]).set_index('threshold_name')

    return edge_statistics, score_thresholds, evaluation_q_value
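
# Minimal sketch of the p-value -> q-value conversion applied above: Benjamini-Hochberg
# correction via statsmodels' multipletests. The p-values below are made up for
# illustration; in the pipeline they come from `best_estimator.p_values()`.
def _example_bh_q_values():
    toy_p_values = pd.Series([0.001, 0.02, 0.2, 0.8],
                             index=['a-b', 'a-c', 'b-c', 'b-d'],
                             name='p_value')
    __, q, __, __ = multipletests(toy_p_values, method='fdr_bh')
    return pd.Series(q, index=toy_p_values.index, name='q_value')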
def find_threshold_for_precision_target(prc_curve_thresholds,
                                        precision_target,
                                        plot_output_file=None):
    """
    Finds the q-value threshold that corresponds to the specified precision target.

    :param prc_curve_thresholds: PRC curve thresholds dataframe
    :param precision_target: precision target
    :param plot_output_file: (optional) file for the illustration plot
    :return:
    """
    logger = get_logger(__name__)

    target_threshold = prc_curve_thresholds.query(
        'precision >= @precision_target')['threshold'].idxmin()
    target_row = prc_curve_thresholds.loc[target_threshold]

    logger.info('Target row: {}'.format(target_row))
    logger.info('{:.2%} precision @ q={:.8e}'.format(
        target_row['precision'], 10**(-target_row['threshold'])))

    if plot_output_file is not None:
        from matplotlib import pyplot as plt

        with sns.plotting_context('paper'):
            fig = plt.figure(figsize=(3.5, 3))
            ax = plt.gca()

            prc_curve_thresholds.plot(x='threshold', y='precision',
                                      label='Precision', color='#EF3F74',
                                      ax=ax)
            prc_curve_thresholds.plot(x='threshold', y='recall',
                                      color='#3969AC', label='Recall', ax=ax)

            ax.axhline(target_row['precision'], linestyle='--', color='k',
                       label='')
            ax.axvline(target_row['threshold'], linestyle='--', color='k',
                       label='')

            ax.annotate('{:.2%} precision @ q={:.2e}'.format(
                target_row['precision'], 10**(-target_row['threshold'])),
                        xy=(target_row['threshold'], target_row['precision']),
                        xytext=(target_row['threshold'] + 1,
                                target_row['precision'] - 0.1))

            ax.annotate('{:.2%} recall @ q={:.2e}'.format(
                target_row['recall'], 10**(-target_row['threshold'])),
                        xy=(target_row['threshold'], target_row['recall']),
                        xytext=(target_row['threshold'] + 1,
                                target_row['recall'] + 0.02))

            sns.despine(ax=ax, trim=True, offset=5)
            ax.set_xlabel('Threshold = -log10(q)')
            ax.set_ylabel('Precision/Recall')
            ax.legend(loc='upper right')

            plt.tight_layout()
            plt.savefig(plot_output_file, bbox_inches='tight')
            plt.close()

    return target_row['threshold']
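
# Illustrative sketch only: the PRC-thresholds frame is expected to carry 'threshold'
# (-log10 q), 'precision' and 'recall' columns. The numbers below are made up; with a
# 0.8 precision target this returns the smallest qualifying threshold (here 3.0).
def _example_precision_target():
    toy_prc = pd.DataFrame({
        'threshold': [1.0, 2.0, 3.0, 4.0],
        'precision': [0.50, 0.70, 0.85, 0.95],
        'recall': [0.90, 0.60, 0.40, 0.20],
    })
    return find_threshold_for_precision_target(toy_prc, precision_target=0.8)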
def extract_epifactors():
    logger = get_logger(__name__)

    genes = pd.read_csv(_EPIFACTORS_GENES).replace('#', np.nan)
    complexes = pd.read_csv(_EPIFACTORS_COMPLEXES).replace('#', np.nan)

    gene_meta = pd.read_hdf(_INPUT_FILE, 'gene_meta')

    complexes = complexes.rename(
        columns={
            'Group_name': 'Complex group',
            'Complex_name': 'Complex name',
            'Id': 'EpiFactors ID (Complex)',
            'Group': 'EpiFactors ID (Complex group)',
            'Alternative_name': 'Alternative name',
            'UniProt_ID': 'UniProt entry',
            'UniProt_AC': 'Protein ID',
            'PMID_complex': 'PubMed ID (complex)',
            'PMID_function': 'PubMed ID (function)',
            'PMID_target': 'PubMed ID (target)',
            'Specific_target': 'Specific target',
            'Uniprot_ID_target': 'UniProt entry (target)',
            'Comment': 'EpiFactors comment'
        })

    genes = genes.rename(
        columns={
            'Id': 'EpiFactors ID (genes)',
            'HGNC_symbol': 'HGNC symbol',
            'Status': 'EpiFactors status',
            'HGNC_ID': 'HGNC ID',
            'HGNC_name': 'HGNC name',
            'GeneID': 'Entrez ID',
            'UniProt_AC': 'Protein ID',
            'UniProt_ID': 'UniProt entry',
            'MGI_symbol': 'MGI symbol',
            'MGI_ID': 'MGI ID',
            'UniProt_AC_Mm': 'Protein ID (mouse)',
            'UniProt_ID_Mm': 'UniProt entry (mouse)',
            'GeneTag': 'HGNC gene family tag',
            'GeneDesc': 'HGNC gene family description',
            'Complex_name': 'Complex names',
            'Specific_target': 'Specific target',
            'UniProt_ID_target': 'UniProt ID (target)',
            'PMID_target': 'PubMed ID (target)',
            'PMID_function': 'PubMed ID (function)'
        })

    complexes = complexes.set_index(['Complex group', 'Complex name'])

    erroneous_data = [
        # They list CERF twice, once with SWI/SNF, once with ISWI.
        # The ISWI group has incorrect "Protein" annotation.
        ('ISWI', 'CERF')
    ]
    complexes = complexes.loc[complexes.index.difference(erroneous_data)]

    # Assert that all complex names are unique
    assert len(complexes) == len(
        complexes.reset_index()['Complex name'].unique())

    memberships = []
    for ix, row in complexes.iterrows():
        parsed = parse_complex_members(row['UniProt entry'])
        for key, val in zip(complexes.index.names, ix):
            parsed[key] = val
        memberships.append(parsed)

    memberships = pd.concat(memberships).drop_duplicates()
    memberships = pd.merge(memberships,
                           genes[['UniProt entry', 'HGNC ID']],
                           how='left',
                           on='UniProt entry')

    # Map Gene labels to HGNC IDs
    hgnc_id_map = gene_meta['HGNC'].str.split(';', expand=True).stack()
    hgnc_id_map.name = 'HGNC'
    hgnc_id_map = hgnc_id_map.reset_index()
    del hgnc_id_map['level_1']
    hgnc_id_map = hgnc_id_map[hgnc_id_map['HGNC'] != '']
    hgnc_id_map['HGNC'] = hgnc_id_map['HGNC'].astype(int, errors='raise')
    hgnc_id_map = hgnc_id_map.drop_duplicates()

    # Merge that with EpiFactors genes so now we have a link to our IDs
    genes_merged = pd.merge(genes,
                            hgnc_id_map,
                            left_on='HGNC ID',
                            right_on='HGNC',
                            how='left')

    logger.info('Epifactors: Successfully mapped {}/{} ({:.2%})'.format(
        (~genes_merged['Gene label'].isnull()).sum(), len(genes_merged),
        (~genes_merged['Gene label'].isnull()).sum() / len(genes_merged)))

    memberships_merged = pd.merge(
        memberships,
        genes_merged[['HGNC symbol', 'HGNC ID', 'Gene label']],
        on='HGNC ID',
        how='left')

    # See whether the complex is complete/partial or missing from the data
    counts = memberships_merged.groupby(['Complex group',
                                         'Complex name']).count()

    def _assign_type(row):
        if row['Gene label'] == 0:
            return 'missing'
        elif row['Gene label'] < row['HGNC ID']:
            return 'partial'
        else:
            return 'complete'

    complex_presence = counts.apply(_assign_type, axis=1)
    complex_presence.name = 'complex_presence'

    memberships_merged = memberships_merged.join(
        complex_presence, on=['Complex group', 'Complex name'])

    return memberships_merged
def _informative_nucleosome_graph():
    logger = get_logger('informative_nucleosome_graph')

    predictors = pd.read_hdf(META_FILE, '/meta/predictors')

    network = nx.DiGraph()

    for pd_1, row_1 in predictors.iterrows():
        n_ptms = row_1.sum()
        if n_ptms == 1:
            # This is a self-describing nucleosome. Throw that into the graph
            ptm = row_1[row_1].index[0]
            network.add_edge(SPECIAL_PULLDOWN, pd_1, predictor=ptm)
        elif n_ptms in PREDICTOR_EXCEPTIONS:
            exceptions = PREDICTOR_EXCEPTIONS[n_ptms]
            for preds, ptm in exceptions:
                non_preds = row_1.index.difference(preds)
                if row_1[preds].all() and not row_1[non_preds].any():
                    network.add_edge(SPECIAL_PULLDOWN, pd_1, predictor=ptm)
                    break

    for (pd_1, row_1), (pd_2, row_2) in itertools.combinations(
            predictors.iterrows(), 2):
        # Skip identical pull-downs
        if pd_2 == pd_1:
            continue

        diffs = row_1 != row_2
        n_differences = diffs.sum()

        # This is self-describing
        if n_differences == 1:
            ptm = diffs[diffs].index[0]

            if row_1[ptm]:
                # Row 1 has the extra PTM. Edge is pd_2 -> pd_1
                network.add_edge(pd_2, pd_1, predictor=ptm)
            else:
                # Row 2 has the extra PTM, edge is pd_1 -> pd_2
                network.add_edge(pd_1, pd_2, predictor=ptm)
        # These are exceptions
        elif n_differences in PREDICTOR_EXCEPTIONS:
            exceptions = PREDICTOR_EXCEPTIONS[n_differences]
            for preds, ptm in exceptions:
                non_preds = row_1.index.difference(preds)
                if row_1[preds].all() and (~row_2[preds]).all():
                    assert not diffs[non_preds].any()
                    network.add_edge(pd_2, pd_1, predictor=ptm)
                    break
                elif (~row_1[preds]).all() and (row_2[preds]).all():
                    assert not diffs[non_preds].any()
                    network.add_edge(pd_1, pd_2, predictor=ptm)
                    break

    network_df = []
    for from_, to_, ptm in network.edges.data('predictor'):
        network_df.append([from_, to_, ptm])
    network_df = pd.DataFrame(network_df,
                              columns=['from_pd', 'to_pd', 'predictor'])

    logger.info(
        'PTM predictive network generated: {:,} nodes, {:,} edges'.format(
            len(network.nodes), len(network.edges)))

    non_informative_nucleosomes = [
        ix for ix in predictors.index if ix not in network.nodes
    ]
    logger.info('Found {:,} non-informative di-nucleosomes: {!r}'.format(
        len(non_informative_nucleosomes), non_informative_nucleosomes))

    not_covered_predictors = [
        ix for ix in predictors.columns
        if ix not in network_df['predictor'].unique()
    ]
    logger.info('Found {:,} not covered predictors: {!r}'.format(
        len(not_covered_predictors), not_covered_predictors))

    predictor_counts = network_df['predictor'].value_counts()
    _text = [
        f'{k:>20}: {v:,} nucleosomes' for k, v in predictor_counts.items()
    ]
    _text = '\n'.join(_text)
    logger.info(
        'The numbers of nucleosomes for each predictor are:\n{}'.format(_text))

    # Assign edge names
    network_df['edge'] = network_df['to_pd'].str.cat(network_df['from_pd'],
                                                     sep=EDGE_SEPARATOR)

    return network, network_df
def run():
    logger = get_logger(__name__)

    # Load training results
    with pd.HDFStore(TRAINING_OUTPUT, 'r') as store:
        matrix = store['input/matrix']
        n_significant = store['input/n_significant']
        n_nonzero = store['input/n_nonzero']

        edge_statistics = store['output/edge_statistics']
        score_thresholds = store['output/score_thresholds']

        subset_for_networks = store['output/subset_for_networks'].values

    # For drawing it's easiest to recreate the evaluation object.
    # For q-value based curves we have to re-evaluate the standard using neg-log10-q
    standard = load_standard(matrix)
    evaluation_q_value = standard.evaluate(edge_statistics['neg_log10_q'])

    # Some extra data goes here
    complex_memberships_str = pd.read_hdf(CURATED_COMPLEXES_OUTPUT,
                                          'complex_memberships_str')

    # Augment edge data with some metadata from us
    protein_meta = pd.read_hdf(CLEANUP_FILE, 'protein_meta')
    meta = protein_meta[[
        'Majority protein IDs', 'Gene names', 'Protein names'
    ]]

    # -- Let's do some network stuff now --

    # Plot networks; for now let's do a bunch of thresholds
    community_output = {}
    community_color_output = {}
    pos_output = {}

    with timed_segment('Writing networks', logger=logger):
        for threshold_name in score_thresholds.index:
            real_threshold = score_thresholds.loc[threshold_name,
                                                  'neg_log10_threshold']

            # For the HC network, add known edges.
            if threshold_name == 'high-confidence':
                add_unpredicted_edges = True
            else:
                add_unpredicted_edges = False

            network = evaluation_q_value.to_network(
                real_threshold,
                node_subset=subset_for_networks,
                add_unpredicted_edges=add_unpredicted_edges,
                remove_orphan_nodes=True)

            logger.info(
                'Network for drawing q={}, nodes:{:,}, edges:{:,}'.format(
                    threshold_name, len(network.nodes), len(network.edges)))

            # Annotate nodes with n_significant
            _n_significant_dict = dict(n_significant.loc[list(network.nodes)])
            _n_significant_dict = {
                k: int(v) for k, v in _n_significant_dict.items()
            }
            nx.set_node_attributes(network,
                                   name='n_significant',
                                   values=_n_significant_dict)

            # Annotate nodes with n_nonzero
            _n_nonzero_dict = dict(n_nonzero.loc[list(network.nodes)])
            _n_nonzero_dict = {k: int(v) for k, v in _n_nonzero_dict.items()}
            nx.set_node_attributes(network,
                                   name='n_nonzero',
                                   values=_n_nonzero_dict)

            # Annotate nodes with complex memberships
            _complex_memberships_dict = {
                node: complex_memberships_str.get(node, '')
                for node in network.nodes
            }
            nx.set_node_attributes(network,
                                   name='complex_memberships',
                                   values=_complex_memberships_dict)

            # Reseed RNG -- for some reason the output is non-deterministic.
            random.seed(RANDOM_STATE)
            np.random.seed(RANDOM_STATE)

            # Add colours
            if threshold_name != 'high-confidence':
                # Style network with communities
                communities = extract_communities(network)
                annotate_with_communities(network, communities)
                community_colors = colour_communities(network, communities)
                style_network_communities(network, community_colors)

                # Save stuff
                community_output[threshold_name] = communities_to_series(
                    communities)

                community_colors = pd.Series(community_colors)
                community_colors.index.name = 'Community'
                community_colors.name = 'Color'
                community_color_output[threshold_name] = community_colors
            else:
                # For HC colour them blue
                colour_nodes(network, '#378cb9')

            style_network_edges(network)

            # Main network
            basename = NETWORK_FILES_BASENAME_TEMPLATE.format(threshold_name)

            if threshold_name not in MANUAL_POSITION_OVERRIDES:
                # Generate seed layout
                seed_kwargs = dict(gravity=40, scalingRatio=4)
                if threshold_name == 'high-confidence':
                    seed_kwargs = dict(gravity=100, scalingRatio=15)

                seed_pos = fa2_pos(network, **seed_kwargs)
                # Add the position information
                style_position(network, seed_pos)

                if threshold_name == 'high-confidence':
                    gephi_kwargs = dict(gravity=100,
                                        scale=15,
                                        duration=5,
                                        proportion=0.6,
                                        plot='false')
                else:
                    gephi_kwargs = dict(gravity=40,
                                        scale=4,
                                        duration=15,
                                        proportion=0.6,
                                        plot='false')

                tmp_dir = tempfile.mkdtemp()
                try:
                    gephi_forceatlas2_layout(network,
                                             basename,
                                             outdir=tmp_dir,
                                             **gephi_kwargs)

                    # Now get the positions from the gephi layout back:
                    pos = parse_pos_from_1_3_gexf(
                        os.path.join(tmp_dir, basename + '.gephi.gexf'))
                finally:
                    try:
                        shutil.rmtree(tmp_dir)
                        del tmp_dir
                    except FileNotFoundError:
                        pass
            else:
                logger.info(
                    f'Using manual override for {threshold_name} network')
                override_file = MANUAL_POSITION_OVERRIDES[threshold_name]

                if override_file.endswith('.gexf'):
                    pos = parse_pos_from_1_3_gexf(override_file)
                elif override_file.endswith('.cyjs'):
                    pos = parse_pos_from_cyjs(override_file)
                else:
                    raise NotImplementedError(
                        f'Cannot parse override file for {threshold_name}')

                for node in network.nodes:
                    assert node in pos, \
                        f'Override network does not contain node {node!r}'

            # Update the networkx layout to have it
            style_position(network, pos)

            # Draw the network
            if threshold_name == 'high-confidence':
                gephi_draw_kwargs = dict(duration=0,
                                         plot='true',
                                         rescaleedgeweight='true',
                                         minweight=2.0,
                                         maxweight=2.0,
                                         edgecolor="ORIGINAL",
                                         straight="true",
                                         nodeborderwidth=1.0)
            else:
                gephi_draw_kwargs = dict(duration=0,
                                         plot='true',
                                         rescaleedgeweight='true',
                                         minweight=2.0,
                                         maxweight=20.0,
                                         nodeborderwidth=0.0)

            gephi_forceatlas2_layout(network,
                                     basename,
                                     outdir=NETWORK_GRAPH_OUTPUT_DIRECTORY,
                                     **gephi_draw_kwargs)

            # Also make it in dataframe form, and attach it to node_meta
            pos_as_df = []
            for ix, (x, y) in pos.items():
                pos_as_df.append([ix, x, y])
            pos_as_df = pd.DataFrame(
                pos_as_df,
                columns=[meta.index.name, 'network_pos_x',
                         'network_pos_y']).set_index(meta.index.name)
            pos_output[threshold_name] = pos_as_df

            # Let networkx write gexf for itself. At this point it doesn't
            # support gexf v1.3, so it won't be able to read the forceatlas2
            # output...
            write_gexf_compatible_with_cytoscape(
                network,
                os.path.join(NETWORK_GRAPH_OUTPUT_DIRECTORY,
                             basename + NETWORKX_GEXF_SUFFIX))

    main_communities = community_output[SELECTED_FDR_THRESHOLD]
    main_pos = pos_output[SELECTED_FDR_THRESHOLD]

    meta = meta.join(n_significant).join(n_nonzero).join(main_communities)
    meta = meta.join(main_pos)

    with pd.HDFStore(OUTPUT_HDF_FILE, 'w', complevel=9,
                     complib='lzo') as output_store:
        output_store['input/random_state'] = pd.Series(RANDOM_STATE)
        output_store['output/edge_statistics'] = edge_statistics
        output_store['output/node_meta'] = meta

        for threshold in community_output:
            output_store['output/communities/{}/members'.format(
                threshold)] = community_output[threshold]
            output_store['output/communities/{}/colors'.format(
                threshold)] = community_color_output[threshold]
def extract_communities(graph, collapse_satellites=True):
    """
    Extracts graph communities using the `community` (python-louvain) package.

    Automatically collapses satellite nodes into a "satellites" community.

    :param graph:
    :param collapse_satellites:
    :return:
    """
    logger = get_logger('extract_communities')

    communities = community.best_partition(graph)

    community_partition = {}
    for k, v in communities.items():
        try:
            community_partition[v].append(k)
        except KeyError:
            community_partition[v] = [k]

    logger.info('Data partitioned into {:,} communities'.format(
        len(community_partition)))

    # Name communities based on the protein with the largest closeness centrality
    community_names = {}
    for id_, members in community_partition.items():
        subgraph = graph.subgraph(members)
        centralities = nx.algorithms.centrality.closeness_centrality(subgraph)

        # Sort decreasing by centrality (hence -x[1]), ties broken by name
        sorted_centralities = sorted(centralities.items(),
                                     key=lambda x: (-x[1], x[0]))

        name = sorted_centralities[0][0]
        community_names[id_] = name

    community_members = {
        community_names[k]: frozenset(v)
        for k, v in community_partition.items()
    }

    if collapse_satellites:
        block = community_blocks(graph, community_members)
        degrees = dict(nx.degree(block))

        satellites = set()
        MIN_NODES = 5
        for node, degree in degrees.items():
            if degree == 0 and len(community_members[node]) < MIN_NODES:
                satellites.add(node)

        no_satellite_communities = {
            k: v
            for k, v in community_members.items() if k not in satellites
        }

        satellite_members = set()
        for satellite in satellites:
            satellite_members.update(community_members[satellite])

        no_satellite_communities['satellites'] = frozenset(satellite_members)
        community_members = no_satellite_communities

    return community_members
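
# Illustrative sketch only: running community extraction on a toy graph. Uses the
# bundled karate-club example from networkx purely to show the return shape
# (a dict mapping the most central member's name to a frozenset of members).
def _example_extract_communities():
    toy = nx.karate_club_graph()
    toy = nx.relabel_nodes(toy, {n: 'node_{}'.format(n) for n in toy.nodes})
    return extract_communities(toy, collapse_satellites=False)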
def main():
    ensure_directory_exists(OUTPUT_FILE)
    logger = get_logger('models.ptm_response.main')

    long_matrices, network_df = longform_matrices_of_informative_nucleosomes()
    pulldown_predictors = pd.read_hdf(PULLDOWN_META, '/meta/predictors')

    with pd.HDFStore(OUTPUT_FILE, 'w') as store:
        joint_limma_stats = []
        joint_camera_complexes = []

        store['/ptm_stats/network_df'] = network_df

        for predictor, long_matrix in long_matrices.items():
            store[f'/ptm_stats/{predictor}/long_matrix'] = long_matrix

            n_edges = long_matrix.reset_index()['edge'].nunique()
            if n_edges < MIN_N_EDGES:
                logger.info(f'Not analysing {predictor} as it has only '
                            f'{n_edges:,} supporting edges')
                continue

            matrix, design, weight = to_matrix_design_and_weights(
                long_matrix, min_unimputed=MIN_UNIMPUTED)

            store[f'/ptm_stats/{predictor}/matrix'] = matrix
            store[f'/ptm_stats/{predictor}/design'] = design
            store[f'/ptm_stats/{predictor}/weight'] = weight

            n = len(matrix)
            logger.info(f'Will analyse {n:,} proteins for {predictor}')

            coef = 'ptm'
            limma_result, __ = limma_fit(matrix, design, weight, coef,
                                         fdr_threshold=FDR_THRESHOLD_RESPONSE,
                                         fc_threshold=FC_THRESHOLD_RESPONSE)

            for key, value in limma_result.items():
                store[f'/ptm_stats/{predictor}/limma/{key}'] = value

            limma_stats = limma_result['stats']
            limma_stats['predictor'] = predictor
            joint_limma_stats.append(limma_stats.reset_index())

            n_non_null = (~limma_stats['logFC'].isnull()).sum()
            non_null_ratio = n_non_null / n
            logger.info(f'{n_non_null:,}/{n:,} ({non_null_ratio:.2%}) '
                        f'analysed proteins are non-null for {predictor}')

            n_significant = limma_stats['significant'].sum()
            n_significant_with_fc = limma_stats['significant_and_large_fc'].sum()
            n_significant_ratio = n_significant / n_non_null

            logger.info(
                f'Limma has found: {n_significant:,}/{n_non_null:,} '
                f'({n_significant_ratio:.2%}) proteins respond to {predictor} '
                f'(FDR {FDR_THRESHOLD_RESPONSE}), out of which '
                f'{n_significant_with_fc:,} have FC of at least '
                f'{FC_THRESHOLD_RESPONSE}')

            ce = limma_camera_complexes(matrix, design, weight, limma_stats,
                                        coef=coef,
                                        min_size=ENRICHMENT_MIN_SIZE_COMPLEXES,
                                        max_size=ENRICHMENT_MAX_SIZE_COMPLEXES)
            ce['significant'] = ce['FDR'] <= ENRICHMENT_FDR_THRESHOLD
            ce['predictor'] = predictor

            store[f'/ptm_stats/{predictor}/camera/complexes'] = ce
            joint_camera_complexes.append(ce.reset_index())

            # Main training done, now process dropouts
            dropouts = edges_for_predictor_dropouts(network_df, predictor,
                                                    pulldown_predictors,
                                                    min_edges=MIN_N_EDGES)

            dropout_list = pd.Series(list(dropouts.keys()),
                                     name=f'Dropouts for {predictor}')
            store[f'/ptm_stats/{predictor}/dropout/list'] = dropout_list

            for dropout, remaining_edges in dropouts.items():
                submatrix = long_matrix.loc(axis=0)[:, remaining_edges]

                matrix, design, weight = to_matrix_design_and_weights(
                    submatrix, min_unimputed=MIN_UNIMPUTED)

                store[f'/ptm_stats/{predictor}/dropout/{dropout}/matrix'] = matrix
                store[f'/ptm_stats/{predictor}/dropout/{dropout}/design'] = design
                store[f'/ptm_stats/{predictor}/dropout/{dropout}/weight'] = weight

                n = len(matrix)
                logger.info(f'Will analyse {n:,} proteins for {predictor} '
                            f'dropout {dropout}')

                limma_result, __ = limma_fit(matrix, design, weight, 'ptm',
                                             fdr_threshold=FDR_THRESHOLD_RESPONSE,
                                             fc_threshold=FC_THRESHOLD_RESPONSE)

                for key, value in limma_result.items():
                    store[f'/ptm_stats/{predictor}/dropout/{dropout}/limma/{key}'] = value

        joint_limma_stats = pd.concat(
            joint_limma_stats,
            ignore_index=True).set_index(['predictor', 'Gene label'])
        store['/ptm_stats/joint_limma_stats'] = joint_limma_stats

        joint_camera_complexes = pd.concat(
            joint_camera_complexes,
            ignore_index=True).set_index(['predictor', 'Complex'])
        store['/ptm_stats/joint_camera_complexes'] = joint_camera_complexes