Example #1
    def compute_pairwise_information(cls, data, method, kwargs=None):
        logger = get_logger(__name__)
        if method == 'mutual-information':
            minet = importr('minet')
            pandas2ri.activate()
            if kwargs is None:
                kwargs = {}

            estimator = kwargs.pop('estimator', 'mi.shrink')
            disc = kwargs.pop('disc', 'equalwidth')
            nbins = kwargs.pop('nbins', np.sqrt(len(data.columns)))

            logger.debug('Running minet.build_mim(estimator={!r}, '
                         'disc={!r}, nbins={!r})'.format(
                             estimator, disc, nbins))
            r_info = minet.build_mim(data.T,
                                     estimator=estimator,
                                     disc=disc,
                                     nbins=nbins,
                                     **kwargs)
            info = np.asarray(r_info)

            del r_info, minet
            gc.collect()
            pandas2ri.deactivate()
        else:
            raise ValueError(
                'Unsupported information method: {!r}'.format(method))

        info = pd.DataFrame(info, index=data.index, columns=data.index)

        return info
def gephi_forceatlas2_layout(network, desired_basename, outdir, **kwargs):
    """
    Generates forceatlas2 layout using gephi.

    Uses `gephi-toolkit-forceatlas2-standalone` behind the scenes.
    See https://github.com/lukauskas/gephi-toolkit-forceatlas2-standalone

    Produces a gexf file and a pdf.

    :param network: network to layout
    :param desired_basename: basename to give to the gexf and pdf files
    :param outdir: output directory to store pdf and gexf
    :param kwargs: kwargs passed to gephi-toolkit-forceatlas2-standalone, see README
    :return:
    """
    logger = get_logger('gephi_forceatlas2_layout')

    tmp_dir = tempfile.mkdtemp()
    try:
        tmp_file = os.path.join(tmp_dir, desired_basename + '.gexf')
        nx.write_gexf(network, tmp_file)

        logger.info('Gephi processing {}'.format(desired_basename))
        _gephi_forceatlas2(tmp_file, outdir=outdir, **kwargs)
        logger.info('Gephi processing for {} done'.format(desired_basename))
    finally:
        try:
            shutil.rmtree(tmp_dir)
            del tmp_dir
        except FileNotFoundError:
            pass
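# Hedged usage sketch (not part of the original source): laying out a small
# networkx graph with the helper above. The keyword arguments mirror the
# gephi_kwargs used later in this file; the output directory is a placeholder.
import networkx as nx

toy_graph = nx.les_miserables_graph()
gephi_forceatlas2_layout(toy_graph,
                         desired_basename='les-miserables',
                         outdir='/tmp/gephi-output',  # placeholder directory
                         gravity=40,
                         scale=4,
                         duration=15,
                         plot='true')
# Per the docstring above, this should leave a .gexf layout and a PDF render
# in the output directory, provided gephi-toolkit-forceatlas2-standalone is
# installed and configured.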
def load_matrix(min_significant=0):
    logger = get_logger(__name__)

    enrichment_data = pd.read_hdf(ENRICHMENT_DATASET, 'enrichment_data')

    n_significant = enrichment_data['significant'].fillna(False).groupby(
        level='Gene label').sum().astype(int)
    n_significant.name = 'n_significant'

    within_thr = (n_significant >= min_significant).sum()
    total = len(n_significant)

    logger.info('{:,}/{:,} genes ({:.2%}) '
                'have n_significant >= {}'.format(within_thr, total,
                                                  within_thr / total,
                                                  min_significant))

    _n_significant = n_significant.reindex(enrichment_data.index, level=0)
    enrichment_data = enrichment_data[_n_significant >= min_significant]

    COLUMNS = [
        'Ratio H/L normalized (log2) (adjusted, imputed, forward)',
        'Ratio H/L normalized (log2) (adjusted, imputed, reverse)'
    ]
    data = enrichment_data[COLUMNS].copy()
    # This is only to make results easier to interpret visually:
    data['Ratio H/L normalized (log2) (adjusted, imputed, reverse)'] *= -1.0
    matrix = data.unstack('Pull-Down ID')

    n_nonzero = (matrix != 0).sum(axis=1)
    n_nonzero.name = 'n_nonzero'

    return remove_all_zero_rows(matrix), n_significant, n_nonzero
Example #4
    def evaluate(self, scores, ignore_na=False):
        logger = get_logger(__name__)

        true = self.interaction_metadata_['interaction_exists']

        scores = self.remove_misleading_edges(scores)
        scores = scores.replace([np.inf, -np.inf], np.nan)

        if scores.isnull().any():
            if scores.isnull().all():
                raise ValueError('All scores are null')

            if not ignore_na:
                raise ValueError('Some scores are null or infinite')
            else:
                logger.warning(
                    'Some scores are NaN or infinite. Assuming they are the lowest scores'
                )
                scores = scores.fillna(scores.min() - 1e-6)

        if not scores.index.equals(true.index):
            raise ValueError(
                'Score index does not match true interactions index')

        return EvaluationResult(true, scores)
def remove_all_zero_rows(matrix):
    logger = get_logger(__name__)

    ans = matrix[(matrix != 0).any(axis=1)]

    logger.info('Removed {:,} ({:.2%}) all-zero rows'.format(
        len(matrix) - len(ans),
        (len(matrix) - len(ans)) / len(matrix)))

    return ans
def train_estimators(matrix):

    logger = get_logger(__name__)
    logger.info('Matrix shape provided to train_estimators: {}'.format(
        matrix.shape))

    # Training
    estimators = deepcopy(ESTIMATORS)
    distances = _precompute_distances(matrix)
    estimators = _do_training(matrix, distances, estimators)

    return estimators
def merge_complexes_according_to_rules(full_complexes):
    logger = get_logger('curated_complexes.merge_complexes_according_to_rules')

    forced_merges = set()

    for complexes_to_merge in _FORCED_MERGES:
        for a, b in itertools.permutations(complexes_to_merge, 2):
            forced_merges.add((a, b))

    seen_complexes = set()
    seen_complexes.update(_TO_REMOVE)

    full_complexes_merged = []
    for complex_a, subdf_a in tqdm(full_complexes.groupby('Complex')):
        if complex_a in seen_complexes:
            continue

        seen_complexes.add(complex_a)

        to_merge = [subdf_a]

        for complex_b, subdf_b in full_complexes.groupby('Complex'):

            if complex_a == complex_b or complex_b in seen_complexes:
                continue

            equal = set(subdf_a['Gene label'].dropna()) == set(
                subdf_b['Gene label'].dropna())

            if equal or (complex_a, complex_b) in forced_merges:
                seen_complexes.add(complex_b)
                to_merge.append(subdf_b)

        if len(to_merge) == 1:
            full_complexes_merged.append(to_merge[0])
        else:
            _joint = pd.concat(to_merge, ignore_index=True)

            new_complex_name = '|'.join(sorted(_joint['Complex'].unique()))
            _grouped = merge_complex(_joint, new_name=new_complex_name)

            full_complexes_merged.append(_grouped)

    full_complexes_merged = pd.concat(full_complexes_merged, ignore_index=True)

    logger.info(
        'Total number of complexes after merging EpiFactors and EBI: {:,}'.
        format(full_complexes_merged['Complex'].nunique()))

    return full_complexes_merged
def _do_training(matrix, distances, estimators):

    logger = get_logger(__name__)

    for label, estimator in tqdm(estimators.items(),
                                 desc='Training estimators'):

        with timed_segment('Training {}'.format(label), logger):
            if label.endswith('MI'):
                estimator.fit(distances['mutual-information'])
            else:
                estimator.fit(matrix)

    return estimators
def _precompute_distances(matrix):
    logger = get_logger(__name__)

    distances = {}
    for method in tqdm(['mutual-information'],
                       desc='Precomputing distance matrices'):

        with timed_segment('Precomputing: {}'.format(method), logger):
            kwargs = None
            minet_method = method

            if method == 'mutual-information':
                kwargs = {'estimator': 'mi.mm'}

            distances[method] = MinetNetwork.compute_pairwise_information(
                matrix, method=minet_method, kwargs=kwargs)

    return distances
def pick_best_estimator(pretrained_estimators, standard, criterion,
                        output_csv):

    logger = get_logger("pick_best_estimator")

    # First we need to evaluate each estimator against the standard using ROC/PRC curves.
    results = OrderedDict()
    for label, estimator in tqdm(pretrained_estimators.items(),
                                 desc='Evaluating estimators'):
        results[label] = standard.evaluate(estimator.adjacency_)

    results_summary = summarise_results(results, criterion=criterion)
    results_summary.to_csv(output_csv)

    best = results_summary.index[0]
    logger.info('Best estimator: {}'.format(best))

    return best, results, results_summary
def process_complexes():

    logger = get_logger('curated_complexes')

    bar = tqdm(total=3, desc='Compiling complexes list: EBI')
    ebi_complex_memberships = _load_ebi_complexes()

    bar.update()
    bar.set_description('Compiling complexes list: Epifactors')

    epifactors_complex_memberships = _load_epifactors_complexes()
    manual_memberships, manual_removals = _load_manual_complexes()

    bar.update()
    bar.set_description('Compiling complexes list: Merging')

    full_complexes = pd.concat(
        [ebi_complex_memberships, epifactors_complex_memberships],
        ignore_index=True)

    full_complexes = merge_complexes_according_to_rules(full_complexes)
    full_complexes['Complex'] = full_complexes['Complex'].map(
        lambda x: _RENAMING_MAP.get(x, x))

    complete_complexes = merge_with_manual(full_complexes, manual_memberships,
                                           manual_removals)
    logger.info('Number of complexes (including manual): {:,}'.format(
        complete_complexes['Complex'].nunique()))

    complete_complexes = complete_complexes.sort_values(
        by=['Complex', 'Gene label'])

    complex_memberships_str = complete_complexes.groupby([
        'Gene label'
    ])['Complex'].apply(lambda x: ';'.join(sorted(x.dropna().unique())))

    with pd.HDFStore(OUTPUT_FILE, 'w', complevel=9, complib='lzo') as store:
        store['curated_complexes'] = complete_complexes
        store['complex_memberships_str'] = complex_memberships_str

    complete_complexes.to_excel(OUTPUT_EXCEL_FILE, index=False)

    bar.update()
    bar.set_description('Compiling complexes list: Done')
Example #12
def extract_ebi():

    logger = get_logger('ebi_complexes')

    ebi_complexes = pd.read_csv(_EBI_INPUT_FILE, sep='\t')

    ebi_complexes = ebi_complexes.rename(columns={
        '#Complex ac': 'Complex accession'
    }).set_index('Complex accession')

    ebi_memberships = ebi_complexes[
        'Identifiers (and stoichiometry) of molecules in complex'].str.split(
            '|', expand=True).stack()
    # Remove stoichiometry information
    ebi_memberships = ebi_memberships.str.replace(r'\(\d+\)', '', regex=True)
    ebi_memberships.index.names = [ebi_memberships.index.names[0], 'Subunit']
    ebi_memberships.name = 'UniProt ID'

    # Remove identifiers that have ':' in them, such as 'CHEBI:smth'
    ebi_memberships = ebi_memberships[~ebi_memberships.str.contains(':')]

    # Remove isoform-specific identifiers, such as "O00159-3"
    ebi_memberships = ebi_memberships.str.split('-').str[0]

    # Get only complexes that have more than one subunit
    more_than_one_subunit = ebi_memberships.groupby(
        level='Complex accession').size() > 1
    more_than_one_subunit = more_than_one_subunit[more_than_one_subunit].index

    ebi_memberships = ebi_memberships.loc[list(more_than_one_subunit)]

    # Merge EBI complexes with our identifiers
    protein_id_map = get_protein_id_map()
    ebi_merged = pd.merge(ebi_memberships.reset_index(),
                          protein_id_map,
                          left_on='UniProt ID',
                          right_on='Protein ID',
                          how='left')

    # Add complex name to the output
    ebi_merged = ebi_merged.join(ebi_complexes['Recommended name'],
                                 on='Complex accession')

    return ebi_merged
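# Illustrative sketch (invented toy data, not from the EBI file): the
# split / stack / replace pattern used in extract_ebi above turns a
# one-row-per-complex membership string into one row per subunit, with the
# "(stoichiometry)" suffix stripped.
import pandas as pd

toy = pd.Series({'CPX-1': 'P12345(2)|Q67890(1)',
                 'CPX-2': 'P11111(4)'},
                name='Identifiers (and stoichiometry) of molecules in complex')
toy.index.name = 'Complex accession'

members = toy.str.split('|', expand=True).stack()
members = members.str.replace(r'\(\d+\)', '', regex=True)
members.index.names = ['Complex accession', 'Subunit']
members.name = 'UniProt ID'
# members now has one row per (complex, subunit):
#   ('CPX-1', 0) -> 'P12345', ('CPX-1', 1) -> 'Q67890', ('CPX-2', 0) -> 'P11111'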
Example #13
def fetch_gene_meta(protein_id_map, sep=';'):
    logger = get_logger(__name__)

    unique_ids = protein_id_map['Protein ID'].unique()
    mg_response = query_mygene_from_cache(unique_ids)

    df = pd.merge(protein_id_map,
                  mg_response,
                  left_on='Protein ID',
                  right_index=True)

    columns_to_merge = [
        x for x in MYGENE_FIELDS
        if x not in ['alias', 'ensembl.protein', 'interpro']
    ]

    long_form_metadata_fields = ['interpro']

    new_df = []
    new_ix = []

    long_form_datasets = {}

    for protein_id_group, subdata in df.groupby(INDEX_COL):
        new_ix.append(protein_id_group)
        row = []

        other_names = set()
        for aliases in subdata['alias'].dropna():
            if isinstance(aliases, str):
                other_names.add(aliases)
            else:
                other_names.update(aliases)

        row.append(other_names)

        ensembl_protein_ids = set()
        for ensembl_data in subdata['ensembl'].dropna():
            if isinstance(ensembl_data, dict):
                # sometimes they return a list of dicts, sometimes just a dict...
                ensembl_data = [ensembl_data]

            for ensembl_subdata in ensembl_data:
                ensembl_subdata = ensembl_subdata['protein']
                if isinstance(ensembl_subdata, str):
                    # Sometimes they return just a string..
                    ensembl_protein_ids.add(ensembl_subdata)
                else:
                    ensembl_protein_ids.update(ensembl_subdata)

        row.append(ensembl_protein_ids)

        for col in columns_to_merge:
            series = subdata[col].dropna()
            if col == 'entrezgene':
                # Convert to int, then to str
                # this has to be done after NAs are removed though
                series = series.astype(int).astype(str)

            unique = series.unique()
            row.append(list(unique))

        new_df.append(row)

        for col in long_form_metadata_fields:
            if col not in long_form_datasets:
                long_form_datasets[col] = []

            dict_dataset = subdata[col].dropna()
            for dicts in dict_dataset.values:
                if isinstance(dicts, dict):
                    dicts = [dicts]

                for d in dicts:
                    d = pd.Series(d)
                    d[INDEX_COL] = protein_id_group
                    long_form_datasets[col].append(d)

    for col in long_form_datasets:
        long_form_datasets[col] = pd.DataFrame(
            long_form_datasets[col]).drop_duplicates()

        if col == 'interpro':
            long_form_datasets[col] = long_form_datasets[col].set_index(
                [INDEX_COL, 'id'])
        else:
            raise NotImplementedError(
                'Specify index column for {!r} please'.format(col))

    new_df = pd.DataFrame(new_df,
                          columns=['alias'] + ['ensembl_protein_id'] +
                          columns_to_merge,
                          index=pd.Index(new_ix, name=INDEX_COL))
    # sort aliases & ensembl protein ids
    new_df['alias'] = new_df['alias'].apply(lambda x: sep.join(sorted(x)))
    new_df['ensembl_protein_id'] = new_df['ensembl_protein_id'].apply(
        lambda x: sep.join(sorted(x)))

    def _join(x):
        try:
            return sep.join(x)
        except TypeError:
            logger.debug('Error joining {!r}'.format(x))
            raise

    # Preserve order for other names
    for col in columns_to_merge:
        new_df[col] = new_df[col].apply(_join)

    return new_df, long_form_datasets, mg_response
def main():

    if not os.path.isdir(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    import random
    random.seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)

    logger = get_logger(__name__)
    logger.info('Random state: {}'.format(RANDOM_STATE))

    # Load dataset
    matrix, n_significant, n_nonzero = load_matrix(
        min_significant=MIN_SIGNIFICANT)

    # Train the estimators
    estimators = train_estimators(matrix)

    # Get the standard that we will use for evaluation
    standard = load_standard(matrix)
    nx.write_gexf(standard.to_network(), OUTPUT_STANDARD_NETWORK_FILE)

    # Decide which estimator is the best.
    best, training_results, training_results_summary = pick_best_estimator(
        estimators, standard, ESTIMATOR_SELECTION_CRITERION, OUTPUT_CSV_FILE)

    best_estimator = estimators[best]

    # Gather q-values from estimators
    edge_statistics, score_thresholds, evaluation_q_value = extract_edge_statistics_and_thresholds(
        best_estimator, standard)
    score_thresholds.to_csv(OUTPUT_SCORE_THRESHOLDS, sep='\t')

    # Some extra plots
    with timed_segment('Plotting ROC/PRC curves', logger=logger):
        plot_roc_prc(training_results_summary.index,
                     best=best,
                     thresholds=score_thresholds,
                     colors=estimator_color_palette(),
                     results=training_results,
                     results_summary=training_results_summary)

    plot_score_breakdown_per_metadata(edge_statistics['score'], standard)

    # Plot the fit of the q-value function
    with timed_segment('Plotting score distributions', logger):
        plot_score_distributions(edge_statistics,
                                 best_estimator.nonzero_score_density_function,
                                 OUTPUT_SCORE_DISTRIBUTIONS_FILE)

    # Get subset for further analyses (n_nonzero >= threshold, and not-an-orphan)
    subset_for_networks = n_nonzero[
        n_nonzero >= N_NONZERO_THRESHOLD_FOR_NETWORK_DRAWING].index
    network = evaluation_q_value.to_network(-np.log10(SELECTED_FDR_THRESHOLD),
                                            node_subset=subset_for_networks,
                                            add_unpredicted_edges=False,
                                            remove_orphan_nodes=True)
    drawable_subset = sorted(set(network.nodes))

    orphans = sorted(set(subset_for_networks) - set(drawable_subset))
    logger.info('{:,} nodes at FDR={} are orphans'.format(
        len(orphans), SELECTED_FDR_THRESHOLD))
    with open(os.path.join(OUTPUT_DIRECTORY, 'orphan-nodes.txt'), 'w') as f:
        f.write('\n'.join(orphans))

    _fmt = 'For graph visualisation we keep only non-orphan nodes with ' \
           'n_nonzero >= {:,}. This leaves {:,}/{:,} ({:.2%}) nodes'
    logger.info(
        _fmt.format(N_NONZERO_THRESHOLD_FOR_NETWORK_DRAWING,
                    len(drawable_subset), len(n_nonzero),
                    len(drawable_subset) / len(n_nonzero)))
    # For the web
    significant_edges = edge_statistics.query(
        'q_value <= @SELECTED_FDR_THRESHOLD')
    with pd.HDFStore(OUTPUT_FILE, 'w', complevel=9,
                     complib='lzo') as output_store:

        output_store['input/matrix'] = matrix
        output_store['input/n_significant'] = n_significant
        output_store['input/n_nonzero'] = n_nonzero

        output_store['input/random_state'] = pd.Series(RANDOM_STATE)

        output_store['statistics/best_estimator'] = pd.Series(best)

        output_store['output/edge_statistics'] = edge_statistics
        output_store['output/significant_edges'] = significant_edges

        output_store['output/score_thresholds'] = score_thresholds
        output_store['output/subset_for_networks'] = pd.Series(
            subset_for_networks)
        output_store['output/drawable_subset'] = pd.Series(drawable_subset)
def extract_edge_statistics_and_thresholds(best_estimator, standard):
    """
    Converts estimator scores to q-values.
    Creates a conversion table between scores, q values, precision and recall.

    :param best_estimator: Trained estimator deemed to be the best. Must support the p_values() method
    :param standard: Reference standard for evaluation
    :return:
    """
    logger = get_logger("extract_edge_statistics_and_thresholds")

    adjacencies = best_estimator.adjacency_
    p_values = best_estimator.p_values()

    correction_method = 'fdr_bh'
    logger.info(
        f'Converting network p values to q values using {correction_method}')

    # Convert p values into FDR q-values using 'fdr_bh':
    __, q_values, __, __ = multipletests(p_values, method=correction_method)
    q_values = pd.Series(q_values, index=p_values.index, name='q_value')

    edge_statistics = pd.DataFrame({
        'score': adjacencies,
        'p_value': p_values,
        'q_value': q_values
    })

    edge_statistics = edge_statistics.join(standard.interaction_metadata_)

    # For q-value based curves we have to re-evaluate the standard using neg-log10-q
    edge_statistics['neg_log10_q'] = -np.log10(edge_statistics['q_value'])
    evaluation_q_value = standard.evaluate(edge_statistics['neg_log10_q'])

    prc_curve_thresholds = evaluation_q_value.prc_curve_thresholds
    high_confidence_threshold = find_threshold_for_precision_target(
        prc_curve_thresholds,
        precision_target=HIGH_CONFIDENCE_PRECISION_TARGET,
        plot_output_file=HIGH_CONFIDENCE_PLOT)

    thresholds = THRESHOLDS

    score_thresholds = []
    for threshold_name in thresholds:
        if threshold_name == 'high-confidence':
            threshold = np.power(10, (-high_confidence_threshold))
        else:
            threshold = float(threshold_name)

        neg_log10_threshold = -np.log10(threshold)

        sub = edge_statistics.query('q_value <= @threshold')
        sub_prc = prc_curve_thresholds.query(
            'threshold >= @neg_log10_threshold')

        precision = sub_prc['precision'].min()
        recall = sub_prc['recall'].max()

        n_edges = len(sub)
        score = sub['score'].min()

        star = '* ' if threshold == SELECTED_FDR_THRESHOLD else ''

        logger.info(
            f'{star}q={threshold} -> score={score:.4f} edges={n_edges:,} precision={precision:.2%} recall={recall:.2%}'
        )

        score_thresholds.append([
            threshold_name, threshold, neg_log10_threshold, score, precision,
            recall, n_edges
        ])

    score_thresholds = pd.DataFrame(score_thresholds,
                                    columns=[
                                        'threshold_name', 'threshold',
                                        'neg_log10_threshold', 'score',
                                        'precision', 'recall', 'n_edges'
                                    ]).set_index('threshold_name')

    return edge_statistics, score_thresholds, evaluation_q_value
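# Minimal sketch (toy p-values, not the project's data) of the
# Benjamini-Hochberg conversion that extract_edge_statistics_and_thresholds
# performs via statsmodels above.
import pandas as pd
from statsmodels.stats.multitest import multipletests

toy_p_values = pd.Series([0.001, 0.009, 0.04, 0.3, 0.9],
                         index=['e1', 'e2', 'e3', 'e4', 'e5'],
                         name='p_value')
__, toy_q_values, __, __ = multipletests(toy_p_values, method='fdr_bh')
toy_q_values = pd.Series(toy_q_values, index=toy_p_values.index, name='q_value')
# BH q-values stay ordered like the p-values:
# approximately 0.005, 0.0225, 0.0667, 0.375, 0.9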
def find_threshold_for_precision_target(prc_curve_thresholds,
                                        precision_target,
                                        plot_output_file=None):
    """
    Finds the q-value threshold which corresponds to the specified precision target.

    :param prc_curve_thresholds: PRC curve thresholds file
    :param precision_target: precision target
    :param plot_output_file: (optional) file for illustration plot
    :return:
    """

    logger = get_logger(__name__)

    target_threshold = prc_curve_thresholds.query(
        'precision >= @precision_target')['threshold'].idxmin()
    target_row = prc_curve_thresholds.loc[target_threshold]

    logger.info('Target row: {}'.format(target_row))
    logger.info('{:.2%} precision @ q={:.8e}'.format(
        target_row['precision'], 10**(-target_row['threshold'])))

    if plot_output_file is not None:
        from matplotlib import pyplot as plt
        with sns.plotting_context('paper'):
            fig = plt.figure(figsize=(3.5, 3))
            ax = plt.gca()
            prc_curve_thresholds.plot(x='threshold',
                                      y='precision',
                                      label='Precision',
                                      color='#EF3F74',
                                      ax=ax)
            prc_curve_thresholds.plot(x='threshold',
                                      y='recall',
                                      color='#3969AC',
                                      label='Recall',
                                      ax=ax)

            ax.axhline(target_row['precision'],
                       linestyle='--',
                       color='k',
                       label='')
            ax.axvline(target_row['threshold'],
                       linestyle='--',
                       color='k',
                       label='')

            ax.annotate('{:.2%} precision @ q={:.2e}'.format(
                target_row['precision'], 10**(-target_row['threshold'])),
                        xy=(target_row['threshold'], target_row['precision']),
                        xytext=(target_row['threshold'] + 1,
                                target_row['precision'] - 0.1))

            ax.annotate('{:.2%} recall @ q={:.2e}'.format(
                target_row['recall'], 10**(-target_row['threshold'])),
                        xy=(target_row['threshold'], target_row['recall']),
                        xytext=(target_row['threshold'] + 1,
                                target_row['recall'] + 0.02))

            sns.despine(ax=ax, trim=True, offset=5)

            ax.set_xlabel('Threshold = -log10(q)')
            ax.set_ylabel('Precision/Recall')
            ax.legend(loc='upper right')

            plt.tight_layout()
            plt.savefig(plot_output_file,
                        bbox_inches='tight')
            plt.close()

    return target_row['threshold']
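# Hedged usage sketch (invented toy PRC data): find_threshold_for_precision_target
# expects a DataFrame with 'threshold' (-log10 q), 'precision' and 'recall'
# columns, as produced by the evaluation objects elsewhere in this file.
import pandas as pd

toy_prc = pd.DataFrame({
    'threshold': [0.5, 1.0, 2.0, 3.0],   # -log10(q)
    'precision': [0.40, 0.62, 0.81, 0.93],
    'recall': [0.70, 0.55, 0.30, 0.12],
})

toy_threshold = find_threshold_for_precision_target(toy_prc, precision_target=0.8)
# -> 2.0, i.e. the loosest -log10(q) threshold that still reaches 80% precision
#    (equivalently q <= 10 ** -2.0 = 0.01).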
Example #17
def extract_epifactors():

    logger = get_logger(__name__)

    genes = pd.read_csv(_EPIFACTORS_GENES).replace('#', np.nan)
    complexes = pd.read_csv(_EPIFACTORS_COMPLEXES).replace('#', np.nan)

    gene_meta = pd.read_hdf(_INPUT_FILE, 'gene_meta')

    complexes = complexes.rename(
        columns={
            'Group_name': 'Complex group',
            'Complex_name': 'Complex name',
            'Id': 'EpiFactors ID (Complex)',
            'Group': 'EpiFactors ID (Complex group)',
            'Alternative_name': 'Alternative name',
            'UniProt_ID': 'UniProt entry',
            'UniProt_AC': 'Protein ID',
            'PMID_complex': 'PubMed ID (complex)',
            'PMID_function': 'PubMed ID (function)',
            'PMID_target': 'PubMed ID (target)',
            'Specific_target': 'Specific target',
            'Uniprot_ID_target': 'UniProt entry (target)',
            'Comment': 'EpiFactors comment'
        })

    genes = genes.rename(
        columns={
            'Id': 'EpiFactors ID (genes)',
            'HGNC_symbol': 'HGNC symbol',
            'Status': 'EpiFactors status',
            'HGNC_ID': 'HGNC ID',
            'HGNC_name': 'HGNC name',
            'GeneID': 'Entrez ID',
            'UniProt_AC': 'Protein ID',
            'UniProt_ID': 'UniProt entry',
            'MGI_symbol': 'MGI symbol',
            'MGI_ID': 'MGI ID',
            'UniProt_AC_Mm': 'Protein ID (mouse)',
            'UniProt_ID_Mm': 'UniProt entry (mouse)',
            'GeneTag': 'HGNC gene family tag',
            'GeneDesc': 'HGNC gene family description',
            'Complex_name': 'Complex names',
            'Specific_target': 'Specific target',
            'UniProt_ID_target': 'UniProt ID (target)',
            'PMID_target': 'PubMed ID (target)',
            'PMID_function': 'PubMed ID (function)'
        })

    complexes = complexes.set_index(['Complex group', 'Complex name'])
    erroneous_data = [
        # They list CERF twice, once with SWI/SNF, once with ISWI. The ISWI group has incorrect "Protein" annotation
        ('ISWI', 'CERF')
    ]
    complexes = complexes.loc[complexes.index.difference(erroneous_data)]
    # Assert that all complex names are unique
    assert len(complexes) == len(
        complexes.reset_index()['Complex name'].unique())

    memberships = []

    for ix, row in complexes.iterrows():
        parsed = parse_complex_members(row['UniProt entry'])

        for key, val in zip(complexes.index.names, ix):
            parsed[key] = val

        memberships.append(parsed)

    memberships = pd.concat(memberships).drop_duplicates()
    memberships = pd.merge(memberships,
                           genes[['UniProt entry', 'HGNC ID']],
                           how='left',
                           on='UniProt entry')

    # Map Gene labels to HGNC IDs
    hgnc_id_map = gene_meta['HGNC'].str.split(';', expand=True).stack()
    hgnc_id_map.name = 'HGNC'
    hgnc_id_map = hgnc_id_map.reset_index()
    del hgnc_id_map['level_1']
    hgnc_id_map = hgnc_id_map[hgnc_id_map['HGNC'] != '']
    hgnc_id_map['HGNC'] = hgnc_id_map['HGNC'].astype(int, errors='raise')
    hgnc_id_map = hgnc_id_map.drop_duplicates()

    # Merge that with epifactors genes so now we have a link to our IDs
    genes_merged = pd.merge(genes,
                            hgnc_id_map,
                            left_on='HGNC ID',
                            right_on='HGNC',
                            how='left')

    logger.info('Epifactors: Successfully mapped {}/{} ({:.2%})'.format(
        (~genes_merged['Gene label'].isnull()).sum(), len(genes_merged),
        (~genes_merged['Gene label'].isnull()).sum() / len(genes_merged)))

    memberships_merged = pd.merge(
        memberships,
        genes_merged[['HGNC symbol', 'HGNC ID', 'Gene label']],
        on='HGNC ID',
        how='left')

    # See whether the complex is complete/partial or missing from data
    counts = memberships_merged.groupby(['Complex group',
                                         'Complex name']).count()

    def _assign_type(row):
        if row['Gene label'] == 0:
            return 'missing'
        elif row['Gene label'] < row['HGNC ID']:
            return 'partial'
        else:
            return 'complete'

    complex_presence = counts.apply(_assign_type, axis=1)
    complex_presence.name = 'complex_presence'

    memberships_merged = memberships_merged.join(
        complex_presence, on=['Complex group', 'Complex name'])

    return memberships_merged
def _informative_nucleosome_graph():
    logger = get_logger('informative_nucleosome_graph')
    predictors = pd.read_hdf(META_FILE, '/meta/predictors')

    network = nx.DiGraph()
    for pd_1, row_1 in predictors.iterrows():
        n_ptms = row_1.sum()
        if n_ptms == 1:
            # This is a self-describing nucleosome. Throw that into the graph
            ptm = row_1[row_1].index[0]
            network.add_edge(SPECIAL_PULLDOWN, pd_1, predictor=ptm)
        elif n_ptms in PREDICTOR_EXCEPTIONS:
            exceptions = PREDICTOR_EXCEPTIONS[n_ptms]
            for preds, ptm in exceptions:
                non_preds = row_1.index.difference(preds)
                if row_1[preds].all() and not row_1[non_preds].any():
                    network.add_edge(SPECIAL_PULLDOWN, pd_1, predictor=ptm)
                    break

    for (pd_1, row_1), (pd_2, row_2) in itertools.combinations(predictors.iterrows(), 2):
        # Skip the same
        if pd_2 == pd_1:
            continue

        diffs = row_1 != row_2
        n_differences = diffs.sum()

        # This is self-describing
        if n_differences == 1:
            ptm = diffs[diffs].index[0]

            # Row has the extra PTM. Edge is pd_2 -> pd_1
            if row_1[ptm]:
                network.add_edge(pd_2, pd_1, predictor=ptm)
            else:
                # row2 has the extra PTM, edge is pd_1 -> pd_2
                network.add_edge(pd_1, pd_2, predictor=ptm)
        # These are exceptions
        elif n_differences in PREDICTOR_EXCEPTIONS:
            exceptions = PREDICTOR_EXCEPTIONS[n_differences]
            for preds, ptm in exceptions:
                non_preds = row_1.index.difference(preds)

                if row_1[preds].all() and (~row_2[preds]).all():
                    assert not diffs[non_preds].any()
                    network.add_edge(pd_2, pd_1, predictor=ptm)
                    break
                elif (~row_1[preds]).all() and (row_2[preds]).all():
                    assert not diffs[non_preds].any()
                    network.add_edge(pd_1, pd_2, predictor=ptm)
                    break

    network_df = []

    for from_, to_, ptm in network.edges.data('predictor'):
        network_df.append([from_, to_, ptm])
    network_df = pd.DataFrame(network_df, columns=['from_pd', 'to_pd', 'predictor'])

    logger.info('PTM predictive network generated: {:,} nodes, {:,} edges'.format(
        len(network.nodes), len(network.edges)
    ))

    non_informative_nucleosomes = [ix for ix in predictors.index if ix not in network.nodes]
    logger.info('Found {:,} non informative di-nucleosomes: {!r}'.format(
        len(non_informative_nucleosomes),
        non_informative_nucleosomes
    ))

    not_covered_predictors = [ix for ix in predictors.columns if ix not in network_df['predictor'].unique()]
    logger.info('Found {:,} not covered predictors: {!r}'.format(
        len(not_covered_predictors),
        not_covered_predictors
    ))

    predictor_counts = network_df['predictor'].value_counts()
    _text = [f'{k:>20}: {v:,} nucleosomes' for k, v in predictor_counts.items()]
    _text = '\n'.join(_text)
    logger.info('The numbers of nucleosomes for each predictor are:\n{}'.format(_text))

    # Assign edge names
    network_df['edge'] = network_df['to_pd'].str.cat(network_df['from_pd'], sep=EDGE_SEPARATOR)

    return network, network_df
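# Illustrative sketch (invented toy data): the core pairwise rule used in
# _informative_nucleosome_graph above, in isolation. Two pulldowns whose PTM
# profiles differ in exactly one mark get a directed edge labelled with that
# mark, pointing at the pulldown that carries it.
import itertools

import networkx as nx
import pandas as pd

toy_predictors = pd.DataFrame(
    [[False, False], [True, False], [True, True]],
    index=['PD01', 'PD02', 'PD03'],
    columns=['H3K4me3', 'H3K27ac'],
)

toy_network = nx.DiGraph()
for (pd_1, row_1), (pd_2, row_2) in itertools.combinations(
        toy_predictors.iterrows(), 2):
    diffs = row_1 != row_2
    if diffs.sum() == 1:
        ptm = diffs[diffs].index[0]
        if row_1[ptm]:
            toy_network.add_edge(pd_2, pd_1, predictor=ptm)
        else:
            toy_network.add_edge(pd_1, pd_2, predictor=ptm)

# Expected edges: PD01 -> PD02 (H3K4me3) and PD02 -> PD03 (H3K27ac);
# PD01 and PD03 differ in two marks, so they are not linked directly.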
Example #19
def run():
    logger = get_logger(__name__)
    # Load Training results
    with pd.HDFStore(TRAINING_OUTPUT, 'r') as store:
        matrix = store['input/matrix']
        n_significant = store['input/n_significant']
        n_nonzero = store['input/n_nonzero']

        edge_statistics = store['output/edge_statistics']
        score_thresholds = store['output/score_thresholds']
        subset_for_networks = store['output/subset_for_networks'].values

    # For drawing it's easiest to recreate evaluation object
    # For q-value based curves we have to re-evaluate the standard using neg-log10-q
    standard = load_standard(matrix)
    evaluation_q_value = standard.evaluate(edge_statistics['neg_log10_q'])

    # Some extra data goes here
    complex_memberships_str = pd.read_hdf(CURATED_COMPLEXES_OUTPUT,
                                          'complex_memberships_str')

    # Augment edge data with some metadata from us
    protein_meta = pd.read_hdf(CLEANUP_FILE, 'protein_meta')
    meta = protein_meta[[
        'Majority protein IDs', 'Gene names', 'Protein names'
    ]]

    # -- Let's do some network stuff now --
    # Plot network, for now let's do a bunch of thresholds
    community_output = {}
    community_color_output = {}

    pos_output = {}

    with timed_segment('Writing networks', logger=logger):
        for threshold_name in score_thresholds.index:
            real_threshold = score_thresholds.loc[threshold_name,
                                                  'neg_log10_threshold']

            # For HC network, add known edges.
            if threshold_name == 'high-confidence':
                add_unpredicted_edges = True
            else:
                add_unpredicted_edges = False

            network = evaluation_q_value.to_network(
                real_threshold,
                node_subset=subset_for_networks,
                add_unpredicted_edges=add_unpredicted_edges,
                remove_orphan_nodes=True)

            logger.info(
                'Network for drawing q={}, nodes:{:,}, edges:{:,}'.format(
                    threshold_name, len(network.nodes), len(network.edges)))

            # Annotate nodes with n_significant
            _n_significant_dict = dict(n_significant.loc[list(network.nodes)])
            _n_significant_dict = {
                k: int(v)
                for k, v in _n_significant_dict.items()
            }
            nx.set_node_attributes(network,
                                   name='n_significant',
                                   values=_n_significant_dict)

            # Annotate nodes with n_nonzero
            _n_nonzero_dict = dict(n_nonzero.loc[list(network.nodes)])
            _n_nonzero_dict = {k: int(v) for k, v in _n_nonzero_dict.items()}
            nx.set_node_attributes(network,
                                   name='n_nonzero',
                                   values=_n_nonzero_dict)

            # Annotate nodes with complex_memberships
            _complex_memberships_dict = {
                node: complex_memberships_str.get(node, '')
                for node in network.nodes
            }
            nx.set_node_attributes(network,
                                   name='complex_memberships',
                                   values=_complex_memberships_dict)

            # Reseed RNG -- for some reason output is non-deterministic.
            random.seed(RANDOM_STATE)
            np.random.seed(RANDOM_STATE)

            # Add colours
            if threshold_name != 'high-confidence':
                # Style network with communities
                communities = extract_communities(network)
                annotate_with_communities(network, communities)
                community_colors = colour_communities(network, communities)
                style_network_communities(network, community_colors)

                # Save stuff
                community_output[threshold_name] = communities_to_series(
                    communities)

                community_colors = pd.Series(community_colors)
                community_colors.index.name = 'Community'
                community_colors.name = 'Color'
                community_color_output[threshold_name] = community_colors

            else:
                # For HC colour them blue
                colour_nodes(network, '#378cb9')

            style_network_edges(network)

            # Main network
            basename = NETWORK_FILES_BASENAME_TEMPLATE.format(threshold_name)

            if threshold_name not in MANUAL_POSITION_OVERRIDES:
                # Generate seed layout
                seed_kwargs = dict(gravity=40, scalingRatio=4)
                if threshold_name == 'high-confidence':
                    seed_kwargs = dict(gravity=100, scalingRatio=15)
                seed_pos = fa2_pos(network, **seed_kwargs)
                # Add the position information
                style_position(network, seed_pos)

                if threshold_name == 'high-confidence':
                    gephi_kwargs = dict(gravity=100,
                                        scale=15,
                                        duration=5,
                                        proportion=0.6,
                                        plot='false')
                else:
                    gephi_kwargs = dict(gravity=40,
                                        scale=4,
                                        duration=15,
                                        proportion=0.6,
                                        plot='false')

                tmp_dir = tempfile.mkdtemp()
                try:
                    gephi_forceatlas2_layout(network,
                                             basename,
                                             outdir=tmp_dir,
                                             **gephi_kwargs)
                    # Now get the positions from gephi layout back:
                    pos = parse_pos_from_1_3_gexf(
                        os.path.join(tmp_dir, basename + '.gephi.gexf'))
                finally:
                    try:
                        shutil.rmtree(tmp_dir)
                        del tmp_dir
                    except FileNotFoundError:
                        pass
            else:
                logger.info(
                    f'Using manual override for {threshold_name} network')
                override_file = MANUAL_POSITION_OVERRIDES[threshold_name]
                if override_file.endswith('.gexf'):
                    pos = parse_pos_from_1_3_gexf(override_file)
                elif override_file.endswith('.cyjs'):
                    pos = parse_pos_from_cyjs(override_file)
                else:
                    raise NotImplementedError(
                        f'Cannot parse override file for {threshold_name}')

                for node in network.nodes:
                    assert node in pos, f'Override network does not contain node {node!r}'

            # Update networkx layout to have it
            style_position(network, pos)

            # Draw the network
            if threshold_name == 'high-confidence':
                gephi_draw_kwargs = dict(duration=0,
                                         plot='true',
                                         rescaleedgeweight='true',
                                         minweight=2.0,
                                         maxweight=2.0,
                                         edgecolor="ORIGINAL",
                                         straight="true",
                                         nodeborderwidth=1.0)
            else:
                gephi_draw_kwargs = dict(duration=0,
                                         plot='true',
                                         rescaleedgeweight='true',
                                         minweight=2.0,
                                         maxweight=20.0,
                                         nodeborderwidth=0.0)
            gephi_forceatlas2_layout(network,
                                     basename,
                                     outdir=NETWORK_GRAPH_OUTPUT_DIRECTORY,
                                     **gephi_draw_kwargs)

            # Also make it in dataframe form, and attach it to node_meta
            pos_as_df = []
            for ix, (x, y) in pos.items():
                pos_as_df.append([ix, x, y])

            pos_as_df = pd.DataFrame(
                pos_as_df,
                columns=[meta.index.name, 'network_pos_x',
                         'network_pos_y']).set_index(meta.index.name)

            pos_output[threshold_name] = pos_as_df

            # Let networkx write gexf for itself. At this point it doesn't support gexf v1.3
            # so it won't be able to read the forceatlas2 output...
            write_gexf_compatible_with_cytoscape(
                network,
                os.path.join(NETWORK_GRAPH_OUTPUT_DIRECTORY,
                             basename + NETWORKX_GEXF_SUFFIX))

    main_communities = community_output[SELECTED_FDR_THRESHOLD]
    main_pos = pos_output[SELECTED_FDR_THRESHOLD]

    meta = meta.join(n_significant).join(n_nonzero).join(main_communities)
    meta = meta.join(main_pos)

    with pd.HDFStore(OUTPUT_HDF_FILE, 'w', complevel=9,
                     complib='lzo') as output_store:
        output_store['input/random_state'] = pd.Series(RANDOM_STATE)

        output_store['output/edge_statistics'] = edge_statistics
        output_store['output/node_meta'] = meta

        for threshold in community_output:
            output_store['output/communities/{}/members'.format(
                threshold)] = community_output[threshold]
            output_store['output/communities/{}/colors'.format(
                threshold)] = community_color_output[threshold]
Example #20
def extract_communities(graph, collapse_satellites=True):
    """
    Extracts graph communities using `community` package.

    Automatically collapses satellite nodes to "satellite" community.

    :param graph:
    :param collapse_satellites:
    :return:
    """
    logger = get_logger('extract_communities')

    communities = community.best_partition(graph)

    community_partition = {}

    for k, v in communities.items():
        try:
            community_partition[v].append(k)
        except KeyError:
            community_partition[v] = [k]

    logger.info('Data partitioned into {:,} communities'.format(len(community_partition)))

    # Name communities after the protein with the largest closeness centrality
    community_names = {}
    for id_, members in community_partition.items():
        subgraph = graph.subgraph(members)

        centralities = nx.algorithms.centrality.closeness_centrality(subgraph)

        # Sort by decreasing centrality (hence -x[1]),
        # breaking ties alphabetically by node name
        sorted_centralities = sorted(centralities.items(),
                                     key=lambda x: (-x[1], x[0]),
                                     )

        name = sorted_centralities[0][0]
        community_names[id_] = name

    community_members = {community_names[k]: frozenset(v) for k, v in community_partition.items()}

    if collapse_satellites:
        block = community_blocks(graph, community_members)
        degrees = dict(nx.degree(block))

        satellites = set()

        MIN_NODES = 5

        for node, degree in degrees.items():
            if degree == 0 and len(community_members[node]) < MIN_NODES:
                satellites.add(node)

        no_satellite_communities = {k: v for k, v in community_members.items() if
                                    k not in satellites}
        satellite_members = set()
        for satellite in satellites:
            satellite_members.update(community_members[satellite])

        no_satellite_communities['satellites'] = frozenset(satellite_members)

        community_members = no_satellite_communities

    return community_members
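# Hedged usage sketch (not part of the original source): running the function
# above on a toy graph. collapse_satellites=False sidesteps the project-specific
# community_blocks helper; the python-louvain package must be importable as
# `community`.
import networkx as nx

toy_graph = nx.karate_club_graph()
toy_communities = extract_communities(toy_graph, collapse_satellites=False)
for name, members in toy_communities.items():
    print(name, len(members))
# Each community is keyed by the member with the highest closeness centrality
# within that community.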
def main():
    ensure_directory_exists(OUTPUT_FILE)
    logger = get_logger('models.ptm_response.main')
    long_matrices, network_df = longform_matrices_of_informative_nucleosomes()
    pulldown_predictors = pd.read_hdf(PULLDOWN_META, '/meta/predictors')

    with pd.HDFStore(OUTPUT_FILE, 'w') as store:

        joint_limma_stats = []

        joint_camera_complexes = []

        store[f'/ptm_stats/network_df'] = network_df

        for predictor, long_matrix in long_matrices.items():
            store[f'/ptm_stats/{predictor}/long_matrix'] = long_matrix
            n_edges = long_matrix.reset_index()['edge'].nunique()
            if n_edges < MIN_N_EDGES:
                logger.info(f'Not analysing {predictor} as it has only {n_edges:,} supporting edges')
                continue

            matrix, design, weight = to_matrix_design_and_weights(long_matrix,
                                                                  min_unimputed=MIN_UNIMPUTED)

            store[f'/ptm_stats/{predictor}/matrix'] = matrix
            store[f'/ptm_stats/{predictor}/design'] = design
            store[f'/ptm_stats/{predictor}/weight'] = weight

            n = len(matrix)
            logger.info(f'Will analyse {n:,} proteins for {predictor}')
            coef = 'ptm'

            limma_result, __ = limma_fit(matrix, design, weight,
                                         coef,
                                         fdr_threshold=FDR_THRESHOLD_RESPONSE,
                                         fc_threshold=FC_THRESHOLD_RESPONSE)

            for key, value in limma_result.items():
                store[f'/ptm_stats/{predictor}/limma/{key}'] = value

            limma_stats = limma_result['stats']
            limma_stats['predictor'] = predictor
            joint_limma_stats.append(limma_stats.reset_index())

            n_non_null = (~limma_stats['logFC'].isnull()).sum()
            non_null_ratio = n_non_null / n
            logger.info(f'{n_non_null:,}/{n:,} ({non_null_ratio:.2%}) analysed proteins are non null for {predictor}')

            n_significant = limma_stats['significant'].sum()
            n_significant_with_fc = limma_stats['significant_and_large_fc'].sum()

            n_significant_ratio = n_significant / n_non_null

            logger.info(f'Limma has found: {n_significant:,}/{n_non_null:,} ({n_significant_ratio:.2%}) '
                        f'proteins respond to {predictor} (FDR {FDR_THRESHOLD_RESPONSE}), out of which '
                        f'{n_significant_with_fc:,} have FC of at least {FC_THRESHOLD_RESPONSE}')

            ce = limma_camera_complexes(matrix, design, weight,
                                        limma_stats,
                                        coef=coef,
                                        min_size=ENRICHMENT_MIN_SIZE_COMPLEXES,
                                        max_size=ENRICHMENT_MAX_SIZE_COMPLEXES)
            ce['significant'] = ce['FDR'] <= ENRICHMENT_FDR_THRESHOLD
            ce['predictor'] = predictor

            store[f'/ptm_stats/{predictor}/camera/complexes'] = ce
            joint_camera_complexes.append(ce.reset_index())

            # Main training done, now process dropouts
            dropouts = edges_for_predictor_dropouts(network_df, predictor,
                                                     pulldown_predictors,
                                                     min_edges=MIN_N_EDGES)

            dropout_list = pd.Series(list(dropouts.keys()), name=f'Dropouts for {predictor}')
            store[f'/ptm_stats/{predictor}/dropout/list'] = dropout_list

            for dropout, remaining_edges in dropouts.items():
                submatrix = long_matrix.loc(axis=0)[:, remaining_edges]

                matrix, design, weight = to_matrix_design_and_weights(
                    submatrix, min_unimputed=MIN_UNIMPUTED)
                store[f'/ptm_stats/{predictor}/dropout/{dropout}/matrix'] = matrix
                store[f'/ptm_stats/{predictor}/dropout/{dropout}/design'] = design
                store[f'/ptm_stats/{predictor}/dropout/{dropout}/weight'] = weight

                n = len(matrix)
                logger.info(f'Will analyse {n:,} proteins for {predictor} dropout {dropout}')

                limma_result, __ = limma_fit(matrix, design, weight,
                                             'ptm',
                                             fdr_threshold=FDR_THRESHOLD_RESPONSE,
                                             fc_threshold=FC_THRESHOLD_RESPONSE)

                for key, value in limma_result.items():
                    store[f'/ptm_stats/{predictor}/dropout/{dropout}/limma/{key}'] = value

        joint_limma_stats = pd.concat(joint_limma_stats, ignore_index=True).set_index(['predictor', 'Gene label'])
        store[f'/ptm_stats/joint_limma_stats'] = joint_limma_stats

        joint_camera_complexes = pd.concat(joint_camera_complexes, ignore_index=True).set_index(['predictor',
                                                                                                 'Complex'])

        store[f'/ptm_stats/joint_camera_complexes'] = joint_camera_complexes