Example #1
def download(dataset, version="latest", redownload=False):
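    """Download the specified dataset, dispatching to the PDC helper for 'pdc*' names, to the per-source pan-cancer downloads for 'pancan*' or 'all', or to a plain cptac.download call otherwise."""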

    dataset = dataset.lower()

    if dataset.startswith("pdc"):
        return _pdc_download(dataset, version=version, redownload=redownload)

    elif dataset.startswith("pancan") or dataset == "all":
        box_token = get_box_token()

        if dataset == "pancanbrca":
            sources = BRCA_SOURCES
        elif dataset == "pancanccrcc":
            sources = CCRCC_SOURCES
        elif dataset == "pancancoad":
            sources = COAD_SOURCES
        elif dataset == "pancangbm":
            sources = GBM_SOURCES
        elif dataset == "pancanhnscc":
            sources = HNSCC_SOURCES
        elif dataset == "pancanlscc":
            sources = LSCC_SOURCES
        elif dataset == "pancanluad":
            sources = LUAD_SOURCES
        elif dataset == "pancanov":
            sources = OV_SOURCES
        elif dataset == "pancanucec":
            sources = UCEC_SOURCES
        elif dataset == "all":
            sources = BRCA_SOURCES + CCRCC_SOURCES + COAD_SOURCES + GBM_SOURCES + HNSCC_SOURCES + LSCC_SOURCES + LUAD_SOURCES + OV_SOURCES + UCEC_SOURCES
        else:
            raise InvalidParameterError(f"{dataset} is not a valid dataset.")

        overall_success = True
        for source in sources:

            if source.startswith("pdc"):
                single_success = download(source,
                                          version=version,
                                          redownload=redownload)
            else:
                single_success = cptac.download(source,
                                                version=version,
                                                redownload=redownload,
                                                box_auth=True,
                                                box_token=box_token)

            if not single_success:
                overall_success = False

        return overall_success

    else:
        return cptac.download(dataset,
                              version=version,
                              redownload=redownload,
                              box_auth=True)
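
A minimal usage sketch, assuming this download function and the cptac package it wraps are importable from the surrounding module; "pancanbrca" is one of the dataset names handled above:

success = download("pancanbrca", version="latest", redownload=False)
if not success:
    print("One or more sources failed to download.")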
Example #2
def get_interacting_proteins_biogrid(protein):
    """Queries the BioGRID API to get interacting proteins for a given protein, based on curated literature references.

    Parameters:
    protein: The name of the protein that you want to generate a list of interacting proteins for.

    Returns:
    pandas.DataFrame: The interacting proteins, ranked by the number of literature references supporting them.
    """

    # Post query to the BioGRID API
    query_url = "https://webservice.thebiogrid.org/interactions/"

    params = {
        "searchNames": "true",
        "geneList": protein,
        "includeInteractors": "true",
        "includeInteractorInteractions": "false",
        "interSpeciesExcluded": "true",
        "format": "json",
        "taxId": "9606",
        "start": "0",
        "accesskey": "0ff59dcf3511928e78aad499688381c9"
    }

    query_resp = requests.get(query_url, params=params)

    # Check that the response came back good
    if query_resp.status_code != requests.codes.ok:
        raise HttpResponseError(
            f"Submitting your query to the STRING API returned an HTTP status {query_resp.status_code}. The content returned from the request may be helpful:\n{query_resp.content.decode('utf-8')}"
        )

    elif len(query_resp.json()) == 0:
        raise InvalidParameterError(
            f"No interactors found for '{protein}'. Are you sure you entered the identifier correctly?"
        )

    # Put the response data in a dataframe
    resp_df = pd.DataFrame(query_resp.json()).transpose()

    # Get a list of all the interactors, and rank them by how many references each has
    interactors = resp_df["OFFICIAL_SYMBOL_A"].\
    where(resp_df["OFFICIAL_SYMBOL_A"] != protein, other=resp_df["OFFICIAL_SYMBOL_B"]).\
    value_counts().\
    to_frame("num_references")

    interactors.index.name = "protein"

    return interactors
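
A brief usage sketch, assuming the function above and its requests/pandas dependencies are available; "TP53" is just an illustrative gene symbol and the call needs network access to BioGRID:

interactors = get_interacting_proteins_biogrid("TP53")
print(interactors.head(10))  # Top 10 interactors by number of supporting references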
Example #3
def permutation_test_corr(data, num_permutations):
    """Use permutation testing to calculate a P value for the linear correlation coefficient between two variables in several samples. You would use this if your distribution didn't follow the Pearson correlation test's assumption of being bivariate normal.

    Parameters:
    data (pandas.DataFrame): A dataframe where the rows are samples, and the columns are the two variables we're testing correlation between.
    num_permutations (int): The number of random permutations to generate for the null distribution.

    Returns:
    float: The linear correlation coefficient for the two variables.
    float: The P value for the null hypothesis that the correlation coefficient is zero.
    """

    # Check the table dimensions
    if data.shape[1] != 2:
        raise InvalidParameterError(
            f"Expected 2 columns in dataframe. Found {data.shape[1]}.")

    # Drop NaN values
    data = data.dropna()

    # Extract the values
    var1 = data.iloc[:, 0].values
    var2 = data.iloc[:, 1].values

    # Create an independent pseudo-random number generator
    generator = np.random.RandomState(0)

    # Calculate the actual correlation coefficient
    actual_coef = np.corrcoef(var1, var2)[0, 1]

    extreme_count = 0

    for i in range(num_permutations):
        var1_perm = generator.permutation(var1)
        perm_coef = np.corrcoef(var1_perm, var2)[0, 1]

        # Keep count of how many are as or more extreme than our coefficient
        if abs(perm_coef) >= abs(
                actual_coef
        ):  # We compare the absolute values for a two-tailed test
            extreme_count += 1

    # Calculate the P value
    P_val = extreme_count / num_permutations  # Don't need to multiply by 2 because we compared the absolute values of coefficients.

    return actual_coef, P_val
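
A small sketch of calling the permutation test on synthetic correlated data, assuming the function above is importable; the column names are arbitrary:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = 0.5 * x + rng.normal(scale=0.5, size=200)  # Correlated with x by construction
data = pd.DataFrame({"var1": x, "var2": y})

coef, p_val = permutation_test_corr(data, num_permutations=1000)
print(f"r = {coef:.3f}, permutation P = {p_val:.4f}")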
Example #4
def wrap_ttest(df,
               label_column,
               comparison_columns=None,
               alpha=.05,
               equal_var=True,
               return_all=False,
               correction_method='bonferroni',
               mincount=3,
               pval_return_corrected=True):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print(
                "Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values."
            )
            return None
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]
        '''If no comparison columns specified, use all columns except the specified label column'''
        if not comparison_columns:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)
        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)
        '''Store comparisons and p-values in two arrays'''
        comparisons = []
        pvals = []
        '''Loop through each comparison column, perform the t-test, and record the p-val'''

        for column in comparison_columns:
            if len(partition1[column].dropna(axis=0)) <= mincount:
                continue
            elif len(partition2[column].dropna(axis=0)) <= mincount:
                continue
            else:
                stat, pval = scipy.stats.ttest_ind(
                    a=partition1[column].dropna(axis=0),
                    b=partition2[column].dropna(axis=0),
                    equal_var=equal_var)

                comparisons.append(column)
                pvals.append(pval)

        if len(
                pvals
        ) == 0:  # None of the groups had enough members to pass the mincount
            raise InvalidParameterError(
                "No groups had enough members to pass mincount; no tests run.")
        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(
            pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]
        '''Format results in a pandas dataframe'''
        results_df = pd.DataFrame(columns=['Comparison', 'P_Value'])
        '''If return all, add all comparisons and p-values to dataframe'''
        if return_all:
            if pval_return_corrected:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = results[1]

            else:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = pvals
            '''Else only add significant comparisons'''
        else:
            significant_rows = []
            for i in range(0, len(reject)):
                if reject[i]:
                    significant_rows.append({
                        'Comparison': comparisons[i],
                        'P_Value': results[1][i] if pval_return_corrected else pvals[i]
                    })
            '''DataFrame.append was removed in pandas 2.0, so build the significant rows in one step'''
            if significant_rows:
                results_df = pd.DataFrame(significant_rows)
        '''Sort dataframe by ascending p-value'''
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)
        '''If results df is not empty, return it, else return None'''
        if len(results_df) > 0:
            return results_df
        else:
            return None

    except:
        print("Incorrectly Formatted Dataframe!")
        return None
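
A hedged usage sketch with a toy dataframe, assuming wrap_ttest and its scipy/statsmodels dependencies are importable; "group", "prot1", and "prot2" are made-up column names:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame({
    "group": ["A"] * 20 + ["B"] * 20,
    "prot1": np.concatenate([rng.normal(0, 1, 20), rng.normal(1, 1, 20)]),  # Shifted in group B
    "prot2": rng.normal(0, 1, 40),
})

results = wrap_ttest(toy, label_column="group")
print(results)  # Significant comparisons after Bonferroni correction, or None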
Example #5
def reduce_multiindex(df, levels_to_drop=None, flatten=False, sep='_', tuples=False, quiet=False):
    """Drop levels from and/or flatten the column axis of a dataframe with a column multiindex.

    Parameters:
    df (pandas.DataFrame): The dataframe to make the changes to.
    levels_to_drop (str, int, or list or array-like of str or int, optional): Levels, or indices of levels, to drop from the dataframe's column multiindex. These must match the names or indices of actual levels of the multiindex. Must be either all strings, or all ints. Default of None will drop no levels.
    flatten (bool, optional): Whether or not to flatten the multiindex. Default of False will not flatten. Cannot be used if tuples=True.
    sep (str, optional): String to use to separate index levels when flattening. Default is underscore. Only relevant if flatten=True.
    tuples (bool, optional): Whether to return the multiindex as a single-level index of tuples. Cannot be used if flatten=True. Default False.
    quiet (bool, optional): Whether to suppress warnings if duplicate column headers being created when column index levels are dropped, or if you tried to flatten or tuple-ify an index with only one level. Default False.

    Returns:
    pandas.DataFrame: The dataframe, with the desired column index changes made.
    """
    # Parameter check
    if flatten and tuples:
        raise InvalidParameterError("You passed 'True' for both 'flatten' and 'tuples'. This is an invalid combination of arguments. Either pass 'True' to 'flatten' to combine index levels and make a single-level index of strings, or pass 'True' to 'tuples' to return a single-level index of tuples; but just pick one or the other.")

    # Make a copy, so the original dataframe is preserved
    df = df.copy(deep=True)

    if levels_to_drop is not None:
        if df.columns.nlevels < 2:
            raise DropFromSingleIndexError("You attempted to drop level(s) from an index with only one level.")

        if isinstance(levels_to_drop, (str, int)):
            levels_to_drop = [levels_to_drop]
        elif not isinstance(levels_to_drop, (list, pd.Series, pd.Index)):
            raise InvalidParameterError(f"Parameter 'levels_to_drop' is of invalid type {type(levels_to_drop)}. Valid types: str, int, list or array-like of str or int, or NoneType.")

        # Check that they're not trying to drop too many columns
        existing_len = len(df.columns.names)
        to_drop_len = len(levels_to_drop)
        if to_drop_len >= existing_len:
            raise InvalidParameterError(f"You tried to drop too many levels from the dataframe column index. The most levels you can drop is one less than however many exist. {existing_len} levels exist; you tried to drop {to_drop_len}.")

        # Check that the levels they want to drop all exist
        to_drop_set = set(levels_to_drop)
        if all(isinstance(level, int) for level in to_drop_set):
            existing_set_indices = set(range(len(df.columns.names)))
            if not to_drop_set <= existing_set_indices:
                raise InvalidParameterError(f"Some level indices in {levels_to_drop} do not exist in dataframe column index, so they cannot be dropped. Existing column level indices: {list(range(len(df.columns.names)))}")
        else:
            existing_set = set(df.columns.names)
            if not to_drop_set <= existing_set:
                raise InvalidParameterError(f"Some levels in {levels_to_drop} do not exist in dataframe column index, so they cannot be dropped. Existing column levels: {df.columns.names}")

        df.columns = df.columns.droplevel(levels_to_drop)

        num_dups = df.columns.duplicated(keep=False).sum()
        if num_dups > 0 and not quiet:
            warnings.warn(f"Due to dropping the specified levels, dataframe now has {num_dups} duplicated column headers.", DuplicateColumnHeaderWarning, stacklevel=2)

    if flatten:
        if df.columns.nlevels < 2 and not quiet:
            warnings.warn("You tried to flatten a column index that didn't have multiple levels, so we didn't actually change anything.", FlattenSingleIndexWarning, stacklevel=2)
            return df

        flat_tuples = df.columns.to_flat_index() # Converts multiindex to an index of tuples; renamed so it doesn't shadow the 'tuples' parameter
        no_nan = flat_tuples.map(lambda x: [item for item in x if pd.notnull(item) and item != ""]) # Cut any NaNs and empty strings out of tuples
        joined = no_nan.map(lambda x: sep.join(x)) # Join each tuple
        df.columns = joined
        df.columns.name = "Name" # For consistency
    elif tuples:
        if df.columns.nlevels < 2 and not quiet:
            warnings.warn("You tried to turn a column index into tuples, but it didn't have multiple levels so we didn't actually change anything.", FlattenSingleIndexWarning, stacklevel=2)
            return df

        df.columns = df.columns.to_flat_index()

    return df
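
A small sketch of flattening a two-level column index, assuming reduce_multiindex and pandas are importable; the level names "Name" and "Database_ID" are illustrative:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [("TP53", "ENSG00000141510"), ("EGFR", "ENSG00000146648")],
    names=["Name", "Database_ID"])
df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], columns=cols)

flat = reduce_multiindex(df, flatten=True)
print(list(flat.columns))  # ['TP53_ENSG00000141510', 'EGFR_ENSG00000146648']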
Example #6
def _pdc_download(dataset, version, redownload):
    """Download data for the specified cancer type from the PDC."""

    dataset = str.lower(dataset)

    if dataset == "pdcall":
        overall_result = True
        for dataset in STUDY_IDS_MAP.keys():
            if not _pdc_download(dataset, version, redownload):
                overall_result = False

        return overall_result

    if not dataset.startswith("pdc"):
        raise InvalidParameterError(
            f"pdc_download function can only be used for PDC datasets, which start with the prefix 'pdc'. You tried to download '{dataset}'."
        )

    if dataset not in STUDY_IDS_MAP.keys():
        raise InvalidParameterError(
            f"PDC dataset must be one of the following:\n{list(STUDY_IDS_MAP.keys())}\nYou passed '{dataset}'."
        )

    dataset_ids = STUDY_IDS_MAP[dataset]

    # Get the directory to where to store the data, and see if it exists
    path_here = os.path.abspath(os.path.dirname(__file__))
    cancer_dir = os.path.join(path_here, f"data_{dataset}")

    if os.path.isdir(cancer_dir):

        # Only redownload if they explicitly wanted that
        if redownload:
            shutil.rmtree(cancer_dir)
        else:
            return True

    os.mkdir(cancer_dir)
    data_dir = os.path.join(cancer_dir, f"{dataset}_v0.0")
    os.mkdir(data_dir)

    # We'll combine all the clinical tables in case there are differences
    master_clin = pd.DataFrame()

    for data_type in dataset_ids.keys():

        # Print an update
        download_msg = f"Downloading {dataset} {data_type} files..."
        print(download_msg, end="\r")

        # Get the clinical and quantitative tables for the study ID
        clin, quant = download_pdc_id(dataset_ids[data_type],
                                      _download_msg=False)

        # Print a new update
        print(" " * len(download_msg), end="\r")
        save_msg = f"Saving {dataset} {data_type} files..."
        print(save_msg, end="\r")

        # Append the clinical dataframe (DataFrame.append was removed in pandas 2.0, so use pd.concat)
        master_clin = pd.concat([master_clin, clin], axis=0, join='outer')

        # Save the quantitative table
        quant.to_csv(os.path.join(data_dir, f"{data_type}.tsv.gz"), sep="\t")

        # Erase update
        print(" " * len(save_msg), end="\r")

    # Print an update
    save_msg = f"Saving {dataset} clinical file..."
    print(save_msg, end="\r")

    # Drop any duplicated rows in combined clinical table, then save it too
    master_clin = master_clin.drop_duplicates(keep="first")

    master_clin.to_csv(os.path.join(data_dir, "clinical.tsv.gz"), sep="\t")

    # Write a dummy index with just version numbers
    index_path = os.path.join(cancer_dir, "index.txt")

    with open(index_path, "w") as index_file:
        index_file.write("#0.0\n")

    # Erase update
    print(" " * len(save_msg), end="\r")

    return True
Example #7
def reactome_enrichment_analysis(analysis_type,
                                 data,
                                 sort_by,
                                 ascending,
                                 include_high_level_diagrams=True,
                                 disease_pathways=True,
                                 include_interactors=False):
    """Use the Reactome Analysis Service API to do a gene set enrichment analysis.

    Parameters:
    analysis_type (str): The type of enrichment analysis you want to perform. Either "ranked" or "nonranked".
    data (pandas.DataFrame or pandas.Series, or list or array-like): The data you want to overlay. Format depends on the analysis type.
        If ranked analysis:
            - data is a DataFrame or Series where the index is unique gene/protein identifiers and column(s) are ranking values (e.g. expression values for genes).
            - Multiple data columns allowed and are analyzed as separate ranked enrichment analyses.
            - All dtypes must be numeric.
        If unranked analysis:
            - data is a list or pandas.Index of identifiers to test pathways for enrichment with.
    sort_by (str): The metric by which to sort the pathways when selecting the top ones. You can pass "p_value" to sort by the P value (hypergeometric distribution), or pass one of the metrics directly supported by the Reactome API, listed below. (Yes, our function just maps "p_value" to "ENTITIES_PVALUE".)
        "NAME",
        "TOTAL_ENTITIES",
        "TOTAL_INTERACTORS",
        "TOTAL_REACTIONS",
        "FOUND_ENTITIES",
        "FOUND_INTERACTORS",
        "FOUND_REACTIONS",
        "ENTITIES_RATIO", # Note: This value is the ratio of the total number of proteins in the pathway, to the total number of proteins in all of Reactome.
        "ENTITIES_PVALUE",
        "ENTITIES_FDR",
        "REACTIONS_RATIO",
    ascending (bool): When sorting pathways by the specified metric, whether to put smaller values first.
    include_high_level_diagrams (bool, optional): Whether to include pathway diagrams in the output that are just summaries of lower level pathways and don't show actual reactions. If False, this will exclude any Reactome pathways that have EHLD (Enhanced Higher Level Diagrams). Default True includes these pathways in results.
    disease_pathways (bool, optional): Whether to include pathways that describe disease related function. Default True.
    include_interactors (bool, optional): Whether to include computationally inferred interactors when identifying pathways that are enriched with your submitted proteins/genes. Default False. You may want to set this to True if a large portion of the identifiers you submitted do not match a Reactome pathway when it is set to False.

    Returns:
    pandas.DataFrame: A dataframe with info on the top enriched pathways, sorted by the specified metric.
    """
    # Check the sort_by parameter
    if sort_by == "p_value":
        parsed_sort_by = "ENTITIES_PVALUE"
    else:
        parsed_sort_by = sort_by

    valid_sort_bys = [
        "NAME", "TOTAL_ENTITIES", "TOTAL_INTERACTORS", "TOTAL_REACTIONS",
        "FOUND_ENTITIES", "FOUND_INTERACTORS", "FOUND_REACTIONS",
        "ENTITIES_RATIO", "ENTITIES_PVALUE", "ENTITIES_FDR", "REACTIONS_RATIO"
    ]

    if parsed_sort_by not in valid_sort_bys:
        newline = "\n"
        single_qt = "'"
        raise InvalidParameterError(
            f"Invalid value for 'sort_by' parameter. You passed '{sort_by}'. Must be one of the following:\n'p_value'\n{newline.join([f'{single_qt}{x}{single_qt}' for x in valid_sort_bys])}"
        )

    if analysis_type == "ranked":

        # Copy the data
        data = data.copy(deep=True)

        # If they gave us a series, make it a dataframe
        if isinstance(data, pd.Series):
            if data.name is None:
                data.name = "data"
            data = pd.DataFrame(data)

        # Check that the index is strings (gene/protein names)
        if data.index.dtype != np.dtype("object"):
            raise InvalidParameterError(
                f"The dataframe you passed does not have an index of strings. The dtype of your dataframe's index is {data.index.dtype}."
            )

        elif data.index.str.isnumeric().any():
            raise InvalidParameterError(
                f"The dataframe you passed has an index of strings, but some of the values are completely numbers as strings. Gene/protein identifier always have letters in them."
            )

        # The identifier series (the index) needs to have a name starting with "#"
        if data.index.name is None:
            data.index.name = "#identifier"
        elif not data.index.name.startswith("#"):
            data.index.name = "#" + data.index.name

        # Take care of NaNs and small numbers
        # This represents NaNs as 'nan', which Reactome is OK with
        # Also rounds all numbers without scientific notation
        data.iloc[:, 0] = data.iloc[:, 0].apply(lambda x: "{:.9f}".format(x))

        # Get the data as a tab-separated string
        data_str = data.to_csv(sep='\t')

    elif analysis_type == "unranked":

        # Format data
        data = pd.Index(data)  # Convert it to an index if it wasn't
        data = data.dropna()  # Drop NaNs
        data = data.astype(str)  # Make it strings

        # Check that they're actual gene/protein names
        if data.str.isnumeric().any():
            raise InvalidParameterError(
                f"The data you passed has some values that are completely numeric. Gene/protein identifier always have letters in them."
            )

        # The first item needs to be a column header string starting with '#'
        if not data[0].startswith("#"):
            data = data.insert(0, "#identifier")

        # Get the list as a newline-separated string
        data_str = "\n".join(data)

    else:
        raise InvalidParameterError(
            f"Invalid value for 'analysis_type' parameter. You passed '{analysis_type}'. Must be 'ranked' or 'unranked'."
        )

    # Post the data to the Reactome analysis service
    analysis_url = "https://reactome.org/AnalysisService/identifiers/projection"
    headers = {"Content-Type": "text/plain"}
    params = {
        "interactors": include_interactors,
        "sortBy": parsed_sort_by,
        "order": "ASC" if ascending else "DESC",
        "includeDisease": disease_pathways,
    }

    resp = requests.post(analysis_url,
                         headers=headers,
                         params=params,
                         data=data_str)

    # Check that the response came back good
    if resp.status_code != requests.codes.ok:
        raise HttpResponseError(
            f"Submitting your data for analysis returned an HTTP status {resp.status_code}. The content returned from the request may be helpful:\n{resp.content.decode('utf-8')}"
        )

    warnings_list = resp.json()["warnings"]
    if len(warnings_list) != 0:
        newline = "\n"
        raise InvalidParameterError(
            f"Your analysis request returned the following warnings. You may have a data formatting problem. Check that your data matches the format specified in the docstring. Here's up to the first ten warnings:\n{newline.join(warnings_list[0:len(warnings_list)] if len(warnings_list) < 10 else warnings_list[0:10] + ['...'])}"
        )

    # Process the JSON response
    resp_json = resp.json()
    analysis_token = resp_json["summary"]["token"]
    pathways_table = pd.json_normalize(resp_json["pathways"], sep="_")

    # Select the columns we want
    pathways_table = pathways_table[[
        "stId", "name", "entities_ratio", "entities_pValue", "entities_fdr",
        "entities_found", "entities_total"
    ]]

    # If requested, filter out EHLD pathways
    if not include_high_level_diagrams:

        # Download a list of all diagrams with EHLD diagrams
        ehld_url = "https://reactome.org/download/current/ehld/svgsummary.txt"
        ehld_resp = requests.get(ehld_url)

        # Check that the response came back good
        if ehld_resp.status_code != requests.codes.ok:
            raise HttpResponseError(
                f"Checking whether pathways are high level pathways returned an HTTP status {ehld_resp.status_code}. The content returned from the request may be helpful:\n{ehld_resp.content.decode('utf-8')}"
            )

        # Parse the response
        ehld_list = ehld_resp.content.decode("utf-8")
        ehld_list = ehld_list.split("\n")
        ehld_list = [
            pathway_id for pathway_id in ehld_list
            if pathway_id.startswith("R-HSA-")
        ]

        has_ehld = pathways_table["stId"].isin(ehld_list)
        pathways_table = pathways_table[~has_ehld]

        # Make the index look normal
        pathways_table = pathways_table.reset_index(drop=True)

    return analysis_token, pathways_table
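
A hedged sketch of an unranked analysis call, assuming the function above is importable; the gene list is illustrative and the call requires network access to the Reactome Analysis Service:

genes = ["TP53", "MDM2", "CDKN1A", "ATM"]
token, pathways = reactome_enrichment_analysis(
    analysis_type="unranked",
    data=genes,
    sort_by="p_value",
    ascending=True)
print(pathways.head())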
Example #8
def reactome_pathway_overlay(pathway,
                             df=None,
                             analysis_token=None,
                             open_browser=True,
                             export_path=None,
                             image_format="png",
                             display_col_idx=0,
                             diagram_colors="Modern",
                             overlay_colors="Standard",
                             quality=7):
    """Visualize numerical data (e.g. protein expression) on a Reactome pathway diagram, with each node's color corresponding to the expression value provided for that molecule.

    Parameters:
    pathway (str): The Reactome ID for the pathway you want to overlay the data on, e.g. "R-HSA-73929".
    df (pandas.DataFrame or pandas.Series, optional): If you haven't previously analyzed your data with Reactome, give this parameter the data you want to overlay. Each row corresponds to a particular gene/protein/etc, and each column is expression or other data for a sample or aggregate. Index must be unique identifiers. Multiple data columns allowed. All dtypes must be numeric. Default None assumes you are instead passing a token for previously analyzed data to the "analysis_token" parameter.
    analysis_token (str, optional): If the data you want to visualize has been recently analyzed in Reactome already, pass the token for that analysis to this parameter to overlay it on the specified pathway. This will allow this function to reaccess the archived results, thus avoiding wasting time by repeating the work of submitting and analyzing the data. Default of None assumes you are instead passing data to the "df" parameter.
    open_browser (bool, optional): Whether to automatically open the diagram in a new web browser tab. Default True.
    export_path (str, optional): A string providing a path to export the diagram to. Must end in a file name with the same extension as the "image_format" parameter. Default None causes no figure to be exported.
    image_format (str, optional): If export_path is not none, this specifies the format to export the diagram to. Options are "png", "gif", "svg", "jpg", "jpeg", or "pptx". Must match the file extension in the export path. If you're creating a gif and you want more than one column's data to be included in the image, make sure to pass None to the display_col_idx parameter. Default "png".
    display_col_idx (int, optional): If export_path is not none, this specifies which column in the dataframe to overlay expression data from. Must be a valid column index for the given table, or None. None will cause the first column to be displayed, unless you're creating a gif, in which case it will cause all columns to be included in the gif. Default 0, which displays the first column.
    diagram_colors (str, optional): If export_path is not none, this specifies the Reactome color scheme to use for the underlying diagram. Options are "Modern" or "Standard". Default "Modern".
    overlay_colors (str, optional): If export_path is not none, this specifies the Reactome color scheme to use for the data overlay. Options are "Standard", "Strosobar", or "Copper Plus". Default "Standard".
    quality (int, optional): If export_path is not none, this specifies what relative quality to export the image at. Must be between 1 and 10 inclusive. Default 7.

    Returns:
    list of float: The mean of the data values for all proteins in the pathways, for each column in the data table, in the order of the columns in the data table. I.e. each value in this list is the average of the data from a particular column for all the proteins in the pathway.
    str: If export_path is None, returns URL to diagram with data overlaid in Reactome Pathway Browser. Otherwise returns the path the image was exported to.
    """
    # Parameter checking
    if df is None and analysis_token is None:
        raise InvalidParameterError(
            "You passed None to both the 'df' and 'analysis_token' parameters. You must pass a value to one of them."
        )

    if df is not None and analysis_token is not None:
        raise InvalidParameterError(
            "You passed values to both the 'df' and 'analysis_token' parameters. You may only pass a value to one of them."
        )

    if export_path is not None:

        if image_format not in ("png", "gif", "svg", "jpg", "jpeg", "pptx"):
            raise InvalidParameterError(
                f"Invalid value for 'image_format' parameter. Valid options are 'png', 'gif', 'svg', 'jpg', 'jpeg', or 'pptx'. You passed '{image_format}'."
            )

        if display_col_idx is None:
            display_col_idx = ""
        elif df is not None and display_col_idx not in range(
                0, df.shape[1] if isinstance(df, pd.DataFrame) else 1):
            raise InvalidParameterError(
                f"Invalid value for 'display_col_idx' parameter. Must be either None, or an int between 0 and one less than the number of columns in df (which is {df.shape[1] - 1} for this df), inclusive. You passed {display_col_idx}."
            )

        if diagram_colors not in ("Modern", "Standard"):
            raise InvalidParameterError(
                f"Invalid value for 'diagram_colors' parameter. Valid options are 'Modern' or 'Standard'. You passed '{diagram_colors}'."
            )

        if overlay_colors not in ("Standard", "Strosobar", "Copper Plus"):
            raise InvalidParameterError(
                f"Invalid value for 'overlay_colors' parameter. Valid options are 'Standard', 'Strosobar', or 'Copper Plus'. You passed '{overlay_colors}'."
            )

        if quality not in range(1, 11):
            raise InvalidParameterError(
                f"Invalid value for 'quality' parameter. Must be an int between 1 and 10 inclusive. You passed {quality}."
            )

        if image_format != export_path.split('.')[-1]:
            raise InvalidParameterError(
                f"The file extension in the 'export_path' parameter must match the 'image_format' parameter. For the image_format parameter, you passed '{image_format}'. The extension at the end of your export path was '{export_path.split('.')[-1]}'."
            )

        if export_path[:2] == "~/":
            raise InvalidParameterError(
                "The export path you provided appeared to start with a reference to the user home directory. To avoid confusion, this function will not expand that reference. Please provide a full path instead."
            )

    if df is not None:

        df = df.copy(deep=True)

        # If they gave us a series, make it a dataframe
        if isinstance(df, pd.Series):
            if df.name is None:
                df.name = "data"
            df = pd.DataFrame(df)

        # Check that the index is strings (gene/protein names)
        if df.index.dtype != np.dtype("object"):
            raise InvalidParameterError(
                f"The dataframe you passed does not have an index of strings. The dtype of your dataframe's index is {df.index.dtype}."
            )

        elif df.index.str.isnumeric().any():
            raise InvalidParameterError(
                f"The dataframe you passed has an index of strings, but some of the values are completely numbers as strings. Gene/protein identifier always have letters in them."
            )

        # The identifier series (the index) needs to have a name starting with "#"
        if df.index.name is None:
            df.index.name = "#identifier"
        elif not df.index.name.startswith("#"):
            df.index.name = "#" + df.index.name

        # Take care of NaNs
        df = df.astype(
            str)  # This represents NaNs as 'nan', which Reactome is OK with

        # Get the df as a tab-separated string
        df_str = df.to_csv(sep='\t')

        # Post the data to the Reactome analysis service
        analysis_url = "https://reactome.org/AnalysisService/identifiers/projection"
        headers = {"Content-Type": "text/plain"}
        params = {
            "pageSize": "0",
            "page": "1"
        }  # We only need the analysis token, so set pageSize to 0 so we don't worry about getting any of the data for individual pathways.

        view_resp = requests.post(analysis_url,
                                  headers=headers,
                                  params=params,
                                  data=df_str)

        # Check that the response came back good
        if view_resp.status_code != requests.codes.ok:
            raise HttpResponseError(
                f"Submitting your data for analysis returned an HTTP status {view_resp.status_code}. The content returned from the request may be helpful:\n{view_resp.content.decode('utf-8')}"
            )

        # Get the token for accessing the analysis results
        token = view_resp.json()["summary"]["token"]

    else:
        token = analysis_token

    # Get the mean data values
    expr_url = f"https://reactome.org/AnalysisService/token/{token}/filter/pathways?resource=TOTAL&pValue=1"

    headers = {
        "accept": "application/json",
        "content-type": "text/plain",
    }

    expr_resp = requests.post(expr_url, headers=headers, data=pathway)

    # Check that the response came back good
    if expr_resp.status_code != requests.codes.ok:
        raise HttpResponseError(
            f"Submitting your data for analysis returned an HTTP status {expr_resp.status_code}. The content returned from the request may be helpful:\n{expr_resp.content.decode('utf-8')}"
        )

    # Get the expression list
    expr_json = expr_resp.json()

    if len(expr_json) > 0:
        expr_list = expr_resp.json()[0]["entities"]["exp"]
    else:
        expr_list = []

    # Use the token and the pathway ID to open the pathway diagram with the data overlaid in the Reactome Pathway Browser
    viewer_url = f"https://reactome.org/PathwayBrowser/#/{pathway}&DTAB=AN&ANALYSIS={token}"
    if open_browser:
        webbrowser.open(viewer_url)

    if export_path is not None:

        # Get the diagram
        export_url = f"https://reactome.org/ContentService/exporter/diagram/{pathway}.{image_format}?token={token}&resource=TOTAL&diagramProfile={diagram_colors}&analysisProfile={overlay_colors}&expColumn={display_col_idx}&quality={quality}"
        export_resp = requests.get(export_url)

        # Check that the response came back good
        if export_resp.status_code != requests.codes.ok:
            raise HttpResponseError(
                f"Submitting your data for analysis returned an HTTP status {export_resp.status_code}. The content returned from the request may be helpful:\n{export_resp.content.decode('utf-8')}"
            )

        # Save the image
        with open(export_path, 'wb') as dest:
            dest.write(export_resp.content)

    if export_path is None:
        return expr_list, viewer_url
    else:
        return expr_list, export_path
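
A sketch of overlaying a small expression table on the pathway ID quoted in the docstring ("R-HSA-73929"), assuming the function and its pandas/requests dependencies are importable; the gene symbols and values are illustrative and the call needs network access to Reactome:

import pandas as pd

demo = pd.Series({"GAPDH": 1.2, "PKM": -0.4, "ENO1": 0.8}, name="log2FC")
expr_means, url = reactome_pathway_overlay("R-HSA-73929", df=demo, open_browser=False)
print(url)  # Link to the diagram in the Reactome Pathway Browser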
Example #9
def get_proteins_in_pathways(pathways, database, quiet=False):
    """Query either the Reactome REST API or the downloaded WikiPathways dataframe to get a list of proteins contained in a particular pathway.

    Parameters:
    pathways (str or list of str): The pathway(s) to get the contained proteins for. If using Reactome, these must be pathway IDs (e.g. "R-HSA-140877").
    database (str): The database to use; either 'reactome' or 'wikipathways'.
    quiet (bool, optional): Whether to suppress warnings issued when identifiers are not found. Default False.

    Returns:
    pandas.DataFrame: The proteins contained in the pathways.
    """

    # Process string input
    if isinstance(pathways, str):
        pathways = [pathways]

    if database.lower() == "reactome":
        # Set headers and url
        headers = {"accept": "application/json"}

        # Loop over ids and get the interacting pathways
        all_protein_df = pd.DataFrame()
        for pathway_id in pathways:

            # Send the request
            url = f"https://reactome.org/ContentService/data/participants/{pathway_id}"
            resp = requests.get(url, headers=headers)

            if resp.status_code == 404 or (
                    resp.status_code == requests.codes.ok and
                (len(resp.content.decode("utf-8")) == 0
                 or len(resp.json()) == 0)):
                if not quiet:
                    warnings.warn(
                        f"The query for '{pathway_id}' found no results. You may have mistyped the pathway ID.",
                        ParameterWarning,
                        stacklevel=2)
                continue
            elif resp.status_code != requests.codes.ok:
                raise HttpResponseError(
                    f"Your query returned an HTTP status {resp.status_code}. The content returned from the request may be helpful:\n{resp.content.decode('utf-8')}"
                )

            # Parse all the proteins/genes out of the response
            members_df = pd.json_normalize(resp.json(),
                                           record_path=["refEntities"])
            prot_df = members_df[members_df["displayName"].str.startswith(
                "UniProt:")]

            prot_names = prot_df["displayName"].str.rsplit(" ", n=1, expand=True)[1].\
                drop_duplicates(keep="first").\
                sort_values().\
                reset_index(drop=True)

            pathway_df = pd.DataFrame({
                "pathway": pathway_id,
                "member": prot_names
            })
            all_protein_df = pd.concat([all_protein_df, pathway_df])  # DataFrame.append was removed in pandas 2.0

        all_protein_df = all_protein_df.drop_duplicates(keep="first")

    elif database.lower() == "wikipathways":

        path_here = os.path.abspath(os.path.dirname(__file__))
        data_dir_name = "data"
        file_name = "WikiPathwaysDataframe.tsv.gz"
        file_path = os.path.join(path_here, data_dir_name, file_name)
        df = pd.read_csv(file_path, sep="\t", index_col=0)
        all_protein_df = pd.DataFrame()

        for pathway in pathways:

            if pathway in df.columns:
                prot_names = df.index[df[pathway]].values
                pathway_df = pd.DataFrame({
                    "pathway": pathway,
                    "member": prot_names
                })
                all_protein_df = pd.concat([all_protein_df, pathway_df])  # DataFrame.append was removed in pandas 2.0

            else:
                if not quiet:
                    warnings.warn(
                        f"The pathway '{pathway}' was not found in the WikiPathways data.",
                        ParameterWarning,
                        stacklevel=2)

    else:
        raise InvalidParameterError(
            f"Database '{database}' not recognized. Valid options: 'reactome', 'wikipathways'"
        )

    all_protein_df = all_protein_df.reset_index(drop=True)
    return all_protein_df
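
A usage sketch with the Reactome backend, assuming the function above is importable; the pathway ID "R-HSA-140877" comes from the docstring and the call needs network access:

members = get_proteins_in_pathways("R-HSA-140877", database="reactome")
print(members.head())  # Columns: 'pathway' and 'member'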
Example #10
def get_pathways_with_proteins(proteins,
                               database,
                               reactome_resource="UniProt",
                               quiet=False):
    """Query either the Reactome REST API or the WikiPathways downloaded dataframe to find pathways containing a particular gene or protein.

    Parameters:
    proteins (str or list of str): The protein(s) to look for matches to.
    database (str): The database to use; either 'reactome' or 'wikipathways'.
    reactome_resource (str, optional): If using Reactome, this is the resource the identifier(s) come from. Default is UniProt. Other options include HGNC, Ensembl, and GO. For more options, consult <https://reactome.org/content/schema/objects/ReferenceDatabase>. This parameter is meaningless if using WikiPathways.
    quiet (bool, optional): Whether to suppress warnings issued when identifiers are not found. Default False.

    Returns:
    pandas.DataFrame: A table of pathways containing the given genes or proteins, with pathway names and, if using Reactome, their Reactome identifiers (which are needed for the pathway_overlay function).
    """

    # Process string input
    if isinstance(proteins, str):
        proteins = [proteins]

    if database.lower() == "reactome":

        # Set headers and params
        headers = {"accept": "application/json"}
        params = {"species": "H**o sapiens"}

        # Loop over proteins and get the interacting pathways
        all_pathway_df = pd.DataFrame()
        for id in proteins:
            url = f"https://reactome.org/ContentService/data/mapping/{reactome_resource}/{id}/pathways"
            resp = requests.get(url, headers=headers, params=params)

            # Check that the response came back good
            if resp.status_code == 404:
                try:
                    msg = resp.json()["messages"]
                except (json.JSONDecodeError, KeyError):
                    raise HttpResponseError(
                        f"Your query returned an HTTP status {resp.status_code}. The content returned from the request may be helpful:\n{resp.content.decode('utf-8')}"
                    ) from None
                else:
                    if not quiet:
                        warnings.warn(
                            f"The query for '{id}' returned HTTP 404 (not found). You may have mistyped the gene/protein ID or the reactome_resource name. The server gave the following message: {msg}",
                            ParameterWarning,
                            stacklevel=2)
                    continue
            elif resp.status_code != requests.codes.ok:
                raise HttpResponseError(
                    f"Your query returned an HTTP status {resp.status_code}. The content returned from the request may be helpful:\n{resp.content.decode('utf-8')}"
                )

            # Parse out pathway IDs and names
            pathway_dict = resp.json()
            names = []
            pathway_ids = []
            for pathway in pathway_dict:
                names.append(pathway["displayName"])
                pathway_ids.append(pathway["stId"])

            pathway_df = pd.DataFrame({
                "id": id,
                "pathway": names,
                "pathway_id": pathway_ids
            })
            pathway_df = pathway_df.sort_values(by="pathway_id")
            all_pathway_df = pd.concat([all_pathway_df, pathway_df])  # DataFrame.append was removed in pandas 2.0

    elif database.lower() == "wikipathways":

        path_here = os.path.abspath(os.path.dirname(__file__))
        data_dir_name = "data"
        file_name = "WikiPathwaysDataframe.tsv.gz"
        file_path = os.path.join(path_here, data_dir_name, file_name)
        df = pd.read_csv(file_path, sep="\t", index_col=0)
        all_pathway_df = pd.DataFrame()

        for protein in proteins:

            if protein in df.index:
                # Column headers are pathways; select pathways where the row for the protein has a
                # True for that pathway's column, indicating membership
                pathways = df.columns[df.loc[protein, :]].values

                prot_df = pd.DataFrame({"id": protein, "pathway": pathways})
                all_pathway_df = pd.concat([all_pathway_df, prot_df])  # DataFrame.append was removed in pandas 2.0

            else:
                if not quiet:
                    warnings.warn(
                        f"The protein '{protein}' was not found in the WikiPathways data.",
                        ParameterWarning,
                        stacklevel=2)
    else:
        raise InvalidParameterError(
            f"Database '{database}' not recognized. Valid options: 'reactome', 'wikipathways'"
        )

    all_pathway_df = all_pathway_df.reset_index(drop=True)
    return all_pathway_df
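
A sketch of looking up pathways for two UniProt accessions via Reactome, assuming the function above is importable and network access is available (P04637 is TP53, P38398 is BRCA1; both are illustrative):

hits = get_pathways_with_proteins(["P04637", "P38398"], database="reactome")
print(hits.head())  # Columns: 'id', 'pathway', 'pathway_id'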
Example #11
def _pdc_download(dataset, version, redownload, box_token):
    """Download data for the specified cancer type from the PDC."""

    dataset = str.lower(dataset)

    if dataset == "pdcall":
        overall_result = True
        for dataset in STUDY_IDS_MAP.keys():
            if not _pdc_download(dataset, version, redownload, box_token):
                overall_result = False

        return overall_result

    if not dataset.startswith("pdc"):
        raise InvalidParameterError(
            f"_pdc_download function can only be used for PDC datasets, which start with the prefix 'pdc'. You tried to download '{dataset}'."
        )

    if dataset not in STUDY_IDS_MAP.keys():
        raise InvalidParameterError(
            f"PDC dataset must be one of the following:\n{list(STUDY_IDS_MAP.keys())}\nYou passed '{dataset}'."
        )

    dataset_ids = STUDY_IDS_MAP[dataset]

    # Download the file for mapping aliquots to patient IDs
    if not cptac.download(dataset,
                          version=version,
                          redownload=redownload,
                          _box_auth=True,
                          _box_token=box_token):
        return False

    path_here = os.path.abspath(os.path.dirname(__file__))
    cancer_dir = os.path.join(path_here, f"data_{dataset}")

    # Check that the index file exists. If not, there was an uncaught error in the mapping file download.
    index_path = os.path.join(cancer_dir, "index.txt")
    if not os.path.isfile(index_path):
        raise CptacDevError(
            f"Index file not found at {index_path}. Mapping file download probably failed."
        )

    # See what data files we need to download
    data_dir = os.path.join(cancer_dir, f"{dataset}_v1.0")

    # If any of the files are missing, we're going to delete any remaining and redownload all, in case the missing files are a sign of a previous data problem
    data_files = [f"{data_type}.tsv.gz"
                  for data_type in dataset_ids.keys()] + ["clinical.tsv.gz"]
    for data_file in data_files:
        data_file_path = os.path.join(data_dir, data_file)
        if not os.path.isfile(data_file_path):
            redownload = True
            break

    if redownload:
        for data_file in data_files:
            data_file_path = os.path.join(data_dir, data_file)
            if os.path.isfile(data_file_path):
                os.remove(data_file_path)
    else:
        return True  # If all the files are there and the user didn't ask to redownload, we're done.

    # Now download all the data files

    # We'll combine all the clinical tables in case there are differences
    master_clin = pd.DataFrame()

    for data_type in dataset_ids.keys():

        # Print an update
        download_msg = f"Downloading {dataset} {data_type} files..."
        print(download_msg, end="\r")

        # Get the clinical and quantitative tables for the study ID
        clin, quant = download_pdc_id(dataset_ids[data_type],
                                      _download_msg=False)

        # Print a new update
        print(" " * len(download_msg), end="\r")
        save_msg = f"Saving {dataset} {data_type} files..."
        print(save_msg, end="\r")

        # Append the clinical dataframe
        #master_clin = master_clin.append(clin)
        master_clin = pd.concat([master_clin, clin], axis=0, join='outer')

        # Save the quantitative table
        quant.to_csv(os.path.join(data_dir, f"{data_type}.tsv.gz"), sep="\t")

        # Erase update
        print(" " * len(save_msg), end="\r")

    # Print an update
    save_msg = f"Saving {dataset} clinical file..."
    print(save_msg, end="\r")

    # Drop any duplicated rows in combined clinical table, then save it too
    master_clin = master_clin.drop_duplicates(keep="first")

    master_clin.to_csv(os.path.join(data_dir, "clinical.tsv.gz"), sep="\t")

    # Erase update
    print(" " * len(save_msg), end="\r")

    return True