Example #1
import os
from typing import Optional

import pandas as pd


def dot_plot(
    robjects,                    # rpy2's robjects module, injected by the caller
    r_runtime_error: Exception,  # rpy2's RRuntimeError class, injected by the caller
    means_path: str,
    pvalues_path: str,
    output_path: str,
    output_name: str,
    rows: Optional[str] = None,
    columns: Optional[str] = None,
) -> None:
    pvalues_separator = _get_separator(os.path.splitext(pvalues_path)[-1])
    means_separator = _get_separator(os.path.splitext(means_path)[-1])
    output_extension = os.path.splitext(output_name)[-1].lower()
    filename = os.path.join(output_path, output_name)

    means_df = pd.read_csv(means_path, sep=means_separator)
    n_rows, n_cols = means_df.shape
    n_cols -= 11  # the first 11 columns of the means file are interaction metadata, not cluster pairs

    # selected_items (helper not shown) narrows the plot to the requested rows/columns
    n_rows, selected_rows = selected_items(rows, n_rows)
    n_cols, selected_columns = selected_items(columns, n_cols)

    this_file_dir = os.path.dirname(os.path.realpath(__file__))
    robjects.r.source(
        os.path.join(this_file_dir, 'R/plot_dot_by_column_name.R'))
    available_names = list(robjects.globalenv.keys())
    plot_function = 'dot_plot'

    if plot_function in available_names:
        function_name = plot_function
    else:
        raise MissingPlotterFunctionException()

    plotter = robjects.r[function_name]

    try:
        plotter(selected_rows=selected_rows,
                selected_columns=selected_columns,
                filename=filename,
                width=int(5 + max(3, n_cols * 0.8)),
                height=int(5 + max(5, n_rows * 0.5)),
                means_path=means_path,
                pvalues_path=pvalues_path,
                means_separator=means_separator,
                pvalues_separator=pvalues_separator,
                output_extension=output_extension)
    except r_runtime_error as e:
        raise RRuntimeException(e) from e
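
Every example here routes file parsing through a _get_separator helper that the snippets do not include. A minimal sketch of what it plausibly looks like, mapping a file extension to a pandas separator (the mapping and error handling in the real helper may differ):

def _get_separator(extension: str) -> str:
    # Hypothetical reconstruction of the helper used throughout these examples.
    separators = {
        '.csv': ',',
        '.tsv': '\t',
        '.txt': '\t',
    }
    try:
        return separators[extension.lower()]
    except KeyError:
        raise Exception('Unknown file extension: {}'.format(extension))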
Example #2
import os
from typing import Optional

import pandas as pd


def generate_complex(user_complex: Optional[str], result_path: str, log_file: str) -> None:
    # output_dir and data_dir are module-level constants in the source package
    output_path = _set_paths(output_dir, result_path)
    log_path = '{}/{}'.format(output_path, log_file)

    curated_complex = pd.read_csv(os.path.join(data_dir, 'sources/complex_curated.csv'))
    if user_complex:
        separator = _get_separator(os.path.splitext(user_complex)[-1])
        user_complex = pd.read_csv(user_complex, sep=separator)

    result = complex_generator(curated_complex, user_complex, log_path)

    result.to_csv('{}/{}'.format(output_path, 'complex_generated.csv'), index=False)
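
A hypothetical invocation of generate_complex; the file names and directory are illustrative only:

# Hypothetical usage; pass user_complex=None to rely solely on the curated list.
generate_complex(
    user_complex='my_complexes.csv',
    result_path='results',
    log_file='complex_generation.log',
)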
Example #3
import os
import urllib.parse
from typing import Optional

import pandas as pd


def generate_genes(
    user_gene: Optional[str],
    fetch_uniprot: bool,
    fetch_ensembl: bool,
    result_path: str,
    project_name: str,
) -> None:
    output_path = _set_paths(result_path, project_name)

    # TODO: Add logger
    if fetch_ensembl:
        print('fetching remote ensembl data ... ', end='')
        source_url = 'http://www.ensembl.org/biomart/martservice?query={}'
        query = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" ' \
                'formatter = "CSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >' \
                '<Dataset name = "hsapiens_gene_ensembl" interface = "default" >' \
                '<Attribute name = "ensembl_gene_id" />' \
                '<Attribute name = "ensembl_transcript_id" />' \
                '<Attribute name = "external_gene_name" />' \
                '<Attribute name = "hgnc_symbol" />' \
                '<Attribute name = "uniprotswissprot" />' \
                '</Dataset>' \
                '</Query>'

        url = source_url.format(urllib.parse.quote(query))
        ensembl_db = pd.read_csv(url)
        print('done')
    else:
        ensembl_db = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/ensembl.txt'))
        print('read local ensembl file')

    # additional data comes from given file or uniprot remote url
    if fetch_uniprot:
        try:
            print('fetching remote uniprot file ... ', end='')
            source_url = 'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true' \
                         '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length' \
                         '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes' \
                         '&compress=yes'

            uniprot_db = pd.read_csv(source_url, sep='\t', compression='gzip')
            print('done')

        except Exception as e:
            print('Error fetching remote UniProt data ({}), reading local data instead'.format(e))
            uniprot_db = pd.read_csv(os.path.join(data_dir,
                                                  'sources/uniprot.tab'),
                                     sep='\t')
            print('read local uniprot file')
    else:
        uniprot_db = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/uniprot.tab'))
        print('read local uniprot file')

    ensembl_columns = {
        'Gene name': 'gene_name',
        'Gene stable ID': 'ensembl',
        'HGNC symbol': 'hgnc_symbol',
        'UniProtKB/Swiss-Prot ID': 'uniprot'
    }

    uniprot_columns = {'Entry': 'uniprot', 'Gene names': 'gene_names'}

    result_columns = ['gene_name', 'uniprot', 'hgnc_symbol', 'ensembl']

    ensembl_db = ensembl_db[list(
        ensembl_columns.keys())].rename(columns=ensembl_columns)
    uniprot_db = uniprot_db[list(
        uniprot_columns.keys())].rename(columns=uniprot_columns)
    hla_genes = utils.read_data_table_from_file(
        os.path.join(data_dir, 'sources/hla_curated.csv'))
    if user_gene:
        separator = _get_separator(os.path.splitext(user_gene)[-1])
        user_gene = pd.read_csv(user_gene, sep=separator)

    cpdb_genes = gene_generator(ensembl_db, uniprot_db, hla_genes, user_gene,
                                result_columns)

    cpdb_genes[result_columns].to_csv('{}/{}'.format(output_path,
                                                     'gene_generated.csv'),
                                      index=False)
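
The BioMart block above is a reusable pattern: an XML query is URL-encoded into the martservice endpoint and the response is parsed as CSV. The same request, stripped down to two attributes so it can be run standalone (endpoint and attribute names as in the function above):

import urllib.parse

import pandas as pd

query = ('<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query>'
         '<Query virtualSchemaName="default" formatter="CSV" header="1" '
         'uniqueRows="1" count="" datasetConfigVersion="0.6">'
         '<Dataset name="hsapiens_gene_ensembl" interface="default">'
         '<Attribute name="ensembl_gene_id" />'
         '<Attribute name="hgnc_symbol" />'
         '</Dataset></Query>')

url = 'http://www.ensembl.org/biomart/martservice?query={}'.format(
    urllib.parse.quote(query))
genes = pd.read_csv(url)  # columns arrive under BioMart display names, e.g. 'Gene stable ID'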
Example #4
import os
from typing import Optional

import numpy as np
import pandas as pd


def generate_proteins(user_protein: Optional[str], fetch_uniprot: bool,
                      result_path: str, log_file: str, project_name: str) -> None:
    uniprot_columns = {
        'Entry': 'uniprot',
        'Entry name': 'protein_name',
    }

    # additional data comes from given file or uniprot remote url
    if fetch_uniprot:
        try:
            source_url = 'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true' \
                         '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length' \
                         '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes' \
                         '&compress=yes'

            uniprot_db = pd.read_csv(source_url, sep='\t', compression='gzip')

            print('read remote uniprot file')
        except Exception as e:
            print('Error fetching remote UniProt data ({}), reading local data instead'.format(e))
            uniprot_db = pd.read_csv(os.path.join(data_dir,
                                                  'sources/uniprot.tab'),
                                     sep='\t')
            print('read local uniprot file')
    else:
        uniprot_db = pd.read_csv(os.path.join(data_dir, 'sources/uniprot.tab'),
                                 sep='\t')
        print('read local uniprot file')

    # pd.np was removed in pandas 1.0; use numpy's nan directly
    default_values = {
        'transmembrane': False,
        'peripheral': False,
        'secreted': False,
        'secreted_desc': np.nan,
        'secreted_highlight': False,
        'receptor': False,
        'receptor_desc': np.nan,
        'integrin': False,
        'other': False,
        'other_desc': np.nan,
        'tags': 'To_add',
        'tags_reason': np.nan,
        'tags_description': np.nan,
        'pfam': np.nan,
    }

    default_types = {
        'uniprot': str,
        'protein_name': str,
        'transmembrane': bool,
        'peripheral': bool,
        'secreted': bool,
        'secreted_desc': str,
        'secreted_highlight': bool,
        'receptor': bool,
        'receptor_desc': str,
        'integrin': bool,
        'other': bool,
        'other_desc': str,
        'tags': str,
        'tags_reason': str,
        'tags_description': str,
        'pfam': str,
    }

    result_columns = list(default_types.keys())

    output_path = _set_paths(result_path, project_name)
    log_path = '{}/{}'.format(output_path, log_file)
    uniprot_db = uniprot_db[list(
        uniprot_columns.keys())].rename(columns=uniprot_columns)
    curated_proteins = pd.read_csv(
        os.path.join(data_dir, 'sources/protein_curated.csv'))
    if user_protein:
        separator = _get_separator(os.path.splitext(user_protein)[-1])
        user_protein = pd.read_csv(user_protein, sep=separator)

    result = protein_generator(uniprot_db, curated_proteins, user_protein,
                               default_values, default_types, result_columns,
                               log_path)

    result[result_columns].to_csv('{}/{}'.format(output_path,
                                                 'protein_generated.csv'),
                                  index=False)
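
The default_values/default_types pair above is a dict-driven fill-and-coerce pattern: missing columns get a default, NaNs in existing columns are filled, and dtypes are coerced in one pass. A minimal illustration on a toy frame (hypothetical data, not the package's API):

import numpy as np
import pandas as pd

df = pd.DataFrame({'uniprot': ['P01234'], 'receptor': [np.nan]})

defaults = {'receptor': False, 'receptor_desc': np.nan}
types = {'uniprot': str, 'receptor': bool}

for column, value in defaults.items():
    if column not in df.columns:
        df[column] = value  # add any missing column wholesale
df = df.fillna({k: v for k, v in defaults.items() if not pd.isna(v)})
df = df.astype(types)       # coerce only the declared columns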
Example #5
import os
from typing import Optional

import pandas as pd


def generate_interactions(
    proteins: str,
    genes: str,
    complex: str,
    user_interactions: Optional[str],
    user_interactions_only: bool,
    result_path: str,
    fetch_imex: bool,
    fetch_iuphar: bool,
    project_name: str,
) -> None:
    if user_interactions_only and not user_interactions:
        raise Exception('user_interactions_only requires the --user-interactions parameter to be set')

    output_path = utils.set_paths(result_path, project_name)
    downloads_path = utils.set_paths(output_path, 'downloads')

    proteins = utils.read_data_table_from_file(proteins)
    genes = utils.read_data_table_from_file(genes)
    complexes = utils.read_data_table_from_file(complex)

    if not user_interactions_only:
        raw_imex = get_imex.call(genes, downloads_path, fetch_imex)

        interactions_to_remove = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/excluded_interaction.csv'))
        interaction_curated = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/interaction_curated.csv'))

    if user_interactions:
        separator = _get_separator(os.path.splitext(user_interactions)[-1])
        user_interactions = pd.read_csv(user_interactions, sep=separator)
        user_interactions['partner_a'] = user_interactions['partner_a'].apply(
            lambda x: str(x).strip())
        user_interactions['partner_b'] = user_interactions['partner_b'].apply(
            lambda x: str(x).strip())
        user_interactions['annotation_strategy'] = 'user_curated'

        if 'protein_name_a' not in user_interactions.columns:
            user_interactions['protein_name_a'] = ''

        if 'protein_name_b' not in user_interactions.columns:
            user_interactions['protein_name_b'] = ''

    result_columns = [
        'partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
        'annotation_strategy', 'source'
    ]
    if not user_interactions_only:
        print('Parsing IMEX file')
        imex_interactions = parse_interactions_imex(raw_imex, proteins, genes)

        print('Getting iuphar data')
        raw_iuphar = get_iuphar.call(downloads_path, fetch_iuphar)

        print('Generating iuphar interactions')
        iuphar_interactions = parse_iuphar_guidetopharmacology.call(
            raw_iuphar, genes, proteins)

        print('Merging iuphar/imex')
        merged_interactions = merge_iuphar_imex_interactions(
            iuphar_interactions, imex_interactions)

        print('Removing complex interactions')
        no_complex_interactions = only_noncomplex_interactions(
            merged_interactions, complexes)

        print('Removing selected interactions')
        clean_interactions = remove_interactions_in_file(
            no_complex_interactions, interactions_to_remove)

        print('Adding curated interaction')
        interactions_with_curated = add_curated(clean_interactions,
                                                interaction_curated)

        # DataFrame.append was removed in pandas 2.0; use concat instead, and
        # guard against user_interactions being None when none were provided
        to_merge = [interactions_with_curated]
        if user_interactions is not None:
            to_merge.append(user_interactions)

        result = tools_helper.normalize_interactions(
            pd.concat(to_merge, ignore_index=True, sort=False),
            'partner_a', 'partner_b').drop_duplicates(
                ['partner_a', 'partner_b'], keep='last')

    else:
        result = tools_helper.normalize_interactions(user_interactions, 'partner_a', 'partner_b') \
            .drop_duplicates(['partner_a', 'partner_b'], keep='last')

    result[result_columns].sort_values(['partner_a', 'partner_b']).to_csv(
        '{}/interaction_input.csv'.format(output_path), index=False)
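
A hypothetical end-to-end call, feeding generate_interactions the tables produced by the previous examples (all paths are illustrative):

# Hypothetical usage; fetch flags set to False reuse previously downloaded data.
generate_interactions(
    proteins='results/protein_generated.csv',
    genes='results/gene_generated.csv',
    complex='results/complex_generated.csv',
    user_interactions=None,
    user_interactions_only=False,
    result_path='results',
    fetch_imex=False,
    fetch_iuphar=False,
    project_name='my_project',
)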