def dot_plot(
        robjects,
        r_runtime_error: Exception,
        means_path: str,
        pvalues_path: str,
        output_path: str,
        output_name: str,
        rows: Optional[str] = None,
        columns: Optional[str] = None,
) -> None:
    pvalues_separator = _get_separator(os.path.splitext(pvalues_path)[-1])
    means_separator = _get_separator(os.path.splitext(means_path)[-1])
    output_extension = os.path.splitext(output_name)[-1].lower()
    filename = os.path.join(output_path, output_name)

    means_df = pd.read_csv(means_path, sep=means_separator)
    n_rows, n_cols = means_df.shape
    # The leading 11 columns of the means file are interaction metadata, not
    # cluster-pair values, so they should not count toward the plot width.
    n_cols -= 11

    n_rows, selected_rows = selected_items(rows, n_rows)
    n_cols, selected_columns = selected_items(columns, n_cols)

    # Load the R plotting script that ships next to this module.
    this_file_dir = os.path.dirname(os.path.realpath(__file__))
    robjects.r.source(os.path.join(this_file_dir, 'R/plot_dot_by_column_name.R'))

    available_names = list(robjects.globalenv.keys())
    plot_function = 'dot_plot'

    if plot_function in available_names:
        function_name = plot_function
    else:
        raise MissingPlotterFunctionException()

    plotter = robjects.r[function_name]

    try:
        # Figure dimensions scale with the number of selected rows/columns.
        plotter(selected_rows=selected_rows,
                selected_columns=selected_columns,
                filename=filename,
                width=int(5 + max(3, n_cols * 0.8)),
                height=int(5 + max(5, n_rows * 0.5)),
                means_path=means_path,
                pvalues_path=pvalues_path,
                means_separator=means_separator,
                pvalues_separator=pvalues_separator,
                output_extension=output_extension)
    except r_runtime_error as e:
        # Wrap R-side failures in the package's own exception type.
        raise RRuntimeException(e)

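# Usage sketch (illustrative, not part of the original module): the R bridge and the
# R runtime error class are injected by the caller so rpy2 remains an optional
# dependency. Import locations assume rpy2 3.x (older releases expose RRuntimeError
# from rpy2.rinterface); the paths below are hypothetical placeholders.
def _example_dot_plot_call(out_dir: str = 'out') -> None:
    import rpy2.robjects as ro  # assumed optional dependency
    from rpy2.rinterface_lib.embedded import RRuntimeError  # rpy2 3.x location

    dot_plot(robjects=ro,
             r_runtime_error=RRuntimeError,
             means_path=os.path.join(out_dir, 'means.txt'),
             pvalues_path=os.path.join(out_dir, 'pvalues.txt'),
             output_path=out_dir,
             output_name='dot_plot.pdf')
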
def generate_complex(user_complex: Optional[str], result_path: str, log_file: str):
    output_path = _set_paths(output_dir, result_path)
    log_path = '{}/{}'.format(output_path, log_file)

    curated_complex = pd.read_csv(os.path.join(data_dir, 'sources/complex_curated.csv'))

    if user_complex:
        separator = _get_separator(os.path.splitext(user_complex)[-1])
        user_complex = pd.read_csv(user_complex, sep=separator)

    result = complex_generator(curated_complex, user_complex, log_path)

    result.to_csv('{}/{}'.format(output_path, 'complex_generated.csv'), index=False)

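# Note for readers: `_get_separator` is provided elsewhere in this package. A plausible
# (hypothetical, differently named) equivalent simply maps a file extension to a pandas
# `sep` value; it is shown here only to make the user-input handling above concrete.
def _example_separator_for(extension: str) -> str:
    # '.csv' -> comma, anything else (e.g. '.tsv', '.txt') -> tab; illustrative only.
    return ',' if extension.lower() == '.csv' else '\t'
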
def generate_genes(
        user_gene: Optional[str],
        fetch_uniprot: bool,
        fetch_ensembl: bool,
        result_path: str,
        project_name: str,
) -> None:
    output_path = _set_paths(result_path, project_name)

    # TODO: Add logger
    if fetch_ensembl:
        # Build a BioMart XML query against the human gene dataset and download it as CSV.
        print('fetching remote ensembl data ... ', end='')
        source_url = 'http://www.ensembl.org/biomart/martservice?query={}'
        query = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" ' \
                'formatter = "CSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >' \
                '<Dataset name = "hsapiens_gene_ensembl" interface = "default" >' \
                '<Attribute name = "ensembl_gene_id" />' \
                '<Attribute name = "ensembl_transcript_id" />' \
                '<Attribute name = "external_gene_name" />' \
                '<Attribute name = "hgnc_symbol" />' \
                '<Attribute name = "uniprotswissprot" />' \
                '</Dataset>' \
                '</Query>'

        url = source_url.format(urllib.parse.quote(query))
        ensembl_db = pd.read_csv(url)
        print('done')
    else:
        ensembl_db = utils.read_data_table_from_file(os.path.join(data_dir, 'sources/ensembl.txt'))
        print('read local ensembl file')

    # Additional data comes from the given file or the UniProt remote URL.
    if fetch_uniprot:
        try:
            print('fetching remote uniprot file ... ', end='')
            source_url = 'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true' \
                         '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length' \
                         '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes' \
                         '&compress=yes'

            uniprot_db = pd.read_csv(source_url, sep='\t', compression='gzip')
            print('done')
        except Exception as e:
            print('Error fetching remote UniProt data, fetching local data')
            uniprot_db = pd.read_csv(os.path.join(data_dir, 'sources/uniprot.tab'), sep='\t')
            print('read local uniprot file')
    else:
        uniprot_db = utils.read_data_table_from_file(os.path.join(data_dir, 'sources/uniprot.tab'))
        print('read local uniprot file')

    # Map source column names to the identifiers used in the generated table.
    ensembl_columns = {
        'Gene name': 'gene_name',
        'Gene stable ID': 'ensembl',
        'HGNC symbol': 'hgnc_symbol',
        'UniProtKB/Swiss-Prot ID': 'uniprot'
    }

    uniprot_columns = {'Entry': 'uniprot', 'Gene names': 'gene_names'}

    result_columns = ['gene_name', 'uniprot', 'hgnc_symbol', 'ensembl']

    ensembl_db = ensembl_db[list(ensembl_columns.keys())].rename(columns=ensembl_columns)
    uniprot_db = uniprot_db[list(uniprot_columns.keys())].rename(columns=uniprot_columns)

    hla_genes = utils.read_data_table_from_file(os.path.join(data_dir, 'sources/hla_curated.csv'))

    if user_gene:
        separator = _get_separator(os.path.splitext(user_gene)[-1])
        user_gene = pd.read_csv(user_gene, sep=separator)

    cpdb_genes = gene_generator(ensembl_db, uniprot_db, hla_genes, user_gene, result_columns)

    cpdb_genes[result_columns].to_csv('{}/{}'.format(output_path, 'gene_generated.csv'), index=False)

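# Illustrative sketch (not part of the original module): how this generator is typically
# driven. The result path and project name below are placeholders; with both fetch flags
# set to False the function falls back to the bundled ensembl.txt and uniprot.tab sources.
def _example_generate_genes_call() -> None:
    generate_genes(user_gene=None,
                   fetch_uniprot=False,   # use the local uniprot.tab instead of the remote dump
                   fetch_ensembl=False,   # use the local ensembl.txt instead of BioMart
                   result_path='out',
                   project_name='custom_db')
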
def generate_proteins(user_protein: Optional[str],
                      fetch_uniprot: bool,
                      result_path: str,
                      log_file: str,
                      project_name: str):
    uniprot_columns = {
        'Entry': 'uniprot',
        'Entry name': 'protein_name',
    }

    # Additional data comes from the given file or the UniProt remote URL.
    if fetch_uniprot:
        try:
            source_url = 'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true' \
                         '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length' \
                         '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes' \
                         '&compress=yes'

            uniprot_db = pd.read_csv(source_url, sep='\t', compression='gzip')
            print('read remote uniprot file')
        except Exception as e:
            print('Error fetching remote UniProt data, fetching local data')
            uniprot_db = pd.read_csv(os.path.join(data_dir, 'sources/uniprot.tab'), sep='\t')
            print('read local uniprot file')
    else:
        uniprot_db = pd.read_csv(os.path.join(data_dir, 'sources/uniprot.tab'), sep='\t')
        print('read local uniprot file')

    default_values = {
        'transmembrane': False,
        'peripheral': False,
        'secreted': False,
        'secreted_desc': pd.np.nan,
        'secreted_highlight': False,
        'receptor': False,
        'receptor_desc': pd.np.nan,
        'integrin': False,
        'other': False,
        'other_desc': pd.np.nan,
        'tags': 'To_add',
        'tags_reason': pd.np.nan,
        'tags_description': pd.np.nan,
        'pfam': pd.np.nan,
    }

    default_types = {
        'uniprot': str,
        'protein_name': str,
        'transmembrane': bool,
        'peripheral': bool,
        'secreted': bool,
        'secreted_desc': str,
        'secreted_highlight': bool,
        'receptor': bool,
        'receptor_desc': str,
        'integrin': bool,
        'other': bool,
        'other_desc': str,
        'tags': str,
        'tags_reason': str,
        'tags_description': str,
        'pfam': str,
    }

    result_columns = list(default_types.keys())

    output_path = _set_paths(result_path, project_name)
    log_path = '{}/{}'.format(output_path, log_file)

    uniprot_db = uniprot_db[list(uniprot_columns.keys())].rename(columns=uniprot_columns)

    curated_proteins = pd.read_csv(os.path.join(data_dir, 'sources/protein_curated.csv'))

    if user_protein:
        separator = _get_separator(os.path.splitext(user_protein)[-1])
        user_protein = pd.read_csv(user_protein, sep=separator)

    result = protein_generator(uniprot_db, curated_proteins, user_protein, default_values,
                               default_types, result_columns, log_path)

    result[result_columns].to_csv('{}/{}'.format(output_path, 'protein_generated.csv'), index=False)

def generate_interactions(
        proteins: str,
        genes: str,
        complex: str,
        user_interactions: Optional[str],
        user_interactions_only: bool,
        result_path: str,
        fetch_imex: bool,
        fetch_iuphar: bool,
        project_name: str,
) -> None:
    if user_interactions_only and not user_interactions:
        raise Exception('You need to set --user-interactions parameter')

    output_path = utils.set_paths(result_path, project_name)
    downloads_path = utils.set_paths(utils.set_paths(result_path, project_name), 'downloads')

    proteins = utils.read_data_table_from_file(proteins)
    genes = utils.read_data_table_from_file(genes)
    complexes = utils.read_data_table_from_file(complex)

    if not user_interactions_only:
        raw_imex = get_imex.call(genes, downloads_path, fetch_imex)

    interactions_to_remove = utils.read_data_table_from_file(
        os.path.join(data_dir, 'sources/excluded_interaction.csv'))
    interaction_curated = utils.read_data_table_from_file(
        os.path.join(data_dir, 'sources/interaction_curated.csv'))

    if user_interactions:
        separator = _get_separator(os.path.splitext(user_interactions)[-1])
        user_interactions = pd.read_csv(user_interactions, sep=separator)

        # Normalise the user-provided table: strip whitespace from partners, tag the
        # annotation strategy, and make sure the optional name columns exist.
        user_interactions['partner_a'] = user_interactions['partner_a'].apply(lambda x: str(x).strip())
        user_interactions['partner_b'] = user_interactions['partner_b'].apply(lambda x: str(x).strip())
        user_interactions['annotation_strategy'] = 'user_curated'

        if 'protein_name_a' not in user_interactions.columns:
            user_interactions['protein_name_a'] = ''
        if 'protein_name_b' not in user_interactions.columns:
            user_interactions['protein_name_b'] = ''

    result_columns = [
        'partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
        'annotation_strategy', 'source'
    ]

    if not user_interactions_only:
        print('Parsing IMEX file')
        imex_interactions = parse_interactions_imex(raw_imex, proteins, genes)

        print('Getting iuphar data')
        raw_iuphar = get_iuphar.call(downloads_path, fetch_iuphar)

        print('Generating iuphar interactions')
        iuphar_interactions = parse_iuphar_guidetopharmacology.call(raw_iuphar, genes, proteins)

        print('Merging iuphar/imex')
        merged_interactions = merge_iuphar_imex_interactions(iuphar_interactions, imex_interactions)

        print('Removing complex interactions')
        no_complex_interactions = only_noncomplex_interactions(merged_interactions, complexes)

        print('Removing selected interactions')
        clean_interactions = remove_interactions_in_file(no_complex_interactions, interactions_to_remove)

        print('Adding curated interaction')
        interactions_with_curated = add_curated(clean_interactions, interaction_curated)

        # User-provided interactions take precedence over generated ones on duplicates.
        result = tools_helper.normalize_interactions(
            interactions_with_curated.append(user_interactions, ignore_index=True, sort=False),
            'partner_a', 'partner_b').drop_duplicates(['partner_a', 'partner_b'], keep='last')
    else:
        result = tools_helper.normalize_interactions(user_interactions, 'partner_a', 'partner_b') \
            .drop_duplicates(['partner_a', 'partner_b'], keep='last')

    result[result_columns].sort_values(['partner_a', 'partner_b']).to_csv(
        '{}/interaction_input.csv'.format(output_path), index=False)

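# Illustrative sketch (not part of the original module): the minimal shape a user-supplied
# interactions table needs before generate_interactions will accept it. `partner_a` and
# `partner_b` are required; `protein_name_a`/`protein_name_b` are filled with empty strings
# when absent, and `source` appears in result_columns, so including it avoids a missing
# column in a --user-interactions-only run. The accession values below are placeholders.
def _example_user_interactions_frame() -> pd.DataFrame:
    return pd.DataFrame({
        'partner_a': ['P01234'],   # placeholder UniProt accession or complex name
        'partner_b': ['Q05678'],   # placeholder UniProt accession or complex name
        'source': ['my_lab'],      # free-text provenance kept in the generated output
    })
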