def __init__(self, genes, resource_manager=None):
    """Initialize the assembler with an empty multigraph.

    Parameters
    ----------
    genes : list of dict
        The gene references; stored unchanged on the instance.
    resource_manager : Optional[ResourceManager]
        The resource manager to use. If a falsy value is given, a
        default-constructed ResourceManager is used instead.
    """
    self.genes = genes
    self.graph = nx.MultiGraph()
    # Fall back to a default manager when none (or a falsy one) is given
    self.resource_manager = resource_manager or ResourceManager()
    # The GO DAG and the GO annotations are left unset by this constructor
    self.go_dag = None
    self.goa = None
def __init__(self, genes, resource_manager=None):
    """Initialize the assembler and load the GO DAG and GO annotations.

    Parameters
    ----------
    genes : list of dict
        The gene references; stored unchanged on the instance.
    resource_manager : Optional[ResourceManager]
        The resource manager to use. If a falsy value is given, a
        default-constructed ResourceManager is used instead.
    """
    self.genes = genes
    self.graph = nx.MultiGraph()
    # Fall back to a default manager when none (or a falsy one) is given
    self.resource_manager = resource_manager or ResourceManager()
    # Both loaders below read their input through the resource manager
    obo_source = self.resource_manager.get_go_obo()
    self.go_dag = GODag(obo_source)
    self.goa = self._load_goa_gaf()
def test_read_gene_list_entrez_mouse():
    """Read a one-entry mouse Entrez gene list and check its mapping."""
    rm = ResourceManager(base_folder=default_base_folder)
    list_path = 'test_gene_list_entrez_mouse.txt'
    # Write a single Entrez ID into a gene list file in the working folder
    with open(list_path, 'w') as out:
        out.write('14433')
    refs = read_gene_list(list_path, 'entrez_mouse', rm)
    assert len(refs) == 1
    only_ref = refs[0]
    assert only_ref['MGI'] == '95640'
    assert only_ref['HGNC_SYMBOL'] == 'GAPDH'
def test_read_gene_list_rgd():
    """Read RGD IDs given both with and without the 'RGD:' prefix."""
    rm = ResourceManager(base_folder=default_base_folder)
    list_path = 'test_gene_list_rgd.txt'
    # One bare ID and one prefixed ID, the latter without trailing newline
    with open(list_path, 'w') as out:
        out.write('2561\n' 'RGD:69323')
    refs = read_gene_list(list_path, 'rgd_id', rm)
    assert len(refs) == 2, refs
    expected = (('2561', 'ERBB2'), ('69323', 'ERBB3'))
    for idx, (rgd_id, symbol) in enumerate(expected):
        assert refs[idx]['RGD'] == rgd_id
        assert refs[idx]['HGNC_SYMBOL'] == symbol
def run_main(args):
    """Run the processing stage(s) selected by ``args.stage``.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments. The attributes read here are: base_folder,
        project, stage, random_seed, genes, id_type, network_source,
        network_file, nreps_graph, nreps_null, nproc, dim_rep, save_dw
        and alpha_fdr.
    """
    # Now we run the relevant stage of processing
    project_folder = create_folder(args.base_folder, args.project)

    # Add a logger specific to the project and processing stage
    log_file = os.path.join(project_folder, 'genewalk_%s.log' % args.stage)
    formatter = logging.Formatter(default_logger_format,
                                  datefmt=default_date_format)
    project_log_handler = logging.FileHandler(log_file)
    project_log_handler.setFormatter(formatter)
    root_logger.addHandler(project_log_handler)

    # Compare against None rather than relying on truthiness so that an
    # explicitly requested seed of 0 is honored instead of being ignored
    if args.random_seed is not None:
        logger.info('Running with random seed %d', args.random_seed)
        random.seed(a=int(args.random_seed))

    # Make sure we have all the resource files
    rm = ResourceManager(base_folder=args.base_folder)
    rm.download_all()

    if args.stage in ('all', 'node_vectors'):
        genes = read_gene_list(args.genes, args.id_type, rm)
        save_pickle(genes, project_folder, 'genes')
        MG = load_network(args.network_source, args.network_file, genes,
                          resource_manager=rm)
        save_pickle(MG.graph, project_folder, 'multi_graph')
        for i in range(args.nreps_graph):
            logger.info('%s/%s', i + 1, args.nreps_graph)
            DW = run_walks(MG.graph, workers=args.nproc, size=args.dim_rep)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW, nv
            gc.collect()

    if args.stage in ('all', 'null_distribution'):
        # Reload the pickled graph: in an 'all' run, MG still refers to
        # the object returned by load_network rather than to its graph
        MG = load_pickle(project_folder, 'multi_graph')
        srd = []
        for i in range(args.nreps_null):
            logger.info('%s/%s', i + 1, args.nreps_null)
            RG = get_rand_graph(MG)
            DW = run_walks(RG, workers=args.nproc, size=args.dim_rep)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_rand_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_rand_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW
            gc.collect()

            # Calculate the null distributions
            srd += get_null_distributions(RG, nv)
            del nv
            gc.collect()
        srd = np.asarray(sorted(srd))
        save_pickle(srd, project_folder, 'genewalk_rand_simdists')

    if args.stage in ('all', 'statistics'):
        MG = load_pickle(project_folder, 'multi_graph')
        genes = load_pickle(project_folder, 'genes')
        nvs = [load_pickle(project_folder,
                           'deepwalk_node_vectors_%d' % (i + 1))
               for i in range(args.nreps_graph)]
        null_dist = load_pickle(project_folder, 'genewalk_rand_simdists')
        GW = GeneWalk(MG, genes, nvs, null_dist)
        df = GW.generate_output(alpha_fdr=args.alpha_fdr,
                                base_id_type=args.id_type)
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        logger.info('Saving final results into %s', fname)
        df.to_csv(fname, index=False, float_format='%.3e')

    if args.stage in ('all', 'visual'):
        # The plots are generated from the results table saved on disk
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        dGW = pd.read_csv(fname)
        figure_folder = create_folder(project_folder, 'figures')
        create_folder(figure_folder, 'barplots')
        GWp = GW_Plotter(figure_folder, dGW, args.alpha_fdr)
        GWp.generate_plots()
class NxMgAssembler(object):
    """Class which assembles a networkx MultiGraph based on a list of genes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the graph is assembled.
    resource_manager : Optional[ResourceManager]
        The object whose get_go_obo() and get_goa_gaf() methods provide
        the GO ontology and GO annotation inputs. If a falsy value is
        given, a default-constructed ResourceManager is used.

    Attributes
    ----------
    graph : networkx.MultiGraph
        The assembled graph containing links for interactions between genes,
        GO annotations for genes, and the GO ontology.
    """
    def __init__(self, genes, resource_manager=None):
        self.genes = genes
        self.graph = nx.MultiGraph()
        if not resource_manager:
            self.resource_manager = ResourceManager()
        else:
            self.resource_manager = resource_manager
        self.go_dag = GODag(self.resource_manager.get_go_obo())
        self.goa = self._load_goa_gaf()

    def _get_go_terms_for_gene(self, gene):
        """Return the sorted, unique GO IDs annotated to a gene.

        An empty list is returned if the gene reference lacks a 'UP' or
        'HGNC_SYMBOL' entry, or if its HGNC symbol is not in the graph.
        """
        if ('UP' not in gene) or ('HGNC_SYMBOL' not in gene):
            return []
        elif gene['HGNC_SYMBOL'] not in self.graph:
            return []
        # Filter to rows with the given gene's UniProt ID
        df = self.goa[self.goa['DB_ID'] == gene['UP']]
        return sorted(set(df['GO_ID']))

    def add_go_annotations(self):
        """Add edges between gene nodes and GO nodes based on GO
        annotations."""
        logger.info('Adding GO annotations for genes in graph.')
        for gene in self.genes:
            go_ids = self._get_go_terms_for_gene(gene)
            for go_id in go_ids:
                # Annotations to GO IDs missing from the DAG are skipped
                if go_id in self.go_dag:
                    go_term = self.go_dag[go_id]
                    if go_term.is_obsolete:
                        continue
                    self.graph.add_node(go_term.id,
                                        name=go_term.name,
                                        GO=go_term.id,
                                        domain=go_term.namespace)
                    self.graph.add_edge(gene['HGNC_SYMBOL'], go_term.id,
                                        label='GO:annotation')

    def add_go_ontology(self):
        """Add edges between GO nodes based on the GO ontology."""
        logger.info('Adding GO ontology edges to graph.')
        for go_term in list(self.go_dag.values()):
            if go_term.is_obsolete:
                continue
            # The node is added once here; adding it again with the same
            # attributes for every parent would not change the graph
            self.graph.add_node(go_term.id,
                                name=go_term.name,
                                GO=go_term.id,
                                domain=go_term.namespace)
            for parent_term in go_term.parents:
                if parent_term.is_obsolete:
                    continue
                self.graph.add_edge(go_term.id, parent_term.id,
                                    label='GO:is_a')

    def node2edges(self, node_key):
        """Return the edges corresponding to a node."""
        return self.graph.edges(node_key, keys=True)

    def save_graph(self, fname):
        """Save the file into a GraphML file.

        Parameters
        ----------
        fname : str
            The name of the file to save the graph into.
        """
        nx.write_graphml(self.graph, fname)

    def _load_goa_gaf(self):
        """Load the gene/GO annotations as a pandas data frame.

        Returns
        -------
        pandas.DataFrame
            The annotation rows sorted by DB_ID and GO_ID, without rows
            whose qualifier starts with "NOT" and restricted to rows whose
            evidence code is in the accepted set defined below.
        """
        goa_ec = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'HTP', 'HDA',
                  'HMP', 'HGI', 'HEP', 'IBA', 'IBD'}
        # NOTE(review): the first 23 lines are skipped unconditionally,
        # presumably the header of the annotation file - confirm that this
        # matches the file version being downloaded
        goa = pd.read_csv(self.resource_manager.get_goa_gaf(), sep='\t',
                          skiprows=23, dtype=str,
                          header=None,
                          names=['DB',
                                 'DB_ID',
                                 'DB_Symbol',
                                 'Qualifier',
                                 'GO_ID',
                                 'DB_Reference',
                                 'Evidence_Code',
                                 'With_From',
                                 'Aspect',
                                 'DB_Object_Name',
                                 'DB_Object_Synonym',
                                 'DB_Object_Type',
                                 'Taxon',
                                 'Date',
                                 'Assigned',
                                 'Annotation_Extension',
                                 'Gene_Product_Form_ID'])
        goa = goa.sort_values(by=['DB_ID', 'GO_ID'])
        # Filter out all "NOT" negative evidences. The column is assigned
        # back explicitly because an in-place fillna on a column selection
        # is not guaranteed to modify the data frame itself.
        goa['Qualifier'] = goa['Qualifier'].fillna('')
        goa = goa[~goa['Qualifier'].str.startswith('NOT')]
        # Filter to rows with an accepted evidence code
        goa = goa[goa['Evidence_Code'].isin(goa_ec)]
        return goa
def test_read_custom_list():
    """Read a gene list with custom IDs from the test resources."""
    manager = ResourceManager(base_folder=default_base_folder)
    refs = read_gene_list(
        os.path.join(TEST_RESOURCES, 'custom_gene_list.txt'),
        'custom',
        manager,
    )
    assert len(refs) == 3, refs
    # The first entry is expected to carry only the custom ID
    assert refs[0] == dict(ID='CUSTOM:ABC'), refs
import os
from nose.tools import raises
from genewalk.gene_lists import *
from genewalk.cli import default_base_folder
from genewalk.resources import ResourceManager
from .util import TEST_RESOURCES

# Shared fixtures: one resource manager and one gene mapper for the module
rm = ResourceManager()
gm = GeneMapper(rm)


def test_map_lists():
    """Map HGNC symbols, HGNC IDs and MGI IDs and check the references."""
    # Expected reference entries, in the order in which they are checked
    braf = (('HGNC', '1097'), ('UP', 'P15056'), ('HGNC_SYMBOL', 'BRAF'))
    kras = (('HGNC', '6407'), ('UP', 'P01116'), ('HGNC_SYMBOL', 'KRAS'))
    cases = (
        (map_hgnc_symbols, ['BRAF', 'KRAS'], (braf, kras)),
        (map_hgnc_ids, ['1097', '6407'], (braf, kras)),
        (map_mgi_ids, ['MGI:892970'],
         ((('HGNC', '6817'), ('HGNC_SYMBOL', 'MAL')),)),
    )
    for mapper, identifiers, expected_refs in cases:
        refs = mapper(identifiers, gm)
        for idx, expected in enumerate(expected_refs):
            for key, value in expected:
                assert refs[idx][key] == value, refs
import os
from genewalk.gene_lists import read_gene_list
from genewalk.nx_mg_assembler import UserNxMgAssembler
from genewalk.resources import ResourceManager
from .util import TEST_RESOURCES, TEST_BASE_FOLDER

# Shared fixtures: resource manager, SIF test files and the gene list
rm = ResourceManager(TEST_BASE_FOLDER)

sif_genes = os.path.join(TEST_RESOURCES, 'test_sif.sif')
sif_annots = os.path.join(TEST_RESOURCES, 'test_sif_annot.sif')
sif_full = os.path.join(TEST_RESOURCES, 'test_sif_full.sif')

genes = read_gene_list(os.path.join(TEST_RESOURCES, 'hgnc_symbols.txt'),
                       id_type='hgnc_symbol', resource_manager=rm)


def test_gene_only_sif():
    """Assemble a graph from the gene-only SIF file and check its content."""
    mga = UserNxMgAssembler(genes, resource_manager=rm,
                            filepath=sif_genes, gwn_format='sif')
    graph = mga.graph
    expected_genes = {'KRAS', 'BRAF', 'MAP2K2', 'MAPK1', 'PIK3CA', 'AKT1'}
    expected_go = {'GO:0001934', 'GO:0005515', 'GO:0000186',
                   'GO:0000001', 'GO:0032147', 'GO:0003924'}
    assert set(graph.nodes()) == expected_genes.union(expected_go)
    # Make sure we have GO node annotations as expected
    go_node = graph.nodes['GO:0000186']
    expected_attrs = (
        ('GO', 'GO:0000186'),
        ('domain', 'biological_process'),
        ('name', 'activation of MAPKK activity'),
    )
    for attr, value in expected_attrs:
        assert go_node[attr] == value, go_node
    # Make sure we have GO annotation edges
    assert ('BRAF', 'GO:0000186') in graph.edges
def _str2bool(value):
    """Convert a command line string into a bool for argparse."""
    # bool() cannot be used as an argparse type since any non-empty string,
    # including 'False', is truthy
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Expected a boolean value (e.g., True or False), got %r' % value)


def main():
    """Parse the command line arguments and run the selected stage(s).

    The stage given in the --stage argument determines which of the
    node_vectors, null_distribution and statistics stages are run; with
    the default value 'all', each of them is run in this order.
    Intermediate and final results are written into the project folder.
    """
    parser = argparse.ArgumentParser(
        description='Run GeneWalk on a list of genes provided in a text '
                    'file.')
    parser.add_argument('--version', action='version',
                        version='GeneWalk %s' % __version__,
                        help='Print the version of GeneWalk and exit.')
    parser.add_argument('--project', help='A name for the project which '
                                          'determines the folder within the '
                                          'base folder in which the '
                                          'intermediate and final results '
                                          'are written. Must contain only '
                                          'characters that are valid in '
                                          'folder names.',
                        required=True)
    parser.add_argument('--genes', help='Path to a text file with a list of '
                                        'genes of interest, for example '
                                        'differentially expressed genes. '
                                        'The type of gene identifiers used in '
                                        'the text file are provided in the '
                                        'id_type argument.',
                        required=True)
    parser.add_argument('--id_type',
                        help='The type of gene IDs provided in the text file '
                             'in the genes argument. Possible values are: '
                             'hgnc_symbol, hgnc_id, ensembl_id, mgi_id, '
                             'entrez_human, and entrez_mouse.',
                        choices=['hgnc_symbol', 'hgnc_id',
                                 'ensembl_id', 'mgi_id',
                                 'entrez_human', 'entrez_mouse'],
                        required=True)
    parser.add_argument('--stage', default='all',
                        help='The stage of processing to run. Default: '
                             '%(default)s',
                        choices=['all', 'node_vectors', 'null_distribution',
                                 'statistics'])
    parser.add_argument('--base_folder', default=default_base_folder,
                        help='The base folder used to store GeneWalk '
                             'temporary and result files for a given project.'
                             ' Default: %(default)s')
    parser.add_argument('--network_source', default='pc',
                        help='The source of the network to be used. '
                             'Possible values are: pc, indra, edge_list, and '
                             'sif. In case of indra, edge_list, and sif, '
                             'the network_file argument must be specified.'
                             ' Default: %(default)s',
                        choices=['pc', 'indra', 'edge_list', 'sif'])
    parser.add_argument('--network_file', default=None,
                        help='If network_source is indra, this argument '
                             'points to a Python pickle file in which a list '
                             'of INDRA Statements constituting the network '
                             'is contained. In case network_source is '
                             'edge_list or sif, '
                             'the network_file argument points to a text file '
                             'representing the network.')
    parser.add_argument('--nproc', default=1, type=int,
                        help='The number of processors to use in a '
                             'multiprocessing environment. Default: '
                             '%(default)s')
    parser.add_argument('--nreps_graph', default=3, type=int,
                        help='The number of repeats to run when calculating '
                             'node vectors on the GeneWalk graph. '
                             'Default: %(default)s')
    parser.add_argument('--nreps_null', default=3, type=int,
                        help='The number of repeats to run when calculating '
                             'node vectors on the random network graphs '
                             'for constructing the null distribution. '
                             'Default: %(default)s')
    parser.add_argument('--alpha_fdr', default=1, type=float,
                        help='The false discovery rate to use when '
                             'outputting the final statistics table. '
                             'If 1 (default), all similarities are output, '
                             'otherwise only the ones whose false discovery '
                             'rate are below this parameter are included. '
                             'Default: %(default)s')
    # A dedicated converter is used so that "--save_dw False" is actually
    # interpreted as False
    parser.add_argument('--save_dw', default=False, type=_str2bool,
                        help='If True, the full DeepWalk object for each '
                             'repeat is saved in the project folder. This can '
                             'be useful for debugging but the files are '
                             'typically very large. Default: %(default)s')
    parser.add_argument('--random_seed', default=None, type=int,
                        help='If provided, the random number generator is '
                             'seeded with the given value. This should only '
                             'be used if the goal is to deterministically '
                             'reproduce a prior result obtained with the same '
                             'random seed.')
    args = parser.parse_args()

    # Now we run the relevant stage of processing
    project_folder = create_project_folder(args.base_folder, args.project)

    # Add a logger specific to the project and processing stage
    log_file = os.path.join(project_folder, 'genewalk_%s.log' % args.stage)
    formatter = logging.Formatter(default_logger_format,
                                  datefmt=default_date_format)
    project_log_handler = logging.FileHandler(log_file)
    project_log_handler.setFormatter(formatter)
    root_logger.addHandler(project_log_handler)

    # Compare against None rather than relying on truthiness so that an
    # explicitly requested seed of 0 is honored instead of being ignored
    if args.random_seed is not None:
        logger.info('Running with random seed %d', args.random_seed)
        random.seed(a=int(args.random_seed))

    # Make sure we have all the resource files
    rm = ResourceManager(base_folder=args.base_folder)
    rm.download_all()

    if args.stage in ('all', 'node_vectors'):
        genes = read_gene_list(args.genes, args.id_type, rm)
        save_pickle(genes, project_folder, 'genes')
        MG = load_network(args.network_source, args.network_file, genes,
                          resource_manager=rm)
        save_pickle(MG.graph, project_folder, 'multi_graph')
        for i in range(args.nreps_graph):
            logger.info('%s/%s', i + 1, args.nreps_graph)
            DW = run_walks(MG.graph, workers=args.nproc)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW, nv
            gc.collect()

    if args.stage in ('all', 'null_distribution'):
        # Reload the pickled graph: in an 'all' run, MG still refers to
        # the object returned by load_network rather than to its graph
        MG = load_pickle(project_folder, 'multi_graph')
        srd = []
        for i in range(args.nreps_null):
            logger.info('%s/%s', i + 1, args.nreps_null)
            RG = get_rand_graph(MG)
            DW = run_walks(RG, workers=args.nproc)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_rand_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_rand_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW
            gc.collect()

            # Calculate the null distributions
            srd += get_null_distributions(RG, nv)
            del nv
            gc.collect()
        srd = np.asarray(sorted(srd))
        save_pickle(srd, project_folder, 'genewalk_rand_simdists')

    if args.stage in ('all', 'statistics'):
        MG = load_pickle(project_folder, 'multi_graph')
        genes = load_pickle(project_folder, 'genes')
        nvs = [load_pickle(project_folder,
                           'deepwalk_node_vectors_%d' % (i + 1))
               for i in range(args.nreps_graph)]
        null_dist = load_pickle(project_folder, 'genewalk_rand_simdists')
        GW = GeneWalk(MG, genes, nvs, null_dist)
        df = GW.generate_output(alpha_fdr=args.alpha_fdr,
                                base_id_type=args.id_type)
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        logger.info('Saving final results into %s', fname)
        df.to_csv(fname, index=False, float_format='%.3e')