def run(self):
    while True:
        if len(multiprocessing.active_children()) < multiprocessing.cpu_count():
            # look for the next pending job
            client = pymongo.MongoClient(app.mongodb_uri)
            job = client.nav.jobs.find_one({'status': 'pending'})
            if job:
                # start the job as a new process
                set_status(job, 'processing')
                client.nav.jobs.save(job)
                try:
                    process = multiprocessing.Process(
                        name=self.process_name(job['_id']),
                        target=run_job,
                        args=(job, self.event))
                    process.start()
                    log.info('processing job %s', job['_id'])
                except Exception as e:
                    set_status(job, 'error')
                    client.nav.jobs.save(job)
                    log.error('could not start job %s (%s)', job['_id'], e)
        # wait for events like new, cancelled, or finished jobs
        self.event.wait()
        self.event.clear()
def cancel(self, id):
    client = pymongo.MongoClient(app.mongodb_uri)
    job = client.nav.jobs.find_one({'_id': id})
    if job:
        log.info('cancelling job %s with status %s', id, job['status'])
        # look for job among active processes
        for process in multiprocessing.active_children():
            if process.name == self.process_name(id):
                # send terminate signal to process and wait for it to finish
                log.info('terminating job %s', id)
                process.terminate()
                log.info('waiting for job %s to terminate', id)
                process.join()
                log.info('job %s terminated', id)
                # inform the job queue thread
                self.event.set()
        set_status(job, 'cancelled')
        client.nav.jobs.save(job)
    else:
        log.error('cancel job failed because job %s was not found', id)
        raise LookupError
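# process_name() is used by run() and cancel() above but is not defined in this listing;
# the following is a minimal sketch, assuming it only needs to derive a stable, queryable
# process name from the job id. The 'job-' prefix is an assumption, not the project's
# actual convention.
def process_name(self, id):
    return 'job-{}'.format(id)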
def save_network():
    client = pymongo.MongoClient(app.mongodb_uri)
    network = request.json
    network = deserialize(network)  # convert string id to ObjectId
    set_status(network, 'updated')
    client.nav.networks.save(network)
    return {'timestamp': network['timestamp']['updated']}
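# set_status() is called throughout but not shown here; a minimal sketch, assuming it
# records the new status and a per-status timestamp, since save_network() reads
# network['timestamp']['updated'] immediately after calling it. The exact field layout
# and timestamp type are assumptions.
import datetime

def set_status(obj, status):
    obj['status'] = status
    obj.setdefault('timestamp', {})[status] = datetime.datetime.utcnow()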
def submit(self, project_id):
    client = pymongo.MongoClient(app.mongodb_uri)
    project = client.nav.projects.find_one({'_id': project_id})
    if project:

        def plural(n, singular, plural=None):
            text = '{} '.format(n)
            if n == 1:
                return text + singular
            else:
                return text + (plural if plural is not None else singular + 's')

        name = plural(len(project['gene_list']), 'Gene')
        n = len(project.get('files', []))
        if n > 0:
            name += ' + ' + plural(n, 'File')
        if project.get('include_neighbors'):
            name += ' + ' + (plural(project['n_hottest_neighbors'], 'Hottest Neighbor')
                             if project.get('do_heat_diffusion')
                             else plural(project['n_connected_neighbors'], 'Most Connected Neighbor'))

        # create new job
        job = {'project': project, 'name': name}
        set_status(job, 'pending')

        # add snapshot of successful file meta-data to job and reset status
        project['files'] = [
            it for it in client.files.meta.find({
                '_id': {'$in': [it['_id'] for it in project.get('files', [])]}
            }) if it['status'] == 'success'
        ]
        for file in project['files']:
            del file['status']
            del file['timestamp']

        job_id = client.nav.jobs.insert(job)
        log.info('submit project %s succeeded as job %s', project_id, job_id)

        # FIXME comment out the following line to run jobs in a separate thread
        # (uncomment it to enable debug breakpoints in the job)
        #run_job(job, self.event)

        # inform the job queue thread
        self.event.set()
    else:
        log.error('submit project failed because project %s was not found', project_id)
        raise LookupError()
def update_project():
    client = pymongo.MongoClient(app.mongodb_uri)
    project = request.json
    try:
        project = deserialize(project)  # convert string id to ObjectId
        set_status(project, 'updated')
        client.nav.projects.save(project)
        return {'timestamp': project['timestamp']['updated']}
    except Exception:
        return HTTPError(400)
def create_project():
    client = pymongo.MongoClient(app.mongodb_uri)
    project = {
        'gene_list': [],
        'include_neighbors': True,
        'n_connected_neighbors': 20,
        'n_hottest_neighbors': 20,
        'do_heat_diffusion': False
    }
    set_status(project, 'created')
    project['_id'] = str(client.nav.projects.insert(project))
    return serialize(project)
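# serialize() and deserialize() are referenced by the handlers above but not shown; a
# minimal sketch, assuming they only convert the Mongo ObjectId stored in '_id' to and
# from its string form so the document can round-trip through JSON. Handling of nested
# ids or datetime fields would be an extension of this and is not shown. ObjectId is
# assumed to be imported at module level, as in the other snippets.
def serialize(doc):
    # make '_id' JSON-friendly
    doc = dict(doc)
    doc['_id'] = str(doc['_id'])
    return doc

def deserialize(doc):
    # restore '_id' to an ObjectId for querying/saving
    doc = dict(doc)
    if '_id' in doc:
        doc['_id'] = ObjectId(doc['_id'])
    return doc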
def run_job(job, event):
    '''
    network analysis job, run as a separate process

    :param job: job object that contains a snapshot of the project
    :param event: used to inform parent thread of completion
    :return: all artifacts are saved to the database

    The job creates a network artifact, with values defined as follows:

    sources: {
        source1_id: source1,  # file meta-data object
        source2_id: source2
    }

    nodes: [
        {
            'id': id,
            source_id: {key: value, ...}  # data objects contain key/value pairs of node attributes from the different sources
        },
        ...
    ]

    edges: [
        {
            'source': source_id,
            'target': target_id,
            'source node': source_name,
            'target node': target_name,
            'type': type
        },
        ...
    ]
    '''
    job_query = {'_id': job['_id']}
    client = pymongo.MongoClient(app.mongodb_uri)
    project = job['project']

    # create a network artifact
    # nodes, edges, and groups are added to this artifact as part of the job
    artifact = {
        'project': job['project']['_id'],
        'job': job['_id'],
        'sources': {},
        'groups': {'node': [], 'edge': []}
    }

    # id_to_name = {}
    # for gene in project['gene_list']:
    #     id = gene['Ensembl Gene ID']
    #     name = gene['Gene Name']
    #     id_to_name[id] = name

    query_genes = [str(gene['Entrez Gene ID']) for gene in project['gene_list'] if 'Entrez Gene ID' in gene]
    meta = {it['_id']: it for it in client.networks.meta.find()}
    meta_ids = [id.encode('ascii') for id, value in project['networks'].iteritems() if value]

    if project.get('include_neighbors', False):
        query_genes = set(query_genes)
        # clean set
        # clean_query_genes = set()
        # for gene in query_genes:
        #     clean_query_genes.add(str(gene))
        # query_genes = clean_query_genes

        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding genes in one-step neighborhood'}})

        # find one-step neighborhood around query genes (ignoring duplicate edges)
        # originally, we found all edges using a single query:
        #   {'$or': [{'source': {'$in': query_genes}}, {'target': {'$in': query_genes}}], 'meta': {'$in': meta_ids}}
        # however, the approach below seems to be faster, perhaps due to the distinct operation
        degrees = defaultdict(int)
        for gene in query_genes:
            for target in client.networks.edges.find({'source': gene, 'meta': {'$in': meta_ids}}).distinct('target'):
                degrees[target] += 1
            for source in client.networks.edges.find({'target': gene, 'meta': {'$in': meta_ids}}).distinct('source'):
                degrees[source] += 1

        n = project['n_connected_neighbors']
        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

        # remove query genes from the degrees dictionary
        degrees = {key: value for key, value in degrees.iteritems() if key not in query_genes}

        # sort by descending value
        neighbors = [gene for gene, _ in sorted(degrees.items(), key=itemgetter(1), reverse=True)]

        # take top n neighbors
        neighbors = neighbors[:n]

        # final gene set contains the query genes and top n neighbors
        nodes = list(query_genes | set(neighbors))

        if project.get('do_heat_diffusion', False):
            # use heat diffusion algorithm to further refine the list of neighbors

            # find all edges within the one-step neighborhood
            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding edges for {} node one-step neighborhood'.format(len(nodes))}})
            edges = set()
            for edge in client.networks.edges.find({'source': {'$in': nodes}, 'target': {'$in': nodes}, 'meta': {'$in': meta_ids}}):
                edges.add(tuple(sorted((edge['source'], edge['target']))))

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion kernel for {} nodes and {} edges (this may take a while)'.format(len(nodes), len(edges))}})

            # calculate heat diffusion kernel (influence matrix)
            # THIS TAKES A LONG TIME
            kernel = SciPYKernel(edges)

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion result'}})

            weighted = 'heat_diffusion_weights' in project
            if weighted:
                try:
                    # w = {'file': <file meta object>, 'header': {'key': <column key>, ...}}
                    w = project['heat_diffusion_weights']
                    file = w['file']
                    id = str(file['_id'])
                    key = w['header']['key']

                    # vector = {name: |value|, ...}, where name is untranslated
                    # NOTE: we take the absolute value of the data because we can't have negative weights
                    # this works well for values like fold change, where we're interested in either large positive or negative changes
                    # any more sophisticated transforms should be provided in the input file
                    vector = {it[file['headers'][0]['key']]: math.fabs(float(it[key])) for it in client.files[id].find()}

                    # translate gene name to id
                    name_to_id = genemania.id_lookup_table(vector.keys())
                    vector = {name_to_id[key]: value for key, value in vector.iteritems()}

                    # reduce the vector to only the query genes
                    vector = {key: value for key, value in vector.iteritems() if key in query_genes}

                    # normalize weights
                    total = sum(vector.values())
                    vector = {key: value / total for key, value in vector.iteritems()}

                    # flag success if we made it this far
                    weighted = True
                except KeyError:
                    log.warning('weighted heat diffusion failed, reverting to uniform weights')

            if not weighted:
                # create uniform weighted heat vector (treats all query genes as equal point sources)
                vector = {gene: 1.0 / len(query_genes) for gene in query_genes}

            # calculate diffused heat
            metric = kernel.kernelMultiplyOne(vector)

            n = project.get('n_hottest_neighbors', 20)
            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

            # remove query genes from the metric
            metric = {key: value for key, value in metric.iteritems() if key not in query_genes}

            # sort by descending value
            neighbors = [gene for gene, _ in sorted(metric.items(), key=itemgetter(1), reverse=True)]

            # take top n neighbors
            neighbors = neighbors[:n]

            # final gene set contains the query genes and top n neighbors
            nodes = list(query_genes | set(neighbors))
    else:
        nodes = query_genes

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding final edge list for {} genes'.format(len(nodes))}})

    # final edge list
    query = {
        'source': {'$in': nodes},
        'target': {'$in': nodes},
        'meta': {'$in': meta_ids}
    }

    # final edge list contains all edges in the final gene set
    edges = list(client.networks.edges.find(query, ['source', 'target', 'meta']))

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating degree for {} genes from {} edges'.format(len(nodes), len(edges))}})

    # calculate degrees based on final edge list (do not count duplicates)
    degrees = defaultdict(int)
    for source, target in set(tuple(sorted((edge['source'], edge['target']))) for edge in edges):
        degrees[source] += 1
        degrees[target] += 1

    # Note: This is no longer relevant. Delete soon. -- David Welker
    # id_to_name = genemania.name_lookup_table(nodes)
    id_to_name = {}
    for node in nodes:
        r = requests.get('http://mygene.info/v2/gene/' + str(node) + '?fields=symbol')
        id_to_name[node] = r.json()['symbol']

    # create node objects
    def make_node(node):
        return {
            'id': node,
            'name': id_to_name[node],
            'query': node in query_genes,
            'degree': degrees[node]
        }

    # for convenience in merging data, nodes are initially stored as a dict keyed by id
    # it will be converted to a list of values later
    artifact['nodes'] = {node: make_node(node) for node in nodes}

    # create edge objects
    def make_edge(idx, edge):
        m = meta[edge['meta']]
        return {
            'id': 'e{}'.format(idx),
            'source': edge['source'],
            'target': edge['target'],
            'source name': id_to_name[edge['source']],
            'target name': id_to_name[edge['target']],
            'network collection': m.get('collection'),
            'network type': m.get('type'),
            'network source': m.get('source'),
            'network name': m.get('name')
        }

    artifact['edges'] = [make_edge(idx, e) for idx, e in enumerate(edges)]

    # add data from uploaded files
    for file in project['files']:
        client.nav.jobs.update(job_query, {'$set': {'status_message': 'adding node data from {}'.format(file['name'])}})
        source_id = str(file['_id'])
        artifact['sources'][source_id] = file

        # merge node data from file and add source info
        key = file['headers'][0]['key']
        for data in client.files[str(file['_id'])].find():
            id = genemania.lookup_id(data.pop(key))
            if id in artifact['nodes']:
                data.pop('_id')
                for header in file['headers']:
                    try:
                        value = data[header['key']]
                    except Exception:
                        continue
                    if header['datatype'] == 'numeric':
                        value = to_numeric(value)
                    elif header['datatype'] == 'boolean':
                        value = to_boolean(value)
                    data[header['key']] = value
                artifact['nodes'][id][source_id] = data

        set_status(file, 'success')
        client.nav.jobs.save(job)

    # convert nodes dictionaries to list of values
    artifact['nodes'] = artifact['nodes'].values()

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating gene set enrichment'}})

    # add query genes to node_groups
    artifact['groups']['node'].append({
        'id': 'query',
        'name': 'Query Genes',
        'description': 'Set of {} gene{} in the user query.'.format(len(query_genes), '' if len(query_genes) == 1 else 's'),
        'items': list(query_genes)
    })

    # do gene set enrichment
    gene_list = [node['id'] for node in artifact['nodes']]
    enriched = go.gene_set_enrichment(gene_list)[:20]  # get the top 20 go terms
    for it in enriched:
        artifact['groups']['node'].append({
            'id': it['go'].replace('GO:', 'go'),
            'name': it['name'],
            'description': it['def'],
            'items': it['overlap'],
            'count': it['n_genes'],
            'pvalue': it['pvalue'],
            'qvalue': it['qvalue']
        })

    # group edges by field
    def make_edge_groups(field):
        items = set(edge[field] for edge in artifact['edges'] if edge[field] is not None)
        for it in items:
            artifact['groups']['edge'].append({
                'name': it,
                'items': [e['id'] for e in artifact['edges'] if e[field] == it]
            })

    make_edge_groups('network collection')
    make_edge_groups('network type')

    set_status(artifact, 'created')
    id = client.nav.networks.insert(artifact)

    job['network'] = {
        '_id': id,
        'nodes': len(artifact['nodes']),
        'edges': len(artifact['edges'])
    }
    set_status(job, 'success')
    client.nav.jobs.save(job)

    # inform parent thread of job completion
    event.set()
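# to_numeric() and to_boolean() are used above when merging uploaded file data but are
# not shown in this listing; a minimal sketch, assuming they coerce raw string cell
# values and fall back to the original value when coercion fails. The accepted boolean
# spellings are an assumption.
def to_numeric(value):
    try:
        return float(value)
    except (TypeError, ValueError):
        return value

def to_boolean(value):
    if isinstance(value, basestring):
        return value.strip().lower() in ('true', 't', 'yes', 'y', '1')
    return bool(value)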
def run_job(job, event):
    '''
    network analysis job, run as a separate process

    :param job: job object that contains a snapshot of the project
    :param event: used to inform parent thread of completion
    :return: all artifacts are saved to the database

    The job creates a network artifact, with values defined as follows:

    sources: {
        source1_id: source1,  # file meta-data object
        source2_id: source2
    }

    nodes: [
        {
            'id': id,
            source_id: {key: value, ...}  # data objects contain key/value pairs of node attributes from the different sources
        },
        ...
    ]

    edges: [
        {
            'source': source_id,
            'target': target_id,
            'source node': source_name,
            'target node': target_name,
            'type': type
        },
        ...
    ]
    '''
    job_query = {'_id': job['_id']}
    client = pymongo.MongoClient(app.mongodb_uri)
    project = job['project']

    # create a network artifact
    # nodes, edges, and groups are added to this artifact as part of the job
    artifact = {
        'project': job['project']['_id'],
        'job': job['_id'],
        'sources': {},
        'groups': {'node': [], 'edge': []}
    }

    query_genes = [gene['Ensembl Gene ID'] for gene in project['gene_list'] if 'Ensembl Gene ID' in gene]
    meta = {it['_id']: it for it in client.networks.meta.find()}
    meta_ids = [ObjectId(id) for id, value in project['networks'].iteritems() if value]

    if project.get('include_neighbors', False):
        query_genes = set(query_genes)

        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding genes in one-step neighborhood'}})

        # find one-step neighborhood around query genes (ignoring duplicate edges)
        # originally, we found all edges using a single query:
        #   {'$or': [{'source': {'$in': query_genes}}, {'target': {'$in': query_genes}}], 'meta': {'$in': meta_ids}}
        # however, the approach below seems to be faster, perhaps due to the distinct operation
        degrees = defaultdict(int)
        for gene in query_genes:
            for target in client.networks.edges.find({'source': gene, 'meta': {'$in': meta_ids}}).distinct('target'):
                degrees[target] += 1
            for source in client.networks.edges.find({'target': gene, 'meta': {'$in': meta_ids}}).distinct('source'):
                degrees[source] += 1

        n = project['n_connected_neighbors']
        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

        # remove query genes from the degrees dictionary
        degrees = {key: value for key, value in degrees.iteritems() if key not in query_genes}

        # sort by descending value
        neighbors = [gene for gene, _ in sorted(degrees.items(), key=itemgetter(1), reverse=True)]

        # take top n neighbors
        neighbors = neighbors[:n]

        # final gene set contains the query genes and top n neighbors
        nodes = list(query_genes | set(neighbors))

        if project.get('do_heat_diffusion', False):
            # use heat diffusion algorithm to further refine the list of neighbors

            # find all edges within the one-step neighborhood
            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding edges for {} node one-step neighborhood'.format(len(nodes))}})
            edges = set()
            for edge in client.networks.edges.find({'source': {'$in': nodes}, 'target': {'$in': nodes}, 'meta': {'$in': meta_ids}}):
                edges.add(tuple(sorted((edge['source'], edge['target']))))

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion kernel for {} nodes and {} edges (this may take a while)'.format(len(nodes), len(edges))}})

            # calculate heat diffusion kernel (influence matrix)
            # THIS TAKES A LONG TIME
            kernel = SciPYKernel(edges)

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion result'}})

            weighted = 'heat_diffusion_weights' in project
            if weighted:
                try:
                    # w = {'file': <file meta object>, 'header': {'key': <column key>, ...}}
                    w = project['heat_diffusion_weights']
                    file = w['file']
                    id = str(file['_id'])
                    key = w['header']['key']

                    # vector = {name: |value|, ...}, where name is untranslated
                    # NOTE: we take the absolute value of the data because we can't have negative weights
                    # this works well for values like fold change, where we're interested in either large positive or negative changes
                    # any more sophisticated transforms should be provided in the input file
                    vector = {it[file['headers'][0]['key']]: math.fabs(float(it[key])) for it in client.files[id].find()}

                    # translate gene name to id
                    name_to_id = genemania.id_lookup_table(vector.keys())
                    vector = {name_to_id[key]: value for key, value in vector.iteritems()}

                    # reduce the vector to only the query genes
                    vector = {key: value for key, value in vector.iteritems() if key in query_genes}

                    # normalize weights
                    total = sum(vector.values())
                    vector = {key: value / total for key, value in vector.iteritems()}

                    # flag success if we made it this far
                    weighted = True
                except KeyError:
                    log.warning('weighted heat diffusion failed, reverting to uniform weights')

            if not weighted:
                # create uniform weighted heat vector (treats all query genes as equal point sources)
                vector = {gene: 1.0 / len(query_genes) for gene in query_genes}

            # calculate diffused heat
            metric = kernel.kernelMultiplyOne(vector)

            n = project.get('n_hottest_neighbors', 20)
            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

            # remove query genes from the metric
            metric = {key: value for key, value in metric.iteritems() if key not in query_genes}

            # sort by descending value
            neighbors = [gene for gene, _ in sorted(metric.items(), key=itemgetter(1), reverse=True)]

            # take top n neighbors
            neighbors = neighbors[:n]

            # final gene set contains the query genes and top n neighbors
            nodes = list(query_genes | set(neighbors))
    else:
        nodes = query_genes

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding final edge list for {} genes'.format(len(nodes))}})

    # final edge list
    query = {
        'source': {'$in': nodes},
        'target': {'$in': nodes},
        'meta': {'$in': meta_ids}
    }

    # final edge list contains all edges in the final gene set
    edges = list(client.networks.edges.find(query, ['source', 'target', 'meta']))

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating degree for {} genes from {} edges'.format(len(nodes), len(edges))}})

    # calculate degrees based on final edge list (do not count duplicates)
    degrees = defaultdict(int)
    for source, target in set(tuple(sorted((edge['source'], edge['target']))) for edge in edges):
        degrees[source] += 1
        degrees[target] += 1

    id_to_name = genemania.name_lookup_table(nodes)

    # create node objects
    def make_node(node):
        return {
            'id': node,
            'name': id_to_name[node],
            'query': node in query_genes,
            'degree': degrees[node]
        }

    # for convenience in merging data, nodes are initially stored as a dict keyed by id
    # it will be converted to a list of values later
    artifact['nodes'] = {node: make_node(node) for node in nodes}

    # create edge objects
    def make_edge(idx, edge):
        m = meta[edge['meta']]
        return {
            'id': 'e{}'.format(idx),
            'source': edge['source'],
            'target': edge['target'],
            'source name': id_to_name[edge['source']],
            'target name': id_to_name[edge['target']],
            'network collection': m.get('collection'),
            'network type': m.get('type'),
            'network source': m.get('source'),
            'network name': m.get('name')
        }

    artifact['edges'] = [make_edge(idx, e) for idx, e in enumerate(edges)]

    # add data from uploaded files
    for file in project['files']:
        client.nav.jobs.update(job_query, {'$set': {'status_message': 'adding node data from {}'.format(file['name'])}})
        source_id = str(file['_id'])
        artifact['sources'][source_id] = file

        # merge node data from file and add source info
        key = file['headers'][0]['key']
        for data in client.files[str(file['_id'])].find():
            id = genemania.lookup_id(data.pop(key))
            if id in artifact['nodes']:
                data.pop('_id')
                for header in file['headers']:
                    try:
                        value = data[header['key']]
                    except Exception:
                        continue
                    if header['datatype'] == 'numeric':
                        value = to_numeric(value)
                    elif header['datatype'] == 'boolean':
                        value = to_boolean(value)
                    data[header['key']] = value
                artifact['nodes'][id][source_id] = data

        set_status(file, 'success')
        client.nav.jobs.save(job)

    # convert nodes dictionaries to list of values
    artifact['nodes'] = artifact['nodes'].values()

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating gene set enrichment'}})

    # add query genes to node_groups
    artifact['groups']['node'].append({
        'id': 'query',
        'name': 'Query Genes',
        'description': 'Set of {} gene{} in the user query.'.format(len(query_genes), '' if len(query_genes) == 1 else 's'),
        'items': list(query_genes)
    })

    # do gene set enrichment
    gene_list = [node['id'] for node in artifact['nodes']]
    enriched = go.gene_set_enrichment(gene_list)[:20]  # get the top 20 go terms
    for it in enriched:
        artifact['groups']['node'].append({
            'id': it['go'].replace('GO:', 'go'),
            'name': it['name'],
            'description': it['def'],
            'items': it['overlap'],
            'count': it['n_genes'],
            'pvalue': it['pvalue'],
            'qvalue': it['qvalue']
        })

    # group edges by field
    def make_edge_groups(field):
        items = set(edge[field] for edge in artifact['edges'] if edge[field] is not None)
        for it in items:
            artifact['groups']['edge'].append({
                'name': it,
                'items': [e['id'] for e in artifact['edges'] if e[field] == it]
            })

    make_edge_groups('network collection')
    make_edge_groups('network type')

    set_status(artifact, 'created')
    log.debug('made it to create networks')
    id = client.nav.networks.insert(artifact)

    job['network'] = {
        '_id': id,
        'nodes': len(artifact['nodes']),
        'edges': len(artifact['edges'])
    }
    set_status(job, 'success')
    client.nav.jobs.save(job)

    # inform parent thread of job completion
    event.set()
def main():
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = Human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)
    columns = [
        'co-citation of worm gene',
        'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions',
        'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes',
        'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    metadata = {}
    for column in columns:
        m = {'collection': 'humannet', 'name': column, 'count': 0}
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())

    count = 0
    iterator = parse(columns, metadata, lines)
    while True:
        records = [record for record in islice(iterator, 1000)]
        if len(records) > 0:
            name_to_id = genemania.id_lookup_table(
                set(it['source'] for it in records) | set(it['target'] for it in records))
            for record in records:
                source = name_to_id[record['source']]
                if source is None:
                    log.warning('unknown source %s', record['source'])
                record['source'] = source
                target = name_to_id[record['target']]
                if target is None:
                    log.warning('unknown target %s', record['target'])
                record['target'] = target
            records = [record for record in records
                       if record['source'] is not None and record['target'] is not None]
            count += len(records)
            edges.insert_many(records)
            log.debug('inserted %d edges (%d total)', len(records), count)
        else:
            break

    for m in metadata.itervalues():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0
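# create_edges_index() and cleanup_edges() are called by the loaders but not shown; a
# minimal sketch, assuming the index covers the 'source', 'target', and 'meta' fields
# queried by run_job(), and that cleanup removes edges whose 'meta' no longer refers to
# an existing metadata document (the same criterion the GeneMANIA loader applies inline).
# The exact index definitions are assumptions.
def create_edges_index():
    edges = pymongo.MongoClient().networks.edges
    edges.create_index([('source', pymongo.ASCENDING), ('meta', pymongo.ASCENDING)])
    edges.create_index([('target', pymongo.ASCENDING), ('meta', pymongo.ASCENDING)])

def cleanup_edges():
    db = pymongo.MongoClient().networks
    valid_ids = [it['_id'] for it in db.meta.find()]
    result = db.edges.delete_many({'meta': {'$nin': valid_ids}})
    log.info('removed %d orphaned edges', result.deleted_count)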
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())[1:]  # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')
        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
        }
        if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:
            # old metadata records and their associated edges will be dropped after the new network is finished processing
            _ids = [result['_id'] for result in meta.find(metadata)]
            log.info('found %d matching network(s) that will be replaced: %s',
                     len(_ids), ', '.join([str(_id) for _id in _ids]))

            set_status(metadata, 'parsing')
            _id = meta.insert_one(metadata).inserted_id

            metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id)
            log.info('%s %s %s network has %d edges',
                     metadata['source'], metadata['name'], metadata['type'], metadata['count'])

            set_status(metadata, 'success')
            meta.save(metadata)

            if len(_ids) > 0:
                log.info('dropping old network metadata')
                meta.delete_many({'_id': {'$in': _ids}})
                log.info('dropping old edge data')
                edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})
    status.stop()

    return 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    if not args.id:
        client = pymongo.MongoClient()
        db = client.networks

        # collection stores metadata about source networks
        meta = db.meta

        # collection stores edge data
        edges = db.edges

        create_edges_index()

        url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
        log.info('reading network list from %s', url)
        r = requests.get(url)
        lines = list(r.iter_lines())[1:]  # ignore header line

        status = Status('networks', logger=log).n(len(lines)).start()
        for idx, line in enumerate(lines):
            status.log(idx)
            file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')
            metadata = {
                'collection': 'genemania',
                'type': network_group_name.lower(),
                'source': source,
                'name': network_name,
                'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
            }
            if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:
                # old metadata records and their associated edges will be dropped after the new network is finished processing
                _ids = [result['_id'] for result in meta.find(metadata)]
                log.info('found %d matching network(s) that will be replaced: %s',
                         len(_ids), ', '.join([str(_id) for _id in _ids]))

                set_status(metadata, 'parsing')
                _id = meta.insert_one(metadata).inserted_id

                metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
                log.info('%s %s %s network has %d edges',
                         metadata['source'], metadata['name'], metadata['type'], metadata['count'])

                set_status(metadata, 'success')
                meta.save(metadata)

                if len(_ids) > 0:
                    log.info('dropping old network metadata')
                    meta.delete_many({'_id': {'$in': _ids}})
                    cleanup_edges()
        status.stop()

    return 0
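# load_network() is referenced above but not shown; a minimal sketch, assuming each
# GeneMANIA network file is tab-separated with a header row, carries the interacting
# gene identifiers in its first two columns, and that edges are stored with the same
# 'source'/'target'/'meta' fields used elsewhere in this module. The column layout and
# batching details are assumptions.
def load_network(url, meta_id, batch=10000):
    edges = pymongo.MongoClient().networks.edges
    log.info('loading network from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())[1:]  # ignore header line
    count = 0
    records = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        records.append({'source': fields[0], 'target': fields[1], 'meta': meta_id})
        if len(records) >= batch:
            edges.insert_many(records)
            count += len(records)
            records = []
    if records:
        edges.insert_many(records)
        count += len(records)
    return count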
def put_state(id, state):
    client = pymongo.MongoClient(app.mongodb_uri)
    project = client.nav.projects.find_one({'_id': ObjectId(id)})
    project['state'] = state
    set_status(project, 'updated')
    client.nav.projects.save(project)