Example #1
    def run(self):
        while True:
            if len(multiprocessing.active_children()) < multiprocessing.cpu_count():
                # look for the next pending job
                client = pymongo.MongoClient(app.mongodb_uri)
                job = client.nav.jobs.find_one({'status': 'pending'})
                if job:
                    # start the job as a new process
                    set_status(job, 'processing')
                    client.nav.jobs.save(job)
                    try:
                        process = multiprocessing.Process(
                            name=self.process_name(job['_id']),
                            target=run_job,
                            args=(job, self.event))
                        process.start()
                        log.info('processing job %s', job['_id'])
                    except Exception as e:
                        set_status(job, 'error')
                        client.nav.jobs.save(job)
                        log.error('could not start job %s (%s)', job['_id'], e)

            # wait for events like new, cancelled, or finished jobs
            self.event.wait()
            self.event.clear()
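Note: run() above is a method on a job-queue worker whose class definition is not part of the example. A minimal sketch of how such a loop could be wired up, assuming a threading.Thread subclass that shares a threading.Event with submit() and cancel() (the class name, daemon flag, and process-name format below are illustrative, not taken from the source):

import threading

class JobQueue(threading.Thread):
    # illustrative wrapper only; the real class is not shown in these examples

    def __init__(self):
        super(JobQueue, self).__init__()
        self.daemon = True               # do not block interpreter shutdown
        self.event = threading.Event()   # set by submit/cancel and by finished jobs

    def process_name(self, job_id):
        # derive a recognizable, unique process name from the job id
        return 'job-{}'.format(job_id)

    # run(), submit(), and cancel() as shown in the surrounding examples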
Example #2
    def cancel(self, id):
        client = pymongo.MongoClient(app.mongodb_uri)
        job = client.nav.jobs.find_one({'_id': id})

        if job:
            log.info('cancelling job %s with status %s', id, job['status'])

            # look for job among active processes
            for process in multiprocessing.active_children():
                if process.name == self.process_name(id):
                    # send terminate signal to process and wait for it to finish
                    log.info('terminating job %s', id)
                    process.terminate()

                    log.info('waiting for job %s to terminate', id)
                    process.join()

                    log.info('job %s terminated', id)

                    # inform the job queue thread
                    self.event.set()

            set_status(job, 'cancelled')
            client.nav.jobs.save(job)

        else:
            log.error('cancel job failed because job %s was not found', id)
            raise LookupError
Example #3
def save_network():
    client = pymongo.MongoClient(app.mongodb_uri)
    network = request.json
    network = deserialize(network)  # convert string id to ObjectId
    set_status(network, 'updated')
    client.nav.networks.save(network)
    return {'timestamp': network['timestamp']['updated']}
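Note: set_status() is used throughout these examples but never shown. Judging from the line above that reads network['timestamp']['updated'] right after set_status(network, 'updated'), it appears to stamp both a status string and a per-status timestamp onto the document. A minimal sketch under that assumption (the exact field layout is a guess):

import datetime

def set_status(doc, status):
    # hypothetical helper: record the current status and when it was reached
    doc['status'] = status
    doc.setdefault('timestamp', {})[status] = datetime.datetime.utcnow()
    return doc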
Example #4
    def submit(self, project_id):
        client = pymongo.MongoClient(app.mongodb_uri)
        project = client.nav.projects.find_one({'_id': project_id})

        if project:

            def plural(n, singular, plural=None):
                # avoid shadowing the builtin str; prefix is the count, e.g. '5 '
                prefix = '{} '.format(n)
                if n == 1:
                    return prefix + singular
                return prefix + (plural if plural is not None else singular + 's')

            name = plural(len(project['gene_list']), 'Gene')

            n = len(project.get('files', []))
            if n > 0:
                name += ' + ' + plural(n, 'File')

            if project.get('include_neighbors'):
                name += ' + ' + (plural(project['n_hottest_neighbors'],
                                        'Hottest Neighbor') if
                                 project.get('do_heat_diffusion') else plural(
                                     project['n_connected_neighbors'],
                                     'Most Connected Neighbor'))

            # create new job
            job = {'project': project, 'name': name}
            set_status(job, 'pending')

            # add snapshot of successful file meta-data to job and reset status
            project['files'] = [
                it for it in client.files.meta.find({
                    '_id': {
                        '$in': [it['_id'] for it in project.get('files', [])]
                    }
                }) if it['status'] == 'success'
            ]
            for file in project['files']:
                del file['status']
                del file['timestamp']

            job_id = client.nav.jobs.insert(job)

            log.info('submit project %s succeeded as job %s', project_id,
                     job_id)

            # FIXME keep the following line commented out to run jobs as separate processes; uncomment it to run the job inline so debugger breakpoints work
            #run_job(job, self.event)

            # inform the job queue thread
            self.event.set()
        else:
            log.error('submit project failed because project %s was not found',
                      project_id)
            raise LookupError()
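For reference, the nested plural() helper above only builds the human-readable job name. A standalone copy, just to illustrate its behavior:

def plural(n, singular, plural=None):
    prefix = '{} '.format(n)
    if n == 1:
        return prefix + singular
    return prefix + (plural if plural is not None else singular + 's')

assert plural(1, 'Gene') == '1 Gene'
assert plural(5, 'Gene') == '5 Genes'
assert plural(20, 'Most Connected Neighbor') == '20 Most Connected Neighbors'

A typical job name therefore looks like '12 Genes + 1 File + 20 Most Connected Neighbors'.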
Example #5
def update_project():
    client = pymongo.MongoClient(app.mongodb_uri)
    project = request.json
    try:
        project = deserialize(project)  # convert string id to ObjectId
        set_status(project, 'updated')
        client.nav.projects.save(project)
        return {'timestamp': project['timestamp']['updated']}
    except Exception:
        return HTTPError(400)
Example #6
def create_project():
    client = pymongo.MongoClient(app.mongodb_uri)
    project = {
        'gene_list': [],
        'include_neighbors': True,
        'n_connected_neighbors': 20,
        'n_hottest_neighbors': 20,
        'do_heat_diffusion': False
    }
    set_status(project, 'created')
    project['_id'] = str(client.nav.projects.insert(project))
    return serialize(project)
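Note: serialize() and deserialize() are not included in these examples; the comments ('convert string id to ObjectId') suggest they translate between the BSON ObjectId stored by MongoDB and the plain string id used in JSON responses. A minimal sketch under that assumption (handling of fields beyond _id is omitted):

from bson import ObjectId

def serialize(doc):
    # hypothetical helper: make the document JSON-friendly
    doc['_id'] = str(doc['_id'])
    return doc

def deserialize(doc):
    # hypothetical helper: restore the ObjectId so MongoDB lookups match
    doc['_id'] = ObjectId(doc['_id'])
    return doc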
Example #7
    def submit(self, project_id):
        client = pymongo.MongoClient(app.mongodb_uri)
        project = client.nav.projects.find_one({'_id': project_id})

        if project:

            def plural(n, singular, plural=None):
                # avoid shadowing the builtin str; prefix is the count, e.g. '5 '
                prefix = '{} '.format(n)
                if n == 1:
                    return prefix + singular
                return prefix + (plural if plural is not None else singular + 's')

            name = plural(len(project['gene_list']), 'Gene')


            n = len(project.get('files', []))
            if n > 0:
                name += ' + ' + plural(n, 'File')

            if project.get('include_neighbors'):
                name += ' + ' + (plural(project['n_hottest_neighbors'], 'Hottest Neighbor') if project.get('do_heat_diffusion') else plural(project['n_connected_neighbors'], 'Most Connected Neighbor'))

            # create new job
            job = {
                'project': project,
                'name': name
            }
            set_status(job, 'pending')

            # add snapshot of successful file meta-data to job and reset status
            project['files'] = [it for it in client.files.meta.find({'_id': {'$in': [it['_id'] for it in project.get('files', [])]}}) if it['status'] == 'success']
            for file in project['files']:
                del file['status']
                del file['timestamp']

            job_id = client.nav.jobs.insert(job)

            log.info('submit project %s succeeded as job %s', project_id, job_id)

            # FIXME keep the following line commented out to run jobs as separate processes; uncomment it to run the job inline so debugger breakpoints work
            #run_job(job, self.event)

            # inform the job queue thread
            self.event.set()
        else:
            log.error('submit project failed because project %s was not found', project_id)
            raise LookupError()
Example #8
    def run(self):
        while True:
            if len(multiprocessing.active_children()) < multiprocessing.cpu_count():
                # look for the next pending job
                client = pymongo.MongoClient(app.mongodb_uri)
                job = client.nav.jobs.find_one({'status': 'pending'})
                if job:
                    # start the job as a new process
                    set_status(job, 'processing')
                    client.nav.jobs.save(job)
                    try:
                        process = multiprocessing.Process(name=self.process_name(job['_id']), target=run_job, args=(job, self.event))
                        process.start()
                        log.info('processing job %s', job['_id'])
                    except Exception as e:
                        set_status(job, 'error')
                        client.nav.jobs.save(job)
                        log.error('could not start job %s (%s)', job['_id'], e)

            # wait for events like new, cancelled, or finished jobs
            self.event.wait()
            self.event.clear()
Example #9
def run_job(job, event):
    ''' network analysis job, run as a separate process

    :param job: job object that contains a snapshot of the project
    :param event: used to inform parent thread of completion
    :return: all artifacts are saved to the database

    The job creates a network artifact, with values defined as follows:

    sources:
    {
       source1_id: source1, # file meta-data object
       source2_id: source2
    }

    nodes:
    [
       {
          'id': id,
          source_id: {key: value, ...} # data objects contain key/value pairs of node attributes from the different sources
       },
       ...
    ]

    edges:
    [
       {
          'id': edge_id,
          'source': source_id,
          'target': target_id,
          'source name': source_name,
          'target name': target_name,
          'network collection': collection,
          'network type': type
       },
       ...
    ]
    '''

    job_query = {'_id': job['_id']}

    client = pymongo.MongoClient(app.mongodb_uri)
    project = job['project']

    # create a network artifact
    # nodes, edges, and groups are added to this artifact as part of the job
    artifact = {
        'project': job['project']['_id'],
        'job': job['_id'],
        'sources': {},
        'groups': {
            'node': [],
            'edge': []
        }
    }

    # id_to_name = {}
    # for gene in project['gene_list']:
    #     id = gene['Ensembl Gene ID']
    #     name = gene['Gene Name']
    #     id_to_name[id] = name


    query_genes = [str(gene['Entrez Gene ID']) for gene in project['gene_list'] if 'Entrez Gene ID' in gene]

    meta = {it['_id']: it for it in client.networks.meta.find()}

    meta_ids = [id.encode("ascii") for id, value in project['networks'].iteritems() if value]

    if project.get('include_neighbors', False):

        query_genes = set(query_genes)

        #clean set.
        # clean_query_genes = set()
        # for gene in query_genes:
        #     clean_query_genes.add( str(gene) )
        # query_genes = clean_query_genes

        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding genes in one-step neighborhood'}})

        # find one-step neighborhood around query genes (ignoring duplicate edges)
        # originally, we found all edges using a single query {'$or': [{'source': {'$in': query_genes}}, {'target': {'$in': query_genes}}], 'meta': {'$in': meta_ids}}
        # however, the approach below seems to be faster, perhaps due to the distinct operation
        degrees = defaultdict(int)
        for gene in query_genes:
            for target in client.networks.edges.find({'source': gene, 'meta': {'$in': meta_ids}}).distinct('target'):
                degrees[target] += 1
            for source in client.networks.edges.find({'target': gene, 'meta': {'$in': meta_ids}}).distinct('source'):
                degrees[source] += 1

        n = project['n_connected_neighbors']

        client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

        # remove query genes from the degrees dictionary
        degrees = {key: value for key, value in degrees.iteritems() if key not in query_genes}

        # sort by descending value
        neighbors = [gene for gene, _ in sorted(degrees.items(), key=itemgetter(1), reverse=True)]

        # take top n neighbors
        neighbors = neighbors[:n]

        # final gene set contains the query genes and top n neighbors
        nodes = list(query_genes | set(neighbors))

        if project.get('do_heat_diffusion', False):
            # use heat diffusion algorithm to further refine the list of neighbors

            # find all edges within the one-step neighborhood
            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding edges for {} node one-step neighborhood'.format(len(nodes))}})
            edges = set()
            for edge in client.networks.edges.find({'source': {'$in': nodes}, 'target': {'$in': nodes}, 'meta': {'$in': meta_ids}}):
                edges.add(tuple(sorted((edge['source'], edge['target']))))

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion kernel for {} nodes and {} edges (this may take a while)'.format(len(nodes), len(edges))}})

            # calculate heat diffusion kernel (influence matrix)
            # THIS TAKES A LONG TIME
            kernel = SciPYKernel(edges)

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating heat diffusion result'}})

            weighted = 'heat_diffusion_weights' in project

            if weighted:
                try:
                    # w = {'file': <file meta object>, 'header': {'key': <column key>, ...}}
                    w = project['heat_diffusion_weights']
                    file = w['file']
                    id = str(file['_id'])
                    key = w['header']['key']

                    # vector = {name: |value|, ...}, where name is untranslated
                    # NOTE: we take the absolute value of the data because we can't have negative weights
                    #       this works well for values like fold change, where we're interested in either large positive or negative changes
                    #       any more sophisticated transforms should be provided in the input file
                    vector = {it[file['headers'][0]['key']]: math.fabs(float(it[key])) for it in client.files[id].find()}

                    # translate gene name to id
                    name_to_id = genemania.id_lookup_table(vector.keys())
                    vector = {name_to_id[key]: value for key, value in vector.iteritems()}

                    # reduce the vector to only the query genes
                    vector = {key: value for key, value in vector.iteritems() if key in query_genes}

                    # normalize weights
                    total = sum(vector.values())
                    vector = {key: value / total for key, value in vector.iteritems()}

                    # flag success if we made it this far
                    weighted = True

                except KeyError:
                    log.warning('weighted heat diffusion failed, reverting to uniform weights')

            if not weighted:
                # create uniform weighted heat vector (treats all query genes as equal point sources)
                vector = {gene: 1.0 / len(query_genes) for gene in query_genes}

            # calculate diffused heat
            metric = kernel.kernelMultiplyOne(vector)

            n = project.get('n_hottest_neighbors', 20)

            client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding top {} neighbors'.format(n)}})

            # remove query genes from the metric
            metric = {key: value for key, value in metric.iteritems() if key not in query_genes}

            # sort by descending value
            neighbors = [gene for gene, _ in sorted(metric.items(), key=itemgetter(1), reverse=True)]

            # take top n neighbors
            neighbors = neighbors[:n]

            # final gene set contains the query genes and top n neighbors
            nodes = list(query_genes | set(neighbors))

    else:
        nodes = query_genes

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'finding final edge list for {} genes'.format(len(nodes))}})

    # final edge list
    query = {
        'source': {'$in': nodes},
        'target': {'$in': nodes},
        'meta': {'$in': meta_ids}
    }

    # final edge list contains all edges in the final gene set
    edges = list(client.networks.edges.find(query, ['source', 'target', 'meta']))

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating degree for {} genes from {} edges'.format(len(nodes), len(edges))}})

    # calculate degrees based on final edge list (do not count duplicates)
    degrees = defaultdict(int)
    for source, target in set(tuple(sorted((edge['source'], edge['target']))) for edge in edges):
        degrees[source] += 1
        degrees[target] += 1

    # Note: This is no longer relevant. Delete soon. -- David Welker
    # id_to_name = genemania.name_lookup_table(nodes)
    id_to_name = {}
    for node in nodes:
        r = requests.get('http://mygene.info/v2/gene/'+str(node)+'?fields=symbol')
        id_to_name[node] = r.json()['symbol']


    # create node objects
    def make_node(node):
        return {
            'id': node,
            'name': id_to_name[node],
            'query': node in query_genes,
            'degree': degrees[node]
        }

    # for convenience in merging data, nodes are initially stored as a dict keyed by id
    # it will be converted to a list of values later
    artifact['nodes'] = {node: make_node(node) for node in nodes}

    # create edge objects
    def make_edge(idx, edge):
        m = meta[edge['meta']]
        return {
            'id': 'e{}'.format(idx),
            'source': edge['source'],
            'target': edge['target'],
            'source name': id_to_name[edge['source']],
            'target name': id_to_name[edge['target']],
            'network collection': m.get('collection'),
            'network type': m.get('type'),
            'network source': m.get('source'),
            'network name': m.get('name')
        }

    artifact['edges'] = [make_edge(idx, e) for idx, e in enumerate(edges)]

    # add data from uploaded files
    for file in project['files']:
        client.nav.jobs.update(job_query, {'$set': {'status_message': 'adding node data from {}'.format(file['name'])}})

        source_id = str(file['_id'])
        artifact['sources'][source_id] = file

        # merge node data from file and add source info
        key = file['headers'][0]['key']
        for data in client.files[str(file['_id'])].find():
            id = genemania.lookup_id(data.pop(key))
            if id in artifact['nodes']:
                data.pop('_id')
                for header in file['headers']:
                    try:
                        value = data[header['key']]
                    except Exception:
                        continue

                    if header['datatype'] == 'numeric':
                        value = to_numeric(value)
                    elif header['datatype'] == 'boolean':
                        value = to_boolean(value)

                    data[header['key']] = value

                artifact['nodes'][id][source_id] = data

        set_status(file, 'success')
        client.nav.jobs.save(job)

    # convert nodes dictionaries to list of values
    artifact['nodes'] = artifact['nodes'].values()

    client.nav.jobs.update(job_query, {'$set': {'status_message': 'calculating gene set enrichment'}})

    # add query genes to node_groups
    artifact['groups']['node'].append({
        'id': 'query',
        'name': 'Query Genes',
        'description': 'Set of {} gene{} in the user query.'.format(len(query_genes), '' if len(query_genes) == 1 else 's'),
        'items': list(query_genes)
    })

    # do gene set enrichment
    gene_list = [node['id'] for node in artifact['nodes']]
    enriched = go.gene_set_enrichment(gene_list)[:20]  # get the top 20 go terms

    for it in enriched:
        artifact['groups']['node'].append({
            'id': it['go'].replace('GO:', 'go'),
            'name': it['name'],
            'description': it['def'],
            'items': it['overlap'],
            'count': it['n_genes'],
            'pvalue': it['pvalue'],
            'qvalue': it['qvalue']
        })

    # group edges by field
    def make_edge_groups(field):
        items = set(edge[field] for edge in artifact['edges'] if edge[field] is not None)
        for it in items:
            artifact['groups']['edge'].append({
                'name': it,
                'items': [e['id'] for e in artifact['edges'] if e[field] == it]
            })

    make_edge_groups('network collection')
    make_edge_groups('network type')

    set_status(artifact, 'created')
    id = client.nav.networks.insert(artifact)

    job['network'] = {
        '_id': id,
        'nodes': len(artifact['nodes']),
        'edges': len(artifact['edges'])
    }

    set_status(job, 'success')
    client.nav.jobs.save(job)

    # inform parent thread of job completion
    event.set()
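Note: to_numeric() and to_boolean(), used when merging uploaded file data into the node objects, are not shown in these examples either. A minimal sketch of what such converters might look like, assuming they fall back to the raw value when conversion fails:

def to_numeric(value):
    # hypothetical converter: prefer int, fall back to float, else keep the raw value
    try:
        return int(value)
    except (TypeError, ValueError):
        try:
            return float(value)
        except (TypeError, ValueError):
            return value

def to_boolean(value):
    # hypothetical converter: map common truthy spellings to True
    return str(value).strip().lower() in ('1', 'true', 'yes', 'y')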
Example #10
def run_job(job, event):
    ''' network analysis job, run as a separate process

    :param job: job object that contains a snapshot of the project
    :param event: used to inform parent thread of completion
    :return: all artifacts are saved to the database

    The job creates a network artifact, with values defined as follows:

    sources:
    {
       source1_id: source1, # file meta-data object
       source2_id: source2
    }

    nodes:
    [
       {
          'id': id,
          source_id: {key: value, ...} # data objects contain key/value pairs of node attributes from the different sources
       },
       ...
    ]

    edges:
    [
       {
          'id': edge_id,
          'source': source_id,
          'target': target_id,
          'source name': source_name,
          'target name': target_name,
          'network collection': collection,
          'network type': type
       },
       ...
    ]
    '''

    job_query = {'_id': job['_id']}

    client = pymongo.MongoClient(app.mongodb_uri)
    project = job['project']

    # create a network artifact
    # nodes, edges, and groups are added to this artifact as part of the job
    artifact = {
        'project': job['project']['_id'],
        'job': job['_id'],
        'sources': {},
        'groups': {
            'node': [],
            'edge': []
        }
    }

    query_genes = [
        gene['Ensembl Gene ID'] for gene in project['gene_list']
        if 'Ensembl Gene ID' in gene
    ]

    meta = {it['_id']: it for it in client.networks.meta.find()}

    meta_ids = [
        ObjectId(id) for id, value in project['networks'].iteritems() if value
    ]

    if project.get('include_neighbors', False):

        query_genes = set(query_genes)

        client.nav.jobs.update(job_query, {
            '$set': {
                'status_message': 'finding genes in one-step neighborhood'
            }
        })

        # find one-step neighborhood around query genes (ignoring duplicate edges)
        # originally, we found all edges using a single query {'$or': [{'source': {'$in': query_genes}}, {'target': {'$in': query_genes}}], 'meta': {'$in': meta_ids}}
        # however, the approach below seems to be faster, perhaps due to the distinct operation
        degrees = defaultdict(int)
        for gene in query_genes:
            for target in client.networks.edges.find({
                    'source': gene,
                    'meta': {
                        '$in': meta_ids
                    }
            }).distinct('target'):
                degrees[target] += 1
            for source in client.networks.edges.find({
                    'target': gene,
                    'meta': {
                        '$in': meta_ids
                    }
            }).distinct('source'):
                degrees[source] += 1

        n = project['n_connected_neighbors']

        client.nav.jobs.update(
            job_query,
            {'$set': {
                'status_message': 'finding top {} neighbors'.format(n)
            }})

        # remove query genes from the degrees dictionary
        degrees = {
            key: value
            for key, value in degrees.iteritems() if key not in query_genes
        }

        # sort by descending value
        neighbors = [
            gene for gene, _ in sorted(
                degrees.items(), key=itemgetter(1), reverse=True)
        ]

        # take top n neighbors
        neighbors = neighbors[:n]

        # final gene set contains the query genes and top n neighbors
        nodes = list(query_genes | set(neighbors))

        if project.get('do_heat_diffusion', False):
            # use heat diffusion algorithm to further refine the list of neighbors

            # find all edges within the one-step neighborhood
            client.nav.jobs.update(
                job_query, {
                    '$set': {
                        'status_message':
                        'finding edges for {} node one-step neighborhood'.
                        format(len(nodes))
                    }
                })
            edges = set()
            for edge in client.networks.edges.find({
                    'source': {
                        '$in': nodes
                    },
                    'target': {
                        '$in': nodes
                    },
                    'meta': {
                        '$in': meta_ids
                    }
            }):
                edges.add(tuple(sorted((edge['source'], edge['target']))))

            client.nav.jobs.update(
                job_query, {
                    '$set': {
                        'status_message':
                        'calculating heat diffusion kernel for {} nodes and {} edges (this may take a while)'
                        .format(len(nodes), len(edges))
                    }
                })

            # calculate heat diffusion kernel (influence matrix)
            # THIS TAKES A LONG TIME
            kernel = SciPYKernel(edges)

            client.nav.jobs.update(job_query, {
                '$set': {
                    'status_message': 'calculating heat diffusion result'
                }
            })

            weighted = 'heat_diffusion_weights' in project

            if weighted:
                try:
                    # w = {'file': <file meta object>, 'header': {'key': <column key>, ...}}
                    w = project['heat_diffusion_weights']
                    file = w['file']
                    id = str(file['_id'])
                    key = w['header']['key']

                    # vector = {name: |value|, ...}, where name is untranslated
                    # NOTE: we take the absolute value of the data because we can't have negative weights
                    #       this works well for values like fold change, where we're interested in either large positive or negative changes
                    #       any more sophisticated transforms should be provided in the input file
                    vector = {
                        it[file['headers'][0]['key']]:
                        math.fabs(float(it[key]))
                        for it in client.files[id].find()
                    }

                    # translate gene name to id
                    name_to_id = genemania.id_lookup_table(vector.keys())
                    vector = {
                        name_to_id[key]: value
                        for key, value in vector.iteritems()
                    }

                    # reduce the vector to only the query genes
                    vector = {
                        key: value
                        for key, value in vector.iteritems()
                        if key in query_genes
                    }

                    # normalize weights
                    total = sum(vector.values())
                    vector = {
                        key: value / total
                        for key, value in vector.iteritems()
                    }

                    # flag success if we made it this far
                    weighted = True

                except KeyError:
                    log.warning(
                        'weighted heat diffusion failed, reverting to uniform weights'
                    )

            if not weighted:
                # create uniform weighted heat vector (treats all query genes as equal point sources)
                vector = {gene: 1.0 / len(query_genes) for gene in query_genes}

            # calculate diffused heat
            metric = kernel.kernelMultiplyOne(vector)

            n = project.get('n_hottest_neighbors', 20)

            client.nav.jobs.update(job_query, {
                '$set': {
                    'status_message': 'finding top {} neighbors'.format(n)
                }
            })

            # remove query genes from the metric
            metric = {
                key: value
                for key, value in metric.iteritems() if key not in query_genes
            }

            # sort by descending value
            neighbors = [
                gene for gene, _ in sorted(
                    metric.items(), key=itemgetter(1), reverse=True)
            ]

            # take top n neighbors
            neighbors = neighbors[:n]

            # final gene set contains the query genes and top n neighbors
            nodes = list(query_genes | set(neighbors))

    else:
        nodes = query_genes

    client.nav.jobs.update(
        job_query, {
            '$set': {
                'status_message':
                'finding final edge list for {} genes'.format(len(nodes))
            }
        })

    # final edge list
    query = {
        'source': {
            '$in': nodes
        },
        'target': {
            '$in': nodes
        },
        'meta': {
            '$in': meta_ids
        }
    }

    # final edge list contains all edges in the final gene set
    edges = list(
        client.networks.edges.find(query, ['source', 'target', 'meta']))

    client.nav.jobs.update(
        job_query, {
            '$set': {
                'status_message':
                'calculating degree for {} genes from {} edges'.format(
                    len(nodes), len(edges))
            }
        })

    # calculate degrees based on final edge list (do not count duplicates)
    degrees = defaultdict(int)
    for source, target in set(
            tuple(sorted((edge['source'], edge['target']))) for edge in edges):
        degrees[source] += 1
        degrees[target] += 1

    id_to_name = genemania.name_lookup_table(nodes)

    # create node objects
    def make_node(node):
        return {
            'id': node,
            'name': id_to_name[node],
            'query': node in query_genes,
            'degree': degrees[node]
        }

    # for convenience in merging data, nodes are initially stored as a dict keyed by id
    # it will be converted to a list of values later
    artifact['nodes'] = {node: make_node(node) for node in nodes}

    # create edge objects
    def make_edge(idx, edge):
        m = meta[edge['meta']]
        return {
            'id': 'e{}'.format(idx),
            'source': edge['source'],
            'target': edge['target'],
            'source name': id_to_name[edge['source']],
            'target name': id_to_name[edge['target']],
            'network collection': m.get('collection'),
            'network type': m.get('type'),
            'network source': m.get('source'),
            'network name': m.get('name')
        }

    artifact['edges'] = [make_edge(idx, e) for idx, e in enumerate(edges)]

    # add data from uploaded files
    for file in project['files']:
        client.nav.jobs.update(
            job_query, {
                '$set': {
                    'status_message':
                    'adding node data from {}'.format(file['name'])
                }
            })

        source_id = str(file['_id'])
        artifact['sources'][source_id] = file

        # merge node data from file and add source info
        key = file['headers'][0]['key']
        for data in client.files[str(file['_id'])].find():
            id = genemania.lookup_id(data.pop(key))
            if id in artifact['nodes']:
                data.pop('_id')
                for header in file['headers']:
                    try:
                        value = data[header['key']]
                    except Exception:
                        continue

                    if header['datatype'] == 'numeric':
                        value = to_numeric(value)
                    elif header['datatype'] == 'boolean':
                        value = to_boolean(value)

                    data[header['key']] = value

                artifact['nodes'][id][source_id] = data

        set_status(file, 'success')
        client.nav.jobs.save(job)

    # convert nodes dictionaries to list of values
    artifact['nodes'] = artifact['nodes'].values()

    client.nav.jobs.update(
        job_query,
        {'$set': {
            'status_message': 'calculating gene set enrichment'
        }})

    # add query genes to node_groups
    artifact['groups']['node'].append({
        'id': 'query',
        'name': 'Query Genes',
        'description': 'Set of {} gene{} in the user query.'.format(
            len(query_genes), '' if len(query_genes) == 1 else 's'),
        'items': list(query_genes)
    })

    # do gene set enrichment
    gene_list = [node['id'] for node in artifact['nodes']]
    enriched = go.gene_set_enrichment(gene_list)[:20]  # get the top 20 go terms

    for it in enriched:
        artifact['groups']['node'].append({
            'id': it['go'].replace('GO:', 'go'),
            'name': it['name'],
            'description': it['def'],
            'items': it['overlap'],
            'count': it['n_genes'],
            'pvalue': it['pvalue'],
            'qvalue': it['qvalue']
        })

    # group edges by field
    def make_edge_groups(field):
        items = set(edge[field] for edge in artifact['edges']
                    if edge[field] is not None)
        for it in items:
            artifact['groups']['edge'].append({
                'name': it,
                'items': [e['id'] for e in artifact['edges'] if e[field] == it]
            })

    make_edge_groups('network collection')
    make_edge_groups('network type')

    set_status(artifact, 'created')
    log.debug('made it to create networks')
    id = client.nav.networks.insert(artifact)

    job['network'] = {
        '_id': id,
        'nodes': len(artifact['nodes']),
        'edges': len(artifact['edges'])
    }

    set_status(job, 'success')
    client.nav.jobs.save(job)

    # inform parent thread of job completion
    event.set()
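Both versions of run_job() count node degree from the final edge list while ignoring duplicate and reversed edges, by normalizing each edge to a sorted (source, target) tuple and collecting the tuples in a set. The same idiom in isolation, with made-up data:

from collections import defaultdict

edges = [
    {'source': 'a', 'target': 'b'},
    {'source': 'b', 'target': 'a'},   # reversed duplicate of the edge above
    {'source': 'b', 'target': 'c'},
]

degrees = defaultdict(int)
for source, target in set(tuple(sorted((e['source'], e['target']))) for e in edges):
    degrees[source] += 1
    degrees[target] += 1

assert dict(degrees) == {'a': 1, 'b': 2, 'c': 1}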
Example #11
def main():
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    columns = [
        'co-citation of worm gene', 'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions', 'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes', 'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    metadata = {}

    for column in columns:
        m = {'collection': 'humannet', 'name': column, 'count': 0}
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())

    count = 0

    iterator = parse(columns, metadata, lines)
    while True:
        records = [record for record in islice(iterator, 1000)]
        if len(records) > 0:
            name_to_id = genemania.id_lookup_table(
                set(it['source'] for it in records) | set(it['target']
                                                          for it in records))
            for record in records:
                source = name_to_id[record['source']]
                if source is None:
                    log.warning('unknown source %s', record['source'])
                record['source'] = source

                target = name_to_id[record['target']]
                if target is None:
                    log.warning('unknown target %s', record['target'])
                record['target'] = target

            records = [
                record for record in records if record['source'] is not None
                and record['target'] is not None
            ]
            count += len(records)
            edges.insert_many(records)
            log.debug('inserted %d edges (%d total)', len(records), count)
        else:
            break

    for m in metadata.itervalues():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0
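The loader above drains the parse() generator in fixed-size batches with itertools.islice so that insert_many() is called with at most 1000 records at a time. The same pattern in isolation:

from itertools import islice

def batches(iterator, size):
    # yield lists of up to `size` items until the iterator is exhausted
    while True:
        batch = list(islice(iterator, size))
        if not batch:
            break
        yield batch

assert list(batches(iter(range(10)), 4)) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]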
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())[1:] # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
        }

        if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:

            # old metadata records and their associated edges will be dropped after the new network is finished processing
            _ids = [result['_id'] for result in meta.find(metadata)]
            log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

            set_status(metadata, 'parsing')
            _id = meta.insert_one(metadata).inserted_id

            metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id)
            log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

            set_status(metadata, 'success')
            meta.save(metadata)

            if len(_ids) > 0:
                log.info('dropping old network metadata')
                meta.delete_many({'_id': {'$in': _ids}})

    log.info('dropping old edge data')
    edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})

    status.stop()
    return 0
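Note: the warmstart check above relies on dict(metadata.items() + [('status', 'success')]), which concatenates two lists and therefore only works on Python 2, where dict.items() returns a list (the module is Python 2 code throughout, e.g. iteritems). If it were ever ported, an equivalent that behaves the same on both versions is to build the merged query explicitly (values here are illustrative):

metadata = {'collection': 'identifiers', 'type': 'coexp', 'source': 'GEO', 'name': 'example'}

# merged query that works on Python 2 and 3
query = dict(metadata)
query['status'] = 'success'
# query can then be passed to meta.find_one(query) as in the example above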
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    if not args.id:

        client = pymongo.MongoClient()
        db = client.networks

        # collection stores metadata about source networks
        meta = db.meta

        # collection stores edge data
        edges = db.edges

        create_edges_index()

        url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
        log.info('reading network list from %s', url)
        r = requests.get(url)
        lines = list(r.iter_lines())[1:] # ignore header line

        status = Status('networks', logger=log).n(len(lines)).start()
        for idx, line in enumerate(lines):
            status.log(idx)
            file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

            metadata = {
                'collection': 'genemania',
                'type': network_group_name.lower(),
                'source': source,
                'name': network_name,
                'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
            }

            if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:

                # old metadata records and their associated edges will be dropped after the new network is finished processing
                _ids = [result['_id'] for result in meta.find(metadata)]
                log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

                set_status(metadata, 'parsing')
                _id = meta.insert_one(metadata).inserted_id

                metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
                log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

                set_status(metadata, 'success')
                meta.save(metadata)

                if len(_ids) > 0:
                    log.info('dropping old network metadata')
                    meta.delete_many({'_id': {'$in': _ids}})

        cleanup_edges()

        status.stop()

    return 0
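For reference, the command-line flags above parse as follows (the argv list here is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--id', action='store_true', help='load identifiers only')
parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
parser.add_argument('--warmstart', action='store_true', help='warmstart')

args = parser.parse_args(['--batch', '5000', '--warmstart'])
assert args.id is False and args.batch == 5000 and args.warmstart is True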
Example #14
def put_state(id, state):
    client = pymongo.MongoClient(app.mongodb_uri)
    project = client.nav.projects.find_one({'_id': ObjectId(id)})
    project['state'] = state
    set_status(project, 'updated')
    client.nav.projects.save(project)
Example #15
def main():
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    columns = [
        'co-citation of worm gene',
        'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions',
        'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes',
        'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    metadata = {}

    for column in columns:
        m = {
            'collection': 'humannet',
            'name': column,
            'count': 0
        }
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())

    count = 0

    iterator = parse(columns, metadata, lines)
    while True:
        records = [record for record in islice(iterator, 1000)]
        if len(records) > 0:
            name_to_id = genemania.id_lookup_table(set(it['source'] for it in records) | set(it['target'] for it in records))
            for record in records:
                source = name_to_id[record['source']]
                if source is None:
                    log.warning('unknown source %s', record['source'])
                record['source'] = source

                target = name_to_id[record['target']]
                if target is None:
                    log.warning('unknown target %s', record['target'])
                record['target'] = target

            records = [record for record in records if record['source'] is not None and record['target'] is not None]
            count += len(records)
            edges.insert_many(records)
            log.debug('inserted %d edges (%d total)', len(records), count)
        else:
            break

    for m in metadata.itervalues():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0
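Note: create_edges_index() is called by all of the loaders but is not included in these examples. Given the queries used elsewhere (lookups by 'source', 'target', and 'meta'), it presumably creates indexes on those fields. A guess at what it might do, using the standard pymongo create_index API:

import pymongo

def create_edges_index():
    # hypothetical sketch: index the fields the loaders and run_job query on
    edges = pymongo.MongoClient().networks.edges
    edges.create_index([('source', pymongo.ASCENDING), ('meta', pymongo.ASCENDING)])
    edges.create_index([('target', pymongo.ASCENDING), ('meta', pymongo.ASCENDING)])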