Ejemplo n.º 1
0
def getRawResults(params):
    '''
    Fetch the orthologs for one parameter combination as downloadable text.

    params: a 4-tuple of roundup params like (qdb, sdb, div, evalue).
    returns: a string with one line per ortholog, each line holding the
      external query sequence id, the external subject sequence id and the
      distance, separated by tabs and terminated by a newline.  None is
      returned when the database yields no orthologs for the params.
    '''
    qdb, sdb, div, evalue = params
    orderedPair = roundup_common.makePair(qdb, sdb)

    # Orthologs come back as (query sequence id, subject sequence id,
    # distance) rows keyed by database sequence ids.
    rows = roundup_db.getOrthologs(release=webconfig.CURRENT_RELEASE,
                                   qdb=orderedPair[0], sdb=orderedPair[1],
                                   divergence=div, evalue=evalue)

    # Collect every distinct sequence id so the external ids can be looked up
    # in a single call.
    uniqueIds = set()
    for row in rows:
        uniqueIds.update((row[0], row[1]))
    idToData = roundup_db.getSequenceIdToSequenceDataMap(
        release=webconfig.CURRENT_RELEASE, sequenceIds=list(uniqueIds))

    if not rows:
        return None

    # Translate database ids to external ids, one tab-separated line per row.
    externalIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    lines = []
    for qid, sid, dist in rows:
        lines.append('{}\t{}\t{}\n'.format(idToData[qid][externalIdKey],
                                           idToData[sid][externalIdKey],
                                           dist))
    return ''.join(lines)
Ejemplo n.º 2
0
def getOrthData(params):
    '''
    Fetch the orthologs for one parameter combination, keyed by external ids.

    params: a 4-tuple of roundup params like (qdb, sdb, div, evalue).
    returns: a tuple (params, orthologs) where orthologs is a list of
      (external query sequence id, external subject sequence id, distance)
      tuples with the distance converted to a string.
    '''
    qdb, sdb, div, evalue = params
    orderedPair = roundup_common.makePair(qdb, sdb)

    # Rows from the database are keyed by database sequence ids.
    rows = roundup_db.getOrthologs(release=webconfig.CURRENT_RELEASE,
                                   qdb=orderedPair[0], sdb=orderedPair[1],
                                   divergence=div, evalue=evalue)

    # Gather the distinct sequence ids and fetch their data in one call.
    uniqueIds = set()
    for row in rows:
        uniqueIds.update((row[0], row[1]))
    idToData = roundup_db.getSequenceIdToSequenceDataMap(
        release=webconfig.CURRENT_RELEASE, sequenceIds=list(uniqueIds))

    # Swap database ids for external ids and stringify the distance.
    translated = []
    for qid, sid, dist in rows:
        queryExternalId = idToData[qid][roundup_common.EXTERNAL_SEQUENCE_ID_KEY]
        subjectExternalId = idToData[sid][roundup_common.EXTERNAL_SEQUENCE_ID_KEY]
        translated.append((queryExternalId, subjectExternalId, str(dist)))
    return (params, translated)
Ejemplo n.º 3
0
def doOrthologyQuery(query_desc=None, tc_only=False,
                     db_cursor_read_buffer_size=DEFAULT_DB_CURSOR_READ_BUFFER_SIZE,
                     genome=None, limit_genomes=None, genomes=None,
                     seq_ids=None, divergence=None, evalue=None, go_term=False,
                     gene_name=False, outputPath=None, sortGenomes=True,
                     distance_lower_limit=None, distance_upper_limit=None,
                     release=None, dataset=None, **keywords):
    '''
    Query the database for orthologs, group them into clusters (connected
    subgraphs) and describe the result as a table.

    query_desc: string describing the query being run.  used by the web to let
      the user know what query was run to generate these results.
    tc_only: if true, a cluster is only reported as a row when it has an edge
      between every pair of its nodes (transitively closed) and it contains a
      sequence from every genome in the results.
    db_cursor_read_buffer_size: not used by this function.
    genome: get orthologs with a sequence from this genome.  if present in the
      results it is moved to the first column.
    limit_genomes: get orthologs with a sequence in a genome from
      limit_genomes.
    genomes: get orthologs where both sequences are from genomes.
    seq_ids: a list of external_sequence_ids/accession numbers/GIs.  if not
      empty, only orthologs whose query or subject external id is in seq_ids
      are clustered.
    divergence: get orthologs calculated with this divergence threshold.
    evalue: get orthologs calculated with this evalue threshold.
    go_term: if true, go terms are added to the per-sequence data and a term
      map is added to the result.
    gene_name: if true, gene names are added to the per-sequence data.
    outputPath: if not None, the result is dumped to this path with
      util.dumpObject and None is returned.
    sortGenomes: if true, the genome columns are sorted.
    distance_lower_limit, distance_upper_limit: passed to
      makeLowerAndUpperLimitFilterFuncs to build the filters that every
      ortholog must pass.
    release: release to query; also stored in the result.
    dataset: stored in the result.
    keywords: ignored.  here for historical compatibility reasons.

    returns: a dict with the keys 'query_desc', 'release', 'dataset', 'type',
      'headers', 'rows', 'orthologs', 'divergence', 'evalue',
      'seq_id_to_data_map' and 'genome_id_to_genome_map', plus
      'has_gene_names' if gene_name and 'has_go_terms' and 'term_map' if
      go_term.  None is returned instead when outputPath is given.
    '''
    # NOTE(review): db_cursor_read_buffer_size is accepted but never read here.
    tableDesc = {'query_desc': query_desc, 'release': release,
                 'dataset': dataset}

    distanceLowerLimitFilter, distanceUpperLimitFilter = makeLowerAndUpperLimitFilterFuncs(distance_lower_limit, distance_upper_limit)

    # Build the lookup set once so the per-ortholog membership tests below do
    # not rescan the whole seq_ids list every time.
    wantedExternalIds = frozenset(seq_ids) if seq_ids else None

    externalIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    genomeIdKey = roundup_common.GENOME_ID_KEY

    with roundup_db.connCM() as conn:
        pairs = makePairsForGenomeParams(genome, limit_genomes, genomes)
        # orthologsLists is a list of lists of
        # (query_sequence_id, subject_sequence_id, distance) tuples
        orthologsLists = []
        for pair in pairs:
            orthologsLists.append(
                roundup_db.getOrthologs(release, qdb=pair[0], sdb=pair[1],
                                        divergence=divergence, evalue=evalue,
                                        conn=conn))

        # Apply the distance filters a single time; both the id collection
        # and the clustering below work from this filtered list.
        keptOrthologs = [ortholog for ortholog
                         in itertools.chain.from_iterable(orthologsLists)
                         if distanceLowerLimitFilter(ortholog)
                         and distanceUpperLimitFilter(ortholog)]

        uniqueSequenceIds = set()
        for ortholog in keptOrthologs:
            uniqueSequenceIds.add(ortholog[0])
            uniqueSequenceIds.add(ortholog[1])

        # get sequence data map from sequenceId to external_id, genome_id, gene_name.
        sequenceIds = list(uniqueSequenceIds)
        sequenceIdToSequenceDataMap = roundup_db.getSequenceIdToSequenceDataMap(
            release, sequenceIds, conn=conn)

        # cluster orthologs, limiting by seq_ids
        clusterer = clustering.EdgeClusterer(storeEdges=True)
        for ortholog in keptOrthologs:
            # skip orthologs where neither sequence is in seq_ids
            if (wantedExternalIds is not None
                    and sequenceIdToSequenceDataMap[ortholog[0]][externalIdKey] not in wantedExternalIds
                    and sequenceIdToSequenceDataMap[ortholog[1]][externalIdKey] not in wantedExternalIds):
                continue
            clusterer.cluster(ortholog)

        # get genome database ids
        genomeIds = list(set(sequenceIdToSequenceDataMap[seqId][genomeIdKey]
                             for seqId in sequenceIds))
        genomes = [roundup_db.getGenomeForId(release, id=genomeId, conn=conn)
                   for genomeId in genomeIds]
        # map genome to genomeId and back; built before genomes is reordered,
        # while it is still parallel to genomeIds.
        genomeToGenomeId = dict(zip(genomes, genomeIds))
        genomeIdToGenome = dict(zip(genomeIds, genomes))
        # sorted genomes, with genome keyword (if any) at front.
        if sortGenomes:
            genomes.sort()
        if genome and genome in genomes:
            genomes.remove(genome)
            genomes.insert(0, genome)
        # map genomeId to column in result rows
        genomeIdToCol = dict((genomeToGenomeId[name], col)
                             for col, name in enumerate(genomes))

        # add each cluster to the cluster table
        # each row contains the genes for each genome in the correct column
        # and the avg distance of the cluster edges.
        clusterTable = []
        clusterOrthologsList = []
        headerRow = genomes + ['Average Evolutionary Distance']
        for clusterId, cluster in clusterer.clusterIdToNodes.iteritems():
            # NOTE(review): edges are recorded before the tc_only check, so
            # with tc_only 'orthologs' can hold more entries than 'rows'.
            # Kept as-is; confirm whether consumers expect them to line up.
            clusterOrthologsList.append(clusterer.clusterIdToEdges[clusterId])
            numNodes = len(cluster)
            numEdges = clusterer.clusterIdToNumEdges[clusterId]
            numClassesInCluster = len(set(sequenceIdToSequenceDataMap[gene][genomeIdKey]
                                          for gene in cluster))
            # if tc_only, do not report non-transitively closed clusters or
            # clusters that do not span every genome.
            if tc_only and (numEdges < (numNodes * (numNodes - 1)) // 2
                            or numClassesInCluster != len(genomeIds)):
                continue
            # one list of genes per genome column, then the avg dist at the
            # end of the row.
            clusterRow = [[] for _ in genomeIds]
            avgEdgeDist = clusterer.clusterIdToSumDistances[clusterId] / float(numEdges)
            clusterRow.append('%.3f' % avgEdgeDist)
            try:
                for gene in cluster:
                    genomeId = sequenceIdToSequenceDataMap[gene][genomeIdKey]
                    clusterRow[genomeIdToCol[genomeId]].append(gene)
            except Exception:
                # Record the state that led to the failure, then let the
                # original exception propagate.
                logging.debug('gene: '+str(gene))
                logging.debug('genomes: '+str(genomes))
                logging.debug('genomeIds: '+str(genomeIds))
                logging.debug('sequenceIdToSequenceDataMap: '+str(sequenceIdToSequenceDataMap))
                logging.debug('genomeIdToCol: '+str(genomeIdToCol))
                raise
            clusterTable.append(clusterRow)

        tableDesc['type'] = 'clusters'
        tableDesc['headers'] = headerRow
        tableDesc['rows'] = clusterTable
        tableDesc['orthologs'] = clusterOrthologsList
        tableDesc['divergence'] = divergence
        tableDesc['evalue'] = evalue

        # Per-sequence data exposed to the caller: always the external id and
        # genome id, optionally the gene name and go terms.
        seqIdDataMap = dict(
            (seqId, {externalIdKey: seqData[externalIdKey],
                     genomeIdKey: seqData[genomeIdKey]})
            for seqId, seqData in sequenceIdToSequenceDataMap.items())
        if gene_name:
            tableDesc['has_gene_names'] = True
            for seqId in sequenceIdToSequenceDataMap:
                seqIdDataMap[seqId][roundup_common.GENE_NAME_KEY] = sequenceIdToSequenceDataMap[seqId][roundup_common.GENE_NAME_KEY]
        if go_term:
            tableDesc['has_go_terms'] = True
            (sequenceIdToTermsMap, termMap) = roundup_db.getSequenceIdToTermsMap(
                release, sequenceIds, conn=conn)
            for seqId in sequenceIdToSequenceDataMap:
                seqIdDataMap[seqId][roundup_common.TERMS_KEY] = sequenceIdToTermsMap.get(seqId, [])
            tableDesc['term_map'] = termMap
        tableDesc['seq_id_to_data_map'] = seqIdDataMap
        tableDesc['genome_id_to_genome_map'] = genomeIdToGenome

    if outputPath:
        util.dumpObject(tableDesc, outputPath)
        return None
    return tableDesc