Ejemplo n.º 1
0
def cache(output, key, filename):
    '''
    Persist output to a file and register that file in the cache (database).

    output: the object to save; it is handed to util.dumpObject().
    key: the cache key under which filename is stored via cacheSet().
    filename: path of the file that output is dumped to.
    returns: output, unchanged.
    '''
    # The dump runs first, so if it raises, the cache entry is never written.
    util.dumpObject(output, filename)
    cacheSet(key, filename)
    return output
Ejemplo n.º 2
0
def computeBlastHits(queryFastaPath, subjectIndexPath, outPath, evalue,
                     limitHits=MAX_HITS, workingDir='.', copyToWorking=False):
    '''
    Run getBlastHits() and save the hits it returns to outPath.

    queryFastaPath: location of the fasta file of query sequences.
    subjectIndexPath: location and name of the blast-formatted indexes.
    outPath: location of the file where the blast hits are saved.  It is only
      used for the dump; it is not passed to getBlastHits().
    evalue: a string or float, the maximum evalue threshold of hits to get.
    limitHits: forwarded to getBlastHits() unchanged.
    workingDir: a directory is created, used, and removed under workingDir.
    copyToWorking: if True, the query fasta file and subject index files are
      copied into the working directory and the copies are used to blast.
      This can improve performance if the working directory is on local disk
      and the files are on a slow network.
    returns: None.
    '''
    util.dumpObject(
        getBlastHits(queryFastaPath, subjectIndexPath, evalue, limitHits,
                     workingDir, copyToWorking),
        outPath)
Ejemplo n.º 3
0
def computeBlastHits(queryFastaPath,
                     subjectIndexPath,
                     outPath,
                     evalue,
                     limitHits=MAX_HITS,
                     workingDir='.',
                     copyToWorking=False):
    '''
    Compute blast hits with getBlastHits() and persist them to outPath.

    queryFastaPath: location of the fasta file of query sequences.
    subjectIndexPath: location and name of the blast-formatted indexes.
    outPath: location of the file where the blast hits are saved.
    evalue: a string or float, the maximum evalue threshold of hits to get.
    limitHits: passed through to getBlastHits() as given.
    workingDir: a directory is created, used, and removed under workingDir.
    copyToWorking: if True, copy the query fasta file and subject index files
      into the working directory and blast against the copies.  Can improve
      performance if the working directory is on local disk and the files are
      on a slow network.
    returns: None.
    '''
    # Everything except outPath is forwarded, in this order, to getBlastHits().
    blastArgs = (queryFastaPath, subjectIndexPath, evalue, limitHits,
                 workingDir, copyToWorking)
    util.dumpObject(getBlastHits(*blastArgs), outPath)
Ejemplo n.º 4
0
def doOrthologyQuery(query_desc=None, tc_only=False,
                     db_cursor_read_buffer_size=DEFAULT_DB_CURSOR_READ_BUFFER_SIZE,
                     genome=None, limit_genomes=None, genomes=None,
                     seq_ids=None, divergence=None, evalue=None, go_term=False,
                     gene_name=False, outputPath=None, sortGenomes=True,
                     distance_lower_limit=None, distance_upper_limit=None,
                     release=None, dataset=None, **keywords):
    '''
    Query the database for orthologs, group them into clusters (connected
    subgraphs), and describe the result as a table.

    query_desc: string describing the query being run.  used by the web to let
      the user know what query was run to generate these results.
    tc_only: if true, only transitively closed clusters that contain a
      sequence from every genome in the result are reported in the rows.
    db_cursor_read_buffer_size: not referenced in this function body.
    genome: get orthologs with a sequence from this genome.  if it appears in
      the result it is moved to the first column.
    limit_genomes: get orthologs with a sequence in a genome from
      limit_genomes.
    genomes: get orthologs where both sequences are from genomes.
    seq_ids: a list of external_sequence_ids/accession numbers/GIs.  if not
      empty, only orthologs whose query or subject external sequence id is in
      seq_ids are clustered.  the ids must be hashable.
    divergence: get orthologs calculated with this divergence threshold.
    evalue: get orthologs calculated with this evalue threshold.
    go_term: if true, a mapping of seq ids to go terms is returned for the seq
      ids in the orthology results.
    gene_name: if true, a mapping of seq ids to gene names is returned for the
      seq ids in the orthology results.
    outputPath: if not None (or otherwise empty), the result is dumped to this
      path with util.dumpObject() and None is returned instead.
    sortGenomes: if true, the genome columns are sorted.
    distance_lower_limit, distance_upper_limit: passed to
      makeLowerAndUpperLimitFilterFuncs(); an ortholog is used only if both
      resulting filter functions accept it.
    release: passed to every database call and stored in the result.
    dataset: stored in the result.
    keywords: ignored.  here for historical compatibility reasons.

    returns: a dict containing clusters ('rows'), column headers ('headers'),
    the edges of each cluster ('orthologs'), the query parameters, and dicts
    for sequence data, genome names and, if requested, gene names and go
    terms.  None is returned when outputPath is used.
    '''
    externalIdKey = roundup_common.EXTERNAL_SEQUENCE_ID_KEY
    genomeIdKey = roundup_common.GENOME_ID_KEY

    tableDesc = {'query_desc': query_desc, 'release': release,
                 'dataset': dataset}

    distanceLowerLimitFilter, distanceUpperLimitFilter = makeLowerAndUpperLimitFilterFuncs(
        distance_lower_limit, distance_upper_limit)

    # build the lookup set once so each membership test below is O(1).
    if seq_ids:
        seqIdFilter = set(seq_ids)
    else:
        seqIdFilter = None

    with roundup_db.connCM() as conn:
        pairs = makePairsForGenomeParams(genome, limit_genomes, genomes)
        # orthologsLists is a list of lists of
        # (query_sequence_id, subject_sequence_id, distance) tuples
        orthologsLists = []
        for pair in pairs:
            orthologsLists.append(
                roundup_db.getOrthologs(release, qdb=pair[0], sdb=pair[1],
                                        divergence=divergence, evalue=evalue,
                                        conn=conn))

        # apply the distance filters a single time; both the id collection and
        # the clustering below work from this filtered list.
        keptOrthologs = [ortholog
                         for ortholog in itertools.chain(*orthologsLists)
                         if distanceLowerLimitFilter(ortholog)
                         and distanceUpperLimitFilter(ortholog)]

        sequenceIds = set()
        for ortholog in keptOrthologs:
            sequenceIds.add(ortholog[0])
            sequenceIds.add(ortholog[1])

        # get sequence data map from sequenceId to external_id, genome_id, gene_name.
        sequenceIds = list(sequenceIds)
        sequenceIdToSequenceDataMap = roundup_db.getSequenceIdToSequenceDataMap(
            release, sequenceIds, conn=conn)

        # cluster orthologs, limiting by seq_ids
        clusterer = clustering.EdgeClusterer(storeEdges=True)
        for ortholog in keptOrthologs:
            # skip orthologs where neither sequence is in seq_ids
            if (seqIdFilter is not None
                    and sequenceIdToSequenceDataMap[ortholog[0]][externalIdKey] not in seqIdFilter
                    and sequenceIdToSequenceDataMap[ortholog[1]][externalIdKey] not in seqIdFilter):
                continue
            clusterer.cluster(ortholog)

        # get genome database ids.  these come from every distance-filtered
        # sequence id, including those later excluded by seq_ids.
        genomeIds = list(set(sequenceIdToSequenceDataMap[seqId][genomeIdKey]
                             for seqId in sequenceIds))
        # kept under its own name so the genomes parameter is not rebound.
        resultGenomes = [roundup_db.getGenomeForId(release, id=genomeId, conn=conn)
                         for genomeId in genomeIds]
        # map genome to genomeId and back, before the list is reordered
        genomeToGenomeId = dict(zip(resultGenomes, genomeIds))
        genomeIdToGenome = dict(zip(genomeIds, resultGenomes))
        # sorted genomes, with genome keyword (if any) at front.
        if sortGenomes:
            resultGenomes.sort()
        if genome and genome in resultGenomes:
            resultGenomes.remove(genome)
            resultGenomes.insert(0, genome)
        # map genomeId to column in result rows
        genomeIdToCol = dict((genomeToGenomeId[resultGenome], col)
                             for col, resultGenome in enumerate(resultGenomes))

        # add each cluster to the cluster table
        # each row contains the genes for each genome in the correct column
        # and the avg distance of the cluster edges.
        clusterTable = []
        clusterOrthologsList = []
        headerRow = resultGenomes + ['Average Evolutionary Distance']
        numGenomes = len(genomeIds)
        for clusterId, cluster in clusterer.clusterIdToNodes.iteritems():
            # NOTE(review): the edges are recorded before the tc_only check, so
            # 'orthologs' can hold entries for clusters that are left out of
            # 'rows' -- confirm that consumers expect this.
            clusterOrthologsList.append(clusterer.clusterIdToEdges[clusterId])
            numNodes = len(cluster)
            numEdges = clusterer.clusterIdToNumEdges[clusterId]
            numClassesInCluster = len(set(sequenceIdToSequenceDataMap[gene][genomeIdKey]
                                          for gene in cluster))
            # if tc_only, do not report non-transitively closed clusters or
            # clusters that do not span every genome.  a transitively closed
            # cluster of n nodes has n * (n - 1) / 2 edges.
            if tc_only and (numEdges < (numNodes * (numNodes - 1)) // 2
                            or numClassesInCluster != numGenomes):
                continue
            # initialize lists for genes in each genome belonging to cluster
            clusterRow = [[] for _ in range(numGenomes)]
            # tack on avg dist to end of row.
            avgEdgeDist = clusterer.clusterIdToSumDistances[clusterId] / float(numEdges)
            clusterRow.append('%.3f' % avgEdgeDist)
            try:
                for gene in cluster:
                    genomeId = sequenceIdToSequenceDataMap[gene][genomeIdKey]
                    clusterRow[genomeIdToCol[genomeId]].append(gene)
            except Exception:
                # log the state needed to diagnose the failed lookup, then let
                # the error propagate.
                logging.debug('gene: '+str(gene))
                logging.debug('genomes: '+str(resultGenomes))
                logging.debug('genomeIds: '+str(genomeIds))
                logging.debug('sequenceIdToSequenceDataMap: '+str(sequenceIdToSequenceDataMap))
                logging.debug('genomeIdToCol: '+str(genomeIdToCol))
                raise
            clusterTable.append(clusterRow)

        tableDesc['type'] = 'clusters'
        tableDesc['headers'] = headerRow
        tableDesc['rows'] = clusterTable
        tableDesc['orthologs'] = clusterOrthologsList
        tableDesc['divergence'] = divergence
        tableDesc['evalue'] = evalue

        # copy only the external id and genome id of each sequence; gene names
        # and go terms are added below when requested.
        seqIdDataMap = {}
        for seqId in sequenceIdToSequenceDataMap:
            seqData = sequenceIdToSequenceDataMap[seqId]
            seqIdDataMap[seqId] = {externalIdKey: seqData[externalIdKey],
                                   genomeIdKey: seqData[genomeIdKey]}
        if gene_name:
            tableDesc['has_gene_names'] = True
            for seqId in sequenceIdToSequenceDataMap:
                seqIdDataMap[seqId][roundup_common.GENE_NAME_KEY] = sequenceIdToSequenceDataMap[seqId][roundup_common.GENE_NAME_KEY]
        if go_term:
            tableDesc['has_go_terms'] = True
            (sequenceIdToTermsMap, termMap) = roundup_db.getSequenceIdToTermsMap(
                release, sequenceIds, conn=conn)
            for seqId in sequenceIdToSequenceDataMap:
                # sequences without terms get an empty list
                seqIdDataMap[seqId][roundup_common.TERMS_KEY] = sequenceIdToTermsMap.get(seqId, [])
            tableDesc['term_map'] = termMap
        tableDesc['seq_id_to_data_map'] = seqIdDataMap
        tableDesc['genome_id_to_genome_map'] = genomeIdToGenome

    if outputPath:
        util.dumpObject(tableDesc, outputPath)
        return None
    return tableDesc