Ejemplo n.º 1
0
 def __init__(self,
              dbFileName,        # file name to connect to
              verbosity=-1       # turn off all DP chatter
              ):
     Interface.__init__(self, dbFileName, verbosity)
     self.db = TrackMDB(verbosity=verbosity)
Ejemplo n.º 2
0
class ViewInterface(Interface):
    """Use this interface for querying the TrackM DB"""

    def __init__(self,
                 dbFileName,        # file name to connect to
                 verbosity=-1       # turn off all DP chatter
                 ):
        Interface.__init__(self, dbFileName, verbosity)
        self.db = TrackMDB(verbosity=verbosity)

    def getOutstandingPairs(self, aniCut=95., batch=None):
        """Get all the outstanding pairs based on batch and ani cutoff

        returns a list of tuples of type:
        (job_id, gPath1, gPath2, gid1, gid2, batch, ani_comp)
        """
        self.connect()
        # get all the outstanding pairs
        C = Condition("ani_comp", "=", "'-1'")
        if batch is not None:
            C = Condition(C, "and", Condition("batch", "=", "%s"%batch))
        all_pairs = self.select('pairs', ["pid", "gid_1", "gid_2", "ani_1", "ani_2", "batch"], C)
        gids = self.select('paths', ['gid', 'path'])
        self.disconnect()
        gids = dict(gids)
        vetted_pairs = []
        seen_gids = {}
        for pair in all_pairs:
            highest_ani = np.max([pair[3], pair[4]])
            if highest_ani <= aniCut:
                # pair is OK
                vetted_pairs.append((pair[0], gids[pair[1]], gids[pair[2]], pair[1], pair[2], pair[5], highest_ani))
        return vetted_pairs

    def getGids(self):
        """Get a list of all the GIDs"""
        self.connect()
        gids = self.select('paths', ['gid', 'path'])
        self.disconnect()
        # unwrap these guys
        self.gids = dict(zip([gid[0] for gid in gids], [True] * len(gids)))
        return self.gids

    def getGenomePaths(self):
        """Get a dict of type {gid -> path}"""
        self.connect()
        paths = self.select('paths', ["gid", "path"])
        self.disconnect()
        if len(paths) == 0:
            return {}
        return dict(paths)

    def getContigHeaders(self):
        """Get a dict of type {'header' -> cid}"""
        self.connect()
        headers = self.select('contigs', ['header', 'cid'])
        self.disconnect()
        if len(headers) == 0:
            return {}
        return dict(headers)

    def getHitData(self,
                   condition       # conditional statement for searching the db
                   ):
        """Populate a hit data instance with data that fits the given condition"""

        ret_HD = HitData()
        self.connect()
        
        rows = self.select('pairs', ['pid', 'ani_comp', 'gid_1', 'gid_2'], condition=condition)
        if len(rows) == 0:
            return ret_HD

        

        #print condition
        #select_str = "SELECT hits.pid, len_1, len_2 FROM hits INNER JOIN pairs ON pairs.pid = hits.pid WHERE %s" % str(condition)
        select_str = "SELECT hits.pid, len_1, len_2 FROM hits INNER JOIN pairs ON pairs.pid = hits.pid WHERE pairs.ani_comp <= 95 AND hits.ident >= 99 AND hits.len_1 >= 500 AND hits.len_2 >= 500"
        #select_str = "SELECT hits.pid, seqs.sqid, seqs.seq FROM hits INNER JOIN seqs ON seqs.sqid = hits.sqid_1 INNER JOIN pairs ON pairs.pid = hits.pid WHERE hits.ident >= 99 AND hits.len_1 >= 500 AND hits.len_2 >= 500"
        #select_str = "SELECT hits.pid, seqs.sqid, seqs.seq FROM hits INNER JOIN seqs ON seqs.sqid = hits.sqid_2 INNER JOIN pairs ON pairs.pid = hits.pid WHERE hits.ident >= 99 AND hits.len_1 >= 500 AND hits.len_2 >= 500"
        
        print select_str
        cur = self.db.getCursor()
        cur.execute(select_str)
        hits = cur.fetchall()
        
        self.disconnect()
        # make a tmp dict of pid -> cidTuple

        pid_lookup = {}
        for row in rows:
            key = ret_HD.makeKey(row[2],row[3])
            ret_HD.makeLookUp(row[0],key)
            #pid_lookup[row[0]] = key
            ret_HD.workingIds[key[0]] = False               # we've seen these guys now
            ret_HD.workingIds[key[1]] = False
            ret_HD.addHit(key[0], key[1], 0)                # cal with length 0 to initialise an empty hit between these two genomes
            ret_HD.addANIIdentity(key[0], key[1], row[1])   # add the identity for this pair
        """
        # now parse the seqs
        for hit in hits:
            seq = hit[2]                           # transferred sequence
            sqid = hit[1]
            pid = hit[0]
            key = pid_lookup[hit[0]]               # get the key (gid1, gid2)
            print ">%s_%s" % (pid,sqid) 
            print seq
        """ 
            
        
        
        # now parse the hits
        print "No. of rows in hits data %d" % len(hits)
        for hit in hits:
            #print hit
            length = np.mean(hit[1:])              # mean length used here
            key = ret_HD.pidLookUp[hit[0]]
            #key = pid_lookup[hit[0]]               # get the key (gid1, gid2)
            ret_HD.addHit(key[0], key[1], length)  # add the length (increments hit count)
            ret_HD.workingIds[key[0]] = True       # we've seen these guys now
            ret_HD.workingIds[key[1]] = True
        
        


        return ret_HD