예제 #1
0
    def Split( self, max_gap_length ):
        """Split each alignment at long gaps and reload the table.

        Every row of the table ``self.name`` is read and its alignment is
        handed to ``alignlib.splitAlignata`` together with max_gap_length.
        Each fragment that comes back is written as a separate row to a
        temporary load file. Afterwards the table is dropped, re-created
        and loaded from that file. Splitting is necessary, as structural
        domains can be discontinuous.

        max_gap_length -- gap threshold passed to alignlib.splitAlignata
                          (an alignment is split where a gap is longer
                          than this value, as far as the original
                          description goes -- confirm against alignlib).
        """

        statement = """
        SELECT nid, start, end, rep_ali,
        domain_id, domain_from, domain_to, domain_ali,
        family
        %s
        FROM %s""" % (self.GetAdditionalInfo(), self.name )

        domains = self.Execute(statement).fetchall()

        # NOTE(review): os.tempnam only returns a name, so another process
        # could create the file first. It is kept here because the loader
        # behind self.Load() may rely on the default file permissions.
        temp_filename = os.tempnam(Pairsdb.PATH_LOAD, "scmp")

        outfile = open( temp_filename, "w" )

        # make sure the handle is released even if a fragment fails to
        # be written
        try:
            for domain in domains:
                # the first nine columns are fixed, everything after them
                # is additional info that is passed through unchanged.
                # The stored end coordinates are not needed, since they
                # are recomputed from each fragment.
                (nid, start, _end, rep_ali,
                 domain_id, domain_from, _domain_to, domain_ali,
                 family) = domain[:9]

                map_rep2domains = alignlib.makeAlignataVector()

                alignlib.fillAlignataCompressed( map_rep2domains, start, rep_ali, domain_from, domain_ali)

                fragments = [ alignlib.AlignataPtr(x)
                              for x in alignlib.splitAlignata( map_rep2domains, max_gap_length) ]

                ## now write each fragment to the output
                for map_rep2domain in fragments:
                    ## so that the object gets deleted, once it goes out of scope
                    map_rep2domain.thisown = 1

                    self.WriteLine( outfile,
                                    nid,
                                    map_rep2domain,
                                    domain_id,
                                    family,
                                    domain[9:])
        finally:
            outfile.close()

        # replace the table contents with the split alignments
        self.Drop()
        self.Create()
        self.Load( temp_filename )
예제 #2
0
    def Split( self, max_gap_length ):
        """Split each alignment at long gaps and reload the table.

        All rows of the table ``self.name`` are fetched. The alignment of
        each row is passed to ``alignlib.splitAlignata`` with
        max_gap_length, and every resulting fragment is written as its own
        row to a temporary load file. The table is then dropped,
        re-created and loaded from that file. This is necessary, as
        structural domains can be discontinuous.

        max_gap_length -- gap threshold passed to alignlib.splitAlignata
                          (presumably an alignment is split at gaps longer
                          than this value -- confirm against alignlib).
        """

        statement = """
        SELECT nid, start, end, rep_ali,
        domain_id, domain_from, domain_to, domain_ali,
        family
        %s
        FROM %s""" % (self.GetAdditionalInfo(), self.name )

        domains = self.Execute(statement).fetchall()

        # NOTE(review): os.tempnam is race-prone (it returns a name, not
        # an open file). Kept because the loader behind self.Load() may
        # depend on the default permissions of the file.
        temp_filename = os.tempnam(Pairsdb.PATH_LOAD, "scmp")

        outfile = open( temp_filename, "w" )

        # close the handle on every exit path, including errors raised
        # while fragments are written
        try:
            for domain in domains:
                # nine fixed columns; the remainder (domain[9:]) is extra
                # info handed to WriteLine as is. The end coordinates from
                # the table are unused, each fragment carries its own.
                (nid, start, _end, rep_ali,
                 domain_id, domain_from, _domain_to, domain_ali,
                 family) = domain[:9]

                map_rep2domains = alignlib.makeAlignataVector()

                alignlib.fillAlignataCompressed( map_rep2domains, start, rep_ali, domain_from, domain_ali)

                val = alignlib.splitAlignata( map_rep2domains, max_gap_length)

                fragments = [ alignlib.AlignataPtr(x) for x in val ]

                ## now write each fragment to the output
                for map_rep2domain in fragments:
                    ## so that the object gets deleted, once it goes out of scope
                    map_rep2domain.thisown = 1

                    self.WriteLine( outfile,
                                    nid,
                                    map_rep2domain,
                                    domain_id,
                                    family,
                                    domain[9:])
        finally:
            outfile.close()

        # rebuild the table from the freshly written fragments
        self.Drop()
        self.Create()
        self.Load( temp_filename )
예제 #3
0
    def GetLinks(self, query_nid, query_from, query_to, query_ali, sbjct_nid,
                 sbjct_from, sbjct_to, sbjct_ali):
        """returns all possible links between link split into domains.
        """

        if self.mLogLevel >= 2:
            print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to
            sys.stdout.flush()

        map_query2sbjct = alignlib.makeAlignataVector()

        alignlib.fillAlignataCompressed(map_query2sbjct, query_from, query_ali,
                                        sbjct_from, sbjct_ali)

        # iterate over query
        for query_domain_from, query_domain_to, query_family in self.mDomains[
                query_nid]:

            # check if overlap
            overlap = min(query_to, query_domain_to) - max(
                query_from, query_domain_from) + 1
            if overlap <= self.mMinOverlapResidues: continue

            # check for overlap with domains in sbjct
            for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[
                    sbjct_nid]:

                overlap = min(sbjct_to, sbjct_domain_to) - max(
                    sbjct_from, sbjct_domain_from) + 1
                if overlap < self.mMinOverlapResidues: continue

                map_new_query2sbjct = alignlib.makeAlignataVector()
                alignlib.copyAlignata(map_new_query2sbjct, map_query2sbjct,
                                      query_domain_from, query_domain_to,
                                      sbjct_domain_from, sbjct_domain_to)

                if map_new_query2sbjct.getLength() > 0:

                    row_ali, col_ali = alignlib.writeAlignataCompressed(
                        map_new_query2sbjct)

                    print string.join(
                        ("%s_%s_%s" %
                         (query_nid, query_domain_from, query_domain_to),
                         "%s_%s_%s" %
                         (sbjct_nid, sbjct_domain_from, sbjct_domain_to), "0",
                         str(map_new_query2sbjct.getRowFrom()),
                         str(map_new_query2sbjct.getRowTo()), row_ali,
                         str(map_new_query2sbjct.getColFrom()),
                         str(map_new_query2sbjct.getColTo()), col_ali), "\t")
    def GetLinks( self, query_nid, query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali):
        """returns all possible links between link split into domains.
        """

        if self.mLogLevel >= 2:
            print "# processing", query_nid, sbjct_nid, query_from, query_to, sbjct_from, sbjct_to
            sys.stdout.flush()
            
        map_query2sbjct = alignlib.makeAlignataVector()

        alignlib.fillAlignataCompressed( map_query2sbjct, query_from, query_ali, sbjct_from, sbjct_ali )

        # iterate over query
        for query_domain_from, query_domain_to, query_family in self.mDomains[query_nid]:

            # check if overlap
            overlap = min(query_to, query_domain_to)-max(query_from, query_domain_from) + 1
            if overlap <= self.mMinOverlapResidues: continue

            # check for overlap with domains in sbjct
            for sbjct_domain_from, sbjct_domain_to, sbjct_family in self.mDomains[sbjct_nid]:
                
                overlap = min(sbjct_to, sbjct_domain_to)-max(sbjct_from, sbjct_domain_from) + 1
                if overlap < self.mMinOverlapResidues: continue

                map_new_query2sbjct = alignlib.makeAlignataVector()
                alignlib.copyAlignata( map_new_query2sbjct, map_query2sbjct,
                                       query_domain_from, query_domain_to,
                                       sbjct_domain_from, sbjct_domain_to)

                if map_new_query2sbjct.getLength() > 0:

                    row_ali, col_ali = alignlib.writeAlignataCompressed(  map_new_query2sbjct )
                    
                    print string.join( ("%s_%s_%s" % (query_nid, query_domain_from, query_domain_to),
                                        "%s_%s_%s" % (sbjct_nid, sbjct_domain_from, sbjct_domain_to),
                                        "0",
                                        str(map_new_query2sbjct.getRowFrom()),
                                        str(map_new_query2sbjct.getRowTo()),
                                        row_ali,
                                        str(map_new_query2sbjct.getColFrom()),
                                        str(map_new_query2sbjct.getColTo()),
                                        col_ali), "\t")
예제 #5
0
    def MapAndAddDomains( self, domains):
        """Map a domain using an alignment and write to outputfile for 
        loading into table.

        map info from member to rep

        domains contains the following information:
        nid,                        # nid of new rep
        info_mem_from, info_mem_to, info_mem_ali,      # information to be mapped on mem
        info_from, info_to, info_ali,   # information to be mapped on other quantity
        start, end, rep_ali,      # map between mem and new rep, rep-part
        mem_from, mem_to, mem_ali,       # map between mem and new rep, mem-part
        ...info-fields
        """

        temp_filename = os.tempnam( Pairsdb.PATH_LOAD, "clup" )
        
        failed = 0
        
        outfile = open(temp_filename, "w")

        for domain in domains:
            
            ( nid,
              info_mem_from, info_mem_to, info_mem_ali,
              info_from, info_to, info_ali,
              start, end, rep_ali,
              mem_from, mem_to, mem_ali,
              domain_id, family) = domain[:15]
            
            # set does not work (for example 1b77 for ddd, obscure error, probably
            # due to destructors?
            map_info_mem2info = alignlib.makeAlignataVector()
            
            alignlib.fillAlignataCompressed( map_info_mem2info,
                                             info_mem_from, info_mem_ali.tostring(),
                                             info_from, info_ali.tostring() )
            
            map_rep2mem = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed( map_rep2mem,
                                             start, rep_ali.tostring(),
                                             mem_from, mem_ali.tostring() )
            
            map_rep2info = alignlib.makeAlignataVector()
            alignlib.combineAlignata( map_rep2info, map_rep2mem, map_info_mem2info, alignlib.CR) 

            if map_rep2info.getLength() == 0:
                if self.mLogLevel >= 2:
                    print "----> mapping failed for", domain
                    sys.stdout.flush()
                failed += 1
                
            else:
                self.WriteLine( outfile, nid, map_rep2info, domain_id, family,
                                domain[15:])
        
        outfile.close()
        
        self.Load( temp_filename )
        
        if self.mLogLevel >= 1:
            print "--> mapping failed for %i pairings." % failed
            sys.stdout.flush()

        return failed
예제 #6
0
    def MapAndAddDomains( self, domains):
        """Map a domain using an alignment and write to outputfile for 
        loading into table.

        map info from member to rep

        domains contains the following information:
        nid,                        # nid of new rep
        info_mem_from, info_mem_to, info_mem_ali,      # information to be mapped on mem
        info_from, info_to, info_ali,   # information to be mapped on other quantity
        start, end, rep_ali,      # map between mem and new rep, rep-part
        mem_from, mem_to, mem_ali,       # map between mem and new rep, mem-part
        ...info-fields
        """

        temp_filename = os.tempnam( Pairsdb.PATH_LOAD, "clup" )
        
        failed = 0
        
        outfile = open(temp_filename, "w")

        for domain in domains:
            
            ( nid,
              info_mem_from, info_mem_to, info_mem_ali,
              info_from, info_to, info_ali,
              start, end, rep_ali,
              mem_from, mem_to, mem_ali,
              domain_id, family) = domain[:15]
            
            # set does not work (for example 1b77 for ddd, obscure error, probably
            # due to destructors?
            map_info_mem2info = alignlib.makeAlignataVector()
            
            alignlib.fillAlignataCompressed( map_info_mem2info,
                                             info_mem_from, info_mem_ali.tostring(),
                                             info_from, info_ali.tostring() )
            
            map_rep2mem = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed( map_rep2mem,
                                             start, rep_ali.tostring(),
                                             mem_from, mem_ali.tostring() )
            
            map_rep2info = alignlib.makeAlignataVector()
            alignlib.combineAlignata( map_rep2info, map_rep2mem, map_info_mem2info, alignlib.CR) 

            if map_rep2info.getLength() == 0:
                if self.mLogLevel >= 2:
                    print "----> mapping failed for", domain
                    sys.stdout.flush()
                failed += 1
                
            else:
                self.WriteLine( outfile, nid, map_rep2info, domain_id, family,
                                domain[15:])
        
        outfile.close()
        
        self.Load( temp_filename )
        
        if self.mLogLevel >= 1:
            print "--> mapping failed for %i pairings." % failed
            sys.stdout.flush()

        return failed
예제 #7
0
파일: blast2fasta.py 프로젝트: siping/cgat
    # Fragment of a larger routine: reads links from stdin and prints the
    # aligned sequence pair of each link. The counters, `link`,
    # `sequences` and `options` are set up outside of this excerpt.

    # a single alignment object is reused for all links
    ali = alignlib.makeAlignataVector()
    
    for line in sys.stdin:
        
        # skip comment lines
        if line[0] == "#": continue

        link.Read( line )
        ninput += 1

        # both sequences of the link are required to write the alignment
        if link.mQueryToken not in sequences or link.mSbjctToken not in sequences:
            nskipped += 1
            continue
        
        # reset the alignment left over from the previous link
        ali.Clear()
        alignlib.fillAlignataCompressed( ali, link.mQueryFrom, link.mQueryAli, link.mSbjctFrom, link.mSbjctAli )


        # the textual pair alignment is split into its lines
        result = alignlib.writePairAlignment( sequences[link.mQueryToken], sequences[link.mSbjctToken], ali ).split("\n")

        # NOTE(review): an unexpected number of lines is counted as a
        # failure, but processing continues: result[0] and result[1] are
        # still indexed below and noutput is still incremented -- confirm
        # whether a `continue` is missing here.
        if len(result) != 3:
            nfailed += 1

        # the second tab-separated field of each line is printed as the
        # aligned sequence
        if options.format == "fasta":
            print ">%s %i-%i\n%s\n>%s %i-%i\n%s\n" %\
                  (link.mQueryToken, link.mQueryFrom, link.mQueryTo, result[0].split("\t")[1],
                   link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo, result[1].split("\t")[1] )
            
        noutput += 1
        
    # summary of the run
    print "# ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" % (ninput, noutput, nskipped, nfailed)
예제 #8
0
            identifiers.append( sbjct_nid )
        else:
            # fetch sequence and annotation of the sbjct and wrap them
            # into an alignatum object (GetSequence also returns a nid,
            # which replaces sbjct_nid from here on)
            sbjct_sequence, sbjct_nid = GetSequence( tbl_nrdb, sbjct_nid)
            (identifier, description ) = tbl_nrdb.GetAnnotationFromNid( sbjct_nid )
            # NOTE(review): string.atof is called with the integer 0, not
            # a string; presumably this is meant to be 0.0 -- confirm.
            sbjct_alignatum = pairsdblib.makeAlignatumNeighbour( sbjct_sequence,
                                                                 identifier,
                                                                 description,
                                                                 sbjct_nid,
                                                                 string.atof(0),
                                                                 string.atof(0),
                                                                 string.atof(score))


        ## build alignment between mali and new sequence
        # the start coordinates are converted to integers with string.atoi
        alignlib.fillAlignataCompressed( map_query2sbjct,
                                         string.atoi(query_from), query_ali,
                                         string.atoi(sbjct_from), sbjct_ali)
                                                 
        # an alignment that extends beyond the end of the sequence cannot
        # be added; report it and go on with the next entry
        if len(sbjct_sequence) < map_query2sbjct.getColTo():
            print "entry %i skipped, because sequence length (%i) less than last residue aligned (%i)!!!" %\
                  (sbjct_nid, len(sbjct_sequence), map_query2sbjct.getColTo())
            continue

        mali.addAlignatum( sbjct_alignatum, map_query2sbjct, 1, 0, 1, 1, 0 )
        # presumably hands ownership of the alignatum over to mali, so
        # that Python does not delete it -- TODO confirm
        sbjct_alignatum.thisown = 0

    consensus = mali.getConsensusString()

    # unless plain output is requested, register an MView renderer that
    # is built from the consensus
    if not param_plain:
        renderer = alignlib.makeRendererMView( consensus )
        mali.registerRenderer(renderer)
예제 #9
0
def BuildBLASTMatrix( dbhandle,
                      query_nid,
                      resolution = 1.0,
                      table_name = None,
                      combine_repeats = None,
                      max_evalue = None,
                      min_evalue = None,
                      residue_level = None,
                      parser = None,
                      add_self = None):
    """build matrix based on BLAST alignments to query_nid.

    matrix of size N*M
    N: number of neighbours
    M: length of query (scaled with resolution)

    alignments are truncated.

    the query is included in the matrix.
    
    if combine_repeats is set, multiple alignments between the query and a sbjct will
    be entered into the same row.

    if residue_level is set, entries are added on the residue level. The resolution parameter
    is ignored.
    """

    if residue_level:
        query_length = Table_nrdb(dbhandle).GetLength( query_nid )
    else:
        query_length = int( math.floor( float(Table_nrdb(dbhandle).GetLength( query_nid )) / float(resolution)))

    tbl_pairsdb_90x90 = TablePairsdbNeighbours( dbhandle )
    if table_name:
        tbl_pairsdb_90x90.SetName( table_name )

    neighbours = tbl_pairsdb_90x90.GetNeighbours( query_nid,
                                                  sort_order = 3,
                                                  skip_query = add_self,
                                                  min_evalue = min_evalue,
                                                  max_evalue = max_evalue)
    nindex = {}
    
    nneighbours = 0
    if combine_repeats:
        for neighbour in neighbours:
            (query_from, query_to, query_ali,
             sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour
            if not nindex.has_key(sbjct_nid):
                nindex[sbjct_nid] = nneighbours
                nneighbours += 1
    else:
        nneighbours = len(neighbours)

    if add_self:
        nneighbours += 1

    matrix = numpy.zeros( (nneighbours, query_length), numpy.int)    

    if add_self:
        matrix[0, 0:query_length] = 1
        row = 1
    else:
        row = 0
        
    for neighbour in neighbours:

        (query_from, query_to, query_ali,
         sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour

        if combine_repeats:
            use_row = nindex[sbjct_nid]
        else:
            use_row = row
            row += 1

        if residue_level:
            map_sbjct2query = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed( map_sbjct2query, sbjct_from, sbjct_ali, query_from, query_ali )
            if parser:
                parser( map_sbjct2query )
                
            for x in range(sbjct_from, sbjct_to + 1):
                y = map_sbjct2query.mapRowToCol(x)
                if y:
                    try:
                        matrix[use_row, y-1] = 1
                    except IndexError:
                        print "IndexError in ", query_nid, sbjct_nid, x, y-1, query_length
        else:
            yfrom = int(math.floor(query_from/resolution))
            yto   = int(math.floor(query_to/resolution)) 
            matrix[use_row, yfrom:yto] = 1
            
    return matrix
예제 #10
0
def BuildBLASTMatrix(
    dbhandle,
    query_nid,
    resolution=1.0,
    table_name=None,
    combine_repeats=None,
    max_evalue=None,
    min_evalue=None,
    residue_level=None,
    parser=None,
    add_self=None,
):
    """build matrix based on BLAST alignments to query_nid.

    matrix of size N*M
    N: number of neighbours
    M: length of query (scaled with resolution)

    alignments are truncated.

    the query is included in the matrix.
    
    if combine_repeats is set, multiple alignments between the query and a sbjct will
    be entered into the same row.

    if residue_level is set, entries are added on the residue level. The resolution parameter
    is ignored.
    """

    if residue_level:
        query_length = Table_nrdb(dbhandle).GetLength(query_nid)
    else:
        query_length = int(math.floor(float(Table_nrdb(dbhandle).GetLength(query_nid)) / float(resolution)))

    tbl_pairsdb_90x90 = TablePairsdbNeighbours(dbhandle)
    if table_name:
        tbl_pairsdb_90x90.SetName(table_name)

    neighbours = tbl_pairsdb_90x90.GetNeighbours(
        query_nid, sort_order=3, skip_query=add_self, min_evalue=min_evalue, max_evalue=max_evalue
    )
    nindex = {}

    nneighbours = 0
    if combine_repeats:
        for neighbour in neighbours:
            (
                query_from,
                query_to,
                query_ali,
                sbjct_nid,
                sbjct_from,
                sbjct_to,
                sbjct_ali,
                score,
                pide,
                evalue,
            ) = neighbour
            if not nindex.has_key(sbjct_nid):
                nindex[sbjct_nid] = nneighbours
                nneighbours += 1
    else:
        nneighbours = len(neighbours)

    if add_self:
        nneighbours += 1

    matrix = numpy.zeros((nneighbours, query_length), numpy.int)

    if add_self:
        matrix[0, 0:query_length] = 1
        row = 1
    else:
        row = 0

    for neighbour in neighbours:

        (query_from, query_to, query_ali, sbjct_nid, sbjct_from, sbjct_to, sbjct_ali, score, pide, evalue) = neighbour

        if combine_repeats:
            use_row = nindex[sbjct_nid]
        else:
            use_row = row
            row += 1

        if residue_level:
            map_sbjct2query = alignlib.makeAlignataVector()
            alignlib.fillAlignataCompressed(map_sbjct2query, sbjct_from, sbjct_ali, query_from, query_ali)
            if parser:
                parser(map_sbjct2query)

            for x in range(sbjct_from, sbjct_to + 1):
                y = map_sbjct2query.mapRowToCol(x)
                if y:
                    try:
                        matrix[use_row, y - 1] = 1
                    except IndexError:
                        print "IndexError in ", query_nid, sbjct_nid, x, y - 1, query_length
        else:
            yfrom = int(math.floor(query_from / resolution))
            yto = int(math.floor(query_to / resolution))
            matrix[use_row, yfrom:yto] = 1

    return matrix