def LowSimilarityRegionCodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert LowSimilarityRegion 2 GeneTree

    Each node of the cbg is added to the GeneTreeGraph under its organism
    identifier. Each node pair gets an edge weighted with the identityscore
    of the first pacbp of that pair, or with 0.0 when the pair has no edge
    or no pacbp in the cbg. The graph is completed with zero-wt edges
    before it is returned.

    @attention: function just converts, error check is not performed here!

    @type  cbg: LowSimilarityRegion
    @param cbg: LowSimilarityRegion instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    genetree = GeneTreeGraph()

    # translate each cbg node into its organism identifier
    organism_of = {}
    for cbgnode in cbg.get_nodes():
        organism = cbg._organism_from_node(cbgnode)
        genetree.add_node(organism)
        organism_of[cbgnode] = organism

    # the returned OMSR is not used in this function; the call is kept as-is
    # NOTE(review): confirm it has no required side effect before removing
    cbg.overall_minimal_spanning_range()

    for (node_a, node_b) in cbg.pairwisecrosscombinations_node():
        # 0.0 unless the edge exists AND carries at least one pacbp;
        # an edge without pacbp happens often in a lsrCBG
        weight = 0.0
        if cbg.has_edge(node_a, node_b):
            pacbps = cbg.get_pacbps_by_nodes(node1=node_a, node2=node_b)
            if pacbps:
                weight = pacbps[0].identityscore
        genetree.add_edge(organism_of[node_a], organism_of[node_b], wt=weight)

    # saturate the graph: anything still missing becomes a zero-wt edge
    genetree.makecompletegraph(wt=0.0)
    return genetree
def LowSimilarityRegionCodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert LowSimilarityRegion 2 GeneTree

    Each node of the cbg is added to the GeneTreeGraph under its organism
    identifier. Each node pair gets an edge weighted with the identityscore
    of the first pacbp of that pair, or with 0.0 when the pair has no edge
    or no pacbp in the cbg. The graph is completed with zero-wt edges
    before it is returned.

    @attention: function just converts, error check is not performed here!

    @type  cbg: LowSimilarityRegion
    @param cbg: LowSimilarityRegion instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    genetree = GeneTreeGraph()

    # translate each cbg node into its organism identifier
    organism_of = {}
    for cbgnode in cbg.get_nodes():
        organism = cbg._organism_from_node(cbgnode)
        genetree.add_node(organism)
        organism_of[cbgnode] = organism

    # the returned OMSR is not used in this function; the call is kept as-is
    # NOTE(review): confirm it has no required side effect before removing
    cbg.overall_minimal_spanning_range()

    for (node_a, node_b) in cbg.pairwisecrosscombinations_node():
        # 0.0 unless the edge exists AND carries at least one pacbp;
        # an edge without pacbp happens often in a lsrCBG
        weight = 0.0
        if cbg.has_edge(node_a, node_b):
            pacbps = cbg.get_pacbps_by_nodes(node1=node_a, node2=node_b)
            if pacbps:
                weight = pacbps[0].identityscore
        genetree.add_edge(organism_of[node_a], organism_of[node_b], wt=weight)

    # saturate the graph: anything still missing becomes a zero-wt edge
    genetree.makecompletegraph(wt=0.0)
    return genetree
def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTree

    Each node of the CBG is added to the GeneTreeGraph under its organism
    identifier. For each node pair that has an edge in the CBG, the first
    pacbp of that pair is sliced to the alignment positions that correspond
    to the OMSR of the first node of the pair. From that slice the edge
    weight (identityscore), the aa identity and the bitscore ratio are
    calculated; the nt identity is calculated from the unextended aligned
    dna sequences of the pacbp. The graph is completed with zero-wt edges
    before it is returned.

    @attention: function just converts, error check is not performed here!
    @attention: a node pair WITHOUT an edge in the CBG gets a zero-wt edge,
                a warning is printed and NO statistics are stored for it

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[node] = org

    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()

    # A failed coordinate lookup is recognized by comparing the string form
    # of the result with the string form of this exception class
    # (loop invariant, so computed only once).
    out_of_range = str(pacb.exceptions.CoordinateOutOfRange)

    for (n1, n2) in cbg.pairwisecrosscombinations_node():
        # get organism identifyers from node
        o1, o2 = cbgnode2orgnode[n1], cbgnode2orgnode[n2]

        if not cbg.has_edge(n1, n2):
            # This edge is absent in the CBG!
            # By definition, a CBG MUST HAVE ALL EDGES at this stage, so
            # report it. There is no pacbp to take statistics from: add
            # the zero-wt edge and skip the statistics, instead of reading
            # an unbound (or, worse, a previous pair's) pacbp.
            # Single-argument print(...) gives identical output as a
            # print statement and as a print function.
            print("WARNING: edge missing in CodingBlockGraph; "
                  "added as zero-wt edge without statistics")
            print(cbg)
            print("%s %s missing: %s" % (
                cbg.node_count(), cbg.edge_count(), (n1, n2)))
            gtg.add_edge(o1, o2, wt=0.0)
            continue

        # get pacbp(orf) object
        thepacbp = cbg.get_pacbps_by_nodes(node1=n1, node2=n2)[0]

        # get relative coordinates of the OMSR part of the alignment
        omsrQs = thepacbp.alignmentposition_by_query_pos(min(omsr[n1]))
        omsrQe = thepacbp.alignmentposition_by_query_pos(max(omsr[n1]))

        # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can
        # occur in freaky cases. They shouldn't, but do without discovered
        # reason. However, in the majority of cases, it is just a 1/few aa
        # offset, which can be easily corrected here.
        # The exact class name is compared on purpose: only a plain PacbP
        # takes the first branch, any other class takes the second one.
        is_plain_pacbp = thepacbp.__class__.__name__ == 'PacbP'

        if str(omsrQs) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_start)
            else:
                # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                # solve by taking orginal alignment position start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_start().query_pos)

        if str(omsrQe) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_end
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_end)
            else:
                # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                # solve by taking orginal alignment position end;
                # add +1 to create a python list range coordinate
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_end().query_pos
                ) + 1
        else:
            # omsrQe was nicely an integer;
            # add +1 because max(OMSR) is not a range coord
            omsrQe += 1

        # Wt used is identityscore == Identity + 0.5* Similarity
        identityscore = pacb.calculate_identityscore(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg.add_edge(o1, o2, wt=identityscore)

        # add additional statistics to gtg object; each one is stored
        # under both key orientations (o1,o2) and (o2,o1)

        # identitypercentage is TRUE aa indentity %
        identityperc = pacb.calculate_identity(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg._aa_identity_percentages[(o1, o2)] = identityperc
        gtg._aa_identity_percentages[(o2, o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
            thepacbp.query[omsrQs:omsrQe],
            thepacbp.sbjct[omsrQs:omsrQe],
            matrix=thepacbp.MATRIX)
        gtg._bitscore_ratios[(o1, o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2, o1)] = bitscoreratio

        # ntidentity is obviously nt identity%
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq, dnaSseq)
        gtg._nt_identity_percentages[(o1, o2)] = ntidentity
        gtg._nt_identity_percentages[(o2, o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)

    # and return this new genetree graph
    return gtg
def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTree

    Each node of the CBG is added to the GeneTreeGraph under its organism
    identifier. For each node pair that has an edge in the CBG, the first
    pacbp of that pair is sliced to the alignment positions that correspond
    to the OMSR of the first node of the pair. From that slice the edge
    weight (identityscore), the aa identity and the bitscore ratio are
    calculated; the nt identity is calculated from the unextended aligned
    dna sequences of the pacbp. The graph is completed with zero-wt edges
    before it is returned.

    @attention: function just converts, error check is not performed here!
    @attention: a node pair WITHOUT an edge in the CBG gets a zero-wt edge,
                a warning is printed and NO statistics are stored for it

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[node] = org

    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()

    # A failed coordinate lookup is recognized by comparing the string form
    # of the result with the string form of this exception class
    # (loop invariant, so computed only once).
    out_of_range = str(pacb.exceptions.CoordinateOutOfRange)

    for (n1, n2) in cbg.pairwisecrosscombinations_node():
        # get organism identifyers from node
        o1, o2 = cbgnode2orgnode[n1], cbgnode2orgnode[n2]

        if not cbg.has_edge(n1, n2):
            # This edge is absent in the CBG!
            # By definition, a CBG MUST HAVE ALL EDGES at this stage, so
            # report it. There is no pacbp to take statistics from: add
            # the zero-wt edge and skip the statistics, instead of reading
            # an unbound (or, worse, a previous pair's) pacbp.
            # Single-argument print(...) gives identical output as a
            # print statement and as a print function.
            print("WARNING: edge missing in CodingBlockGraph; "
                  "added as zero-wt edge without statistics")
            print(cbg)
            print("%s %s missing: %s" % (
                cbg.node_count(), cbg.edge_count(), (n1, n2)))
            gtg.add_edge(o1, o2, wt=0.0)
            continue

        # get pacbp(orf) object
        thepacbp = cbg.get_pacbps_by_nodes(node1=n1, node2=n2)[0]

        # get relative coordinates of the OMSR part of the alignment
        omsrQs = thepacbp.alignmentposition_by_query_pos(min(omsr[n1]))
        omsrQe = thepacbp.alignmentposition_by_query_pos(max(omsr[n1]))

        # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can
        # occur in freaky cases. They shouldn't, but do without discovered
        # reason. However, in the majority of cases, it is just a 1/few aa
        # offset, which can be easily corrected here.
        # The exact class name is compared on purpose: only a plain PacbP
        # takes the first branch, any other class takes the second one.
        is_plain_pacbp = thepacbp.__class__.__name__ == 'PacbP'

        if str(omsrQs) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_start)
            else:
                # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                # solve by taking orginal alignment position start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_start().query_pos)

        if str(omsrQe) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_end
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_end)
            else:
                # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                # solve by taking orginal alignment position end;
                # add +1 to create a python list range coordinate
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_end().query_pos
                ) + 1
        else:
            # omsrQe was nicely an integer;
            # add +1 because max(OMSR) is not a range coord
            omsrQe += 1

        # Wt used is identityscore == Identity + 0.5* Similarity
        identityscore = pacb.calculate_identityscore(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg.add_edge(o1, o2, wt=identityscore)

        # add additional statistics to gtg object; each one is stored
        # under both key orientations (o1,o2) and (o2,o1)

        # identitypercentage is TRUE aa indentity %
        identityperc = pacb.calculate_identity(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg._aa_identity_percentages[(o1, o2)] = identityperc
        gtg._aa_identity_percentages[(o2, o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
            thepacbp.query[omsrQs:omsrQe],
            thepacbp.sbjct[omsrQs:omsrQe],
            matrix=thepacbp.MATRIX)
        gtg._bitscore_ratios[(o1, o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2, o1)] = bitscoreratio

        # ntidentity is obviously nt identity%
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq, dnaSseq)
        gtg._nt_identity_percentages[(o1, o2)] = ntidentity
        gtg._nt_identity_percentages[(o2, o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)

    # and return this new genetree graph
    return gtg