コード例 #1
0
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,GTG,exclude_annotated=True,verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0,posFirst)
        range_3p_test = range(posFinal+1,len(inwpcbgs))
        protected_target_orfid_list = []
        for inwpCBG in inwpcbgs[posFirst:posFinal+1]:
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0:
                protected_target_orfid_list.append( inwpCBG.get_orfs_of_graph(organism=target)[0].id )
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    ############################################################################
    if verbose and exclude_annotated:
        print "NOT-excluded:", range_5p_test, range_3p_test
    ############################################################################

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0,len(inwpcbgs)):
        if exclude_annotated and pos in range_5p_test:
            pass
        elif exclude_annotated and pos in range_3p_test:
            pass
        elif exclude_annotated and inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() == 0:
            # in the middle of the annotated geen structure, but not a single
            # Orf annotated as an exon. Asses for gtg difference too!
            pass
        elif exclude_annotated:
            continue
        else:
            pass


        # get this inwpCBG and 
        thisInwpCBG = inwpcbgs[pos]

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and\
        thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\
        protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\
        thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG,target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg,GTG,target)

        if difference < NONGENE_GTG_MAX_DIFFERENCE:
            # step 2. Do the CEXPANDER check
            if thisInwpCBG.node_count() <= 2:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference,NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference,NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            else:
                # cexpander check is succesfull, GTGdifference claims
                # the aligment is bogus. Do a more elaborate check on
                # some other variables of thisInwpCBG

                # calculate the difference between minsr & maxsr lengths
                node      = thisInwpCBG.get_organism_nodes(target)[0]
                minsr     = thisInwpCBG.minimal_spanning_range_sizes()[node]
                maxsr     = thisInwpCBG.maximal_spanning_range_sizes()[node]
                msr_ratio = float(minsr)/float(maxsr)

                # calculate the ratio between average weights of gtg and GTG
                average_wt_gtg = _pairwise_gtg_average_weight(gtg,target)
                average_wt_GTG = _pairwise_gtg_average_weight(GTG,target)
                gtg_ratio = average_wt_gtg / average_wt_GTG

                if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\
                gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO:
                    gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                    ################################################################
                    if verbose:
                        print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                            difference,NONGENE_GTG_MAX_DIFFERENCE),
                        print thisInwpCBG.get_organism_nodes(target)[0]
                    ################################################################
                else:
                    pass
        else:
            pass


    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
コード例 #2
0
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,
                                   GTG,
                                   exclude_annotated=True,
                                   verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0, posFirst)
        range_3p_test = range(posFinal + 1, len(inwpcbgs))
        protected_target_orfid_list = []
        for inwpCBG in inwpcbgs[posFirst:posFinal + 1]:
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0:
                protected_target_orfid_list.append(
                    inwpCBG.get_orfs_of_graph(organism=target)[0].id)
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    ############################################################################
    if verbose and exclude_annotated:
        print "NOT-excluded:", range_5p_test, range_3p_test
    ############################################################################

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0, len(inwpcbgs)):
        if exclude_annotated and pos in range_5p_test:
            pass
        elif exclude_annotated and pos in range_3p_test:
            pass
        elif exclude_annotated and inwpcbgs[
                pos].count_orfs_labeled_as_annotated_exon() == 0:
            # in the middle of the annotated geen structure, but not a single
            # Orf annotated as an exon. Asses for gtg difference too!
            pass
        elif exclude_annotated:
            continue
        else:
            pass

        # get this inwpCBG and
        thisInwpCBG = inwpcbgs[pos]

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and\
        thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\
        protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\
        thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG, target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg, GTG, target)

        if difference < NONGENE_GTG_MAX_DIFFERENCE:
            # step 2. Do the CEXPANDER check
            if thisInwpCBG.node_count() <= 2:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference, NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference, NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            else:
                # cexpander check is succesfull, GTGdifference claims
                # the aligment is bogus. Do a more elaborate check on
                # some other variables of thisInwpCBG

                # calculate the difference between minsr & maxsr lengths
                node = thisInwpCBG.get_organism_nodes(target)[0]
                minsr = thisInwpCBG.minimal_spanning_range_sizes()[node]
                maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node]
                msr_ratio = float(minsr) / float(maxsr)

                # calculate the ratio between average weights of gtg and GTG
                average_wt_gtg = _pairwise_gtg_average_weight(gtg, target)
                average_wt_GTG = _pairwise_gtg_average_weight(GTG, target)
                gtg_ratio = average_wt_gtg / average_wt_GTG

                if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\
                gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO:
                    gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                    ################################################################
                    if verbose:
                        print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                            difference, NONGENE_GTG_MAX_DIFFERENCE),
                        print thisInwpCBG.get_organism_nodes(target)[0]
                    ################################################################
                else:
                    pass
        else:
            pass

    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
コード例 #3
0
def detect_and_remove_gtgdiscrepancy(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET)-1)/2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count,gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,target,identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,target,identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode,iNode),wt in btGTG.weights.iteritems():
        if tNode==target: bitscore_ordered_nodes.append( ( wt, iNode ) )
    bitscore_ordered_nodes.sort() 
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt,informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose: print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes()

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(), 
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target,node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(inwpcbgs,ntGTG)


    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [ str(gtgdiscrCBG) for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list ]:
                    gtgdiscrepancy_inwpcbg_list.append( inwpcbg )

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append( str(discrinwpCBG) )
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append( inwpcbg )

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)


    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG,key)


    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
コード例 #4
0
def detect_and_remove_gtgdiscrepancy(inwpcbgs,
                                     PCG,
                                     GENE_IDENTIFIER_SET,
                                     verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) / 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode, iNode), wt in btGTG.weights.iteritems():
        if tNode == target: bitscore_ordered_nodes.append((wt, iNode))
    bitscore_ordered_nodes.sort()
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes(
            )

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(),
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target, node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
        inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
        inwpcbgs, ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [
                        str(gtgdiscrCBG)
                        for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list
                ]:
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append(str(discrinwpCBG))
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append(inwpcbg)

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
                                             blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG