Exemple #1
0
def remove_repeats_from_pacbp_list(pacbplist, overlap_ratio=0.85):
    """
    Remove pacbps that largely overlap with a better scoring pacbp

    The pacbps are ordered on their `bits` attribute (highest first) and
    all pairs of positions in this ordered list are compared with
    `ordering.overlap`. When the overlap of a pair is at least
    `overlap_ratio`, the second pacbp of the pair (`posB`) is removed.
    Pairs in which one of both pacbps is already marked for removal
    are skipped.

    @type  pacbplist: list
    @param pacbplist: list of pacbp objects (ordered on their `bits` attribute)

    @type  overlap_ratio: float
    @param overlap_ratio: overlap (as returned by `ordering.overlap`) at or
                          above which the second pacbp of a pair is removed

    @rtype:  list
    @return: ordered list with the remaining pacbps; an empty (false) input
             is returned as is

    @attention: NOTE(review): the ordering is done via `pacb.ordering` and the
                overlap via `ordering`; presumably the same module - confirm
    """
    if not pacbplist:
        return pacbplist
    # order pacbplist by `bits` attribute, highest first
    ordered = pacb.ordering.order_list_by_attribute(pacbplist,
                                                    order_by='bits',
                                                    reversed=True)
    # positions (in `ordered`) of the pacbps that must be removed;
    # a set keeps the membership tests in the all-vs-all loop O(1)
    toberemoved = set()
    # loop over the upper cross of positions in the ordered list and
    # mark `posB` when the overlap is at least `overlap_ratio`
    for (posA, posB) in recombination.pairwise(range(len(ordered))):
        if posA in toberemoved or posB in toberemoved:
            continue
        if ordering.overlap(ordered[posA], ordered[posB]) >= overlap_ratio:
            toberemoved.add(posB)
    # delete from the highest position downwards, so the positions
    # that still have to be deleted do not shift
    for pos in sorted(toberemoved, reverse=True):
        del ordered[pos]
    # return the remainder of the ordered input list
    return ordered
Exemple #2
0
def remove_repeats_from_pacbp_list(pacbplist, overlap_ratio=0.85):
    """
    Filter out pacbps that are (near) repeats of a higher scoring pacbp

    After ordering the pacbps on their `bits` attribute (highest first),
    every pair of list positions is checked with `ordering.overlap`.
    A pair with an overlap of at least `overlap_ratio` causes the second
    pacbp of the pair (`posB`) to be discarded. A pacbp that is already
    discarded is no longer compared to other pacbps.

    @type  pacbplist: list
    @param pacbplist: list of pacbp objects (ordered on their `bits` attribute)

    @type  overlap_ratio: float
    @param overlap_ratio: overlap (as returned by `ordering.overlap`) at or
                          above which the second pacbp of a pair is discarded

    @rtype:  list
    @return: ordered list with the pacbps that are kept; an empty (false)
             input is returned as is

    @attention: NOTE(review): the ordering is done via `pacb.ordering` and the
                overlap via `ordering`; presumably the same module - confirm
    """
    if not pacbplist:
        return pacbplist
    # order pacbplist by `bits` attribute, highest first
    ordered = pacb.ordering.order_list_by_attribute(pacbplist, order_by="bits", reversed=True)
    # positions (in `ordered`) of discarded pacbps; stored in a set because
    # it is queried twice for every pair of positions
    discarded = set()
    # loop over the upper cross of positions in the ordered list
    for (posA, posB) in recombination.pairwise(range(len(ordered))):
        if posA in discarded or posB in discarded:
            continue
        if ordering.overlap(ordered[posA], ordered[posB]) >= overlap_ratio:
            discarded.add(posB)
    if discarded:
        # rebuild the content in a single pass; slice assignment keeps the
        # very same list object, just like the in-place removal it replaces
        ordered[:] = [
            pacbp for pos, pacbp in enumerate(ordered) if pos not in discarded
        ]
    # return the remainder of the ordered input list
    return ordered
Exemple #3
0
    def resolve_microsyntheny(
            self,
            absolute_max_intron_nt_length=ABSOLUTE_MAX_INTRON_NT_LENGTH,
            max_intron_nt_length=MAX_INTRON_NT_LENGTH,
            max_intergenic_min_nt_length=MAX_INTERGENIC_MIN_NT_LENGTH,
            average_intergenic_min_nt_length=AVERAGE_INTERGENIC_MIN_NT_LENGTH,
            min_intergenic_nt_length=MIN_INTERGENIC_NT_LENGTH,
            apply_tcode_check=False,
            verbose=False):
        """
        Assign codingblocks outside of *the* genestructure that belong to another gene (in a syntenic region)

        Each pair of neighbouring CBGs is inspected. Based on the distances
        reported by `_cbg_intergenic_distance_analyses` (unpacked as mindist,
        maxdist and avdist), the nodes both CBGs share, the state of the
        3' cbgIF of the first CBG and, optionally, a TCODE analysis, it is
        decided if the list of CBGs is broken between both CBGs. When this
        results in more than one structure, the best one is selected with
        `_define_best_structure` and passed to `_update_best_structure_in_gsg`.

        @type  absolute_max_intron_nt_length: integer (positive)
        @param absolute_max_intron_nt_length: when maxdist is larger (and no nodes are
                shared), the structure is always broken between both CBGs

        @type  max_intron_nt_length: integer (positive)
        @param max_intron_nt_length: maximal intron size -> when maxdist is larger, the
                structure is broken unless the cbgIF is optimal and the next CBG does
                not have all its starts upstream of its omsr

        @type  max_intergenic_min_nt_length: integer (positive)
        @param max_intergenic_min_nt_length: represents the upper border of a small
                intergenic region; compared to maxdist, in combination with
                `min_intergenic_nt_length` compared to mindist (the ``grey zone``)

        @type  average_intergenic_min_nt_length: integer (positive)
        @param average_intergenic_min_nt_length: when avdist is larger, the distance is
                treated as very likely intergenic

        @type  min_intergenic_nt_length: integer (positive)
        @param min_intergenic_nt_length: minimum intergenic size -> when maxdist is not
                larger than this, both CBGs are kept in a single gene

        @type  apply_tcode_check: Boolean
        @param apply_tcode_check: perform a TCODE analysis (`intergenecity_tcode_analyses`)
                on the cases that are not decided by any of the distance thresholds

        @type  verbose: Boolean
        @param verbose: print diagnostic information to STDOUT

        @attention: *the* genestructure is supposed to be the most central one in case of doubt
        @attention: the thresholds are checked in a fixed order; the first one that applies decides
        """

        # List of lists of position id's of CBGs that together are (part of) a gene.
        # When a distance is large enough for an intergenic region, the structure will
        # be something like [ [ 0,1,2], [3,4,5,6,7] ]. The first 3 CBGs are assigned to
        # another, frontal placed gene.
        structures = []
        for pos in range(0, len(self) - 1):
            this = self.codingblockgraphs[pos]
            # NOTE(review): `next` shadows the builtin next() within this method
            next = self.codingblockgraphs[pos + 1]
            # presumably the minimal, maximal and average distance between
            # both CBGs - TODO confirm in _cbg_intergenic_distance_analyses
            (mindist, maxdist,
             avdist) = _cbg_intergenic_distance_analyses(this, next)
            # check which nodes are mutual
            mutual = this.mutual_nodes(next)

            ########################################################
            if verbose:
                print pos, pos + 1,
                print "%s - %s - %s" % (mindist, avdist, maxdist),
                print mutual, "cbgIFopt?:",
                # NOTE(review): the bare except hides any error, not only a
                # missing cbgIF; only used for the verbose output
                try:
                    print this._CBGinterface3p.optimalitycheck()
                except:
                    print "NO cbgIF!!"
            ########################################################

            # Each `break` below closes the current structure at `pos`: it runs
            # from the position after the previous structure (or from 0 for
            # the first one) up to and including `pos`.
            if len(mutual) >= 1:
                # orf node(s) are shared -> this MUST be the same gene!
                pass
            elif maxdist > absolute_max_intron_nt_length:
                # largest distance is larger than the largest possible intron
                if not structures: structures.append(range(0, pos + 1))
                else: structures.append(range(structures[-1][-1] + 1, pos + 1))
            elif maxdist > max_intron_nt_length:
                # largest distance is larger than what is likely a very large intron
                # in this case: check the cbgIF
                if this._CBGinterface3p and this._CBGinterface3p.is_optimal() and\
                not next.have_all_starts_upstream_of_omsr():
                    # interface is optimal and 2nd CBG is not a first TSS CBG.
                    # do not break the genestructure here.
                    pass
                else:
                    # break the genestructure in 2 parts
                    if not structures: structures.append(range(0, pos + 1))
                    else:
                        structures.append(
                            range(structures[-1][-1] + 1, pos + 1))
            elif mindist > min_intergenic_nt_length and maxdist > max_intergenic_min_nt_length:
                # the ``grey zone``, but very likely to be intergenic
                # keep both CBGs together only when the cbgIF is compatible, has at
                # least 2 True values in its optimalitycheck and the next CBG does
                # not have all its starts upstream of its omsr
                if this._CBGinterface3p and this._CBGinterface3p.is_compatible() and\
                this._CBGinterface3p.optimalitycheck().count(True) >= 2 and\
                not next.have_all_starts_upstream_of_omsr():
                    pass
                else:
                    if not structures: structures.append(range(0, pos + 1))
                    else:
                        structures.append(
                            range(structures[-1][-1] + 1, pos + 1))
            elif avdist > average_intergenic_min_nt_length:
                # very likely intergenic!
                # same cbgIF based exception as in the ``grey zone`` case above
                if this._CBGinterface3p and this._CBGinterface3p.is_compatible() and\
                this._CBGinterface3p.optimalitycheck().count(True) >= 2 and\
                not next.have_all_starts_upstream_of_omsr():
                    pass
                else:
                    if not structures: structures.append(range(0, pos + 1))
                    else:
                        structures.append(
                            range(structures[-1][-1] + 1, pos + 1))
            elif maxdist <= min_intergenic_nt_length:
                # largest distance is smaller than ``smallest possible`` intergenic region
                pass
            else:
                # all other cases...many of these are spurious intergenic
                # but can as well be putative long introns. Do, if requested
                # for, the tcode_check. If not-> pass (and keep in the GSG)
                if apply_tcode_check:
                    # calculate TCODE averages: sum the data per position over
                    # all organisms and divide by the number of organisms
                    # assumes each data item has (at most) 4 elements - TODO confirm
                    tcodescore = intergenecity_tcode_analyses(
                        this, next, self.input)
                    averages = [0.0, 0.0, 0.0, 0.0]
                    for org, data in tcodescore.iteritems():
                        for i in range(0, len(data)):
                            averages[i] += data[i]
                    averages = [
                        item / len(tcodescore.keys()) for item in averages
                    ]
                    # average ratio between the omsr distances of pairs of organisms
                    # NOTE(review): a zero distance, or presumably less than 2
                    # distances (no pairs), raises a ZeroDivisionError - confirm
                    omsrdists = this.omsr_distance_between_codingblocks(next)
                    lengthvar = [
                        float(a) / float(b)
                        for a, b in recombination.pairwise(omsrdists.values())
                    ]
                    lengthvar = sum(lengthvar) / len(lengthvar)
                    ########################################################
                    if verbose:
                        for org, data in tcodescore.iteritems():
                            print org, "\t", data
                        print "AVER:\t", averages, "*graAV:",
                        print this._stopcodongraph.average_weight(),
                        print "lenghtvar:", lengthvar
                        print this._CBGinterface3p
                    ########################################################

                    # break only when both TCODE averages are below their maxima,
                    # the stopcodongraph average weight is high enough and there
                    # is no (compatible) cbgIF; `lengthvar` is only reported above
                    if averages[1] < INTERGENIC_TCODE_MAX_LOWEST_WINDOW and\
                    averages[2] < INTERGENIC_TCODE_MAX_AVERAGE_WINDOW and\
                    this._stopcodongraph.average_weight() >=\
                    INTERGENIC_TCODE_MIN_STGRA_AV_WT and\
                    (not this._CBGinterface3p or not this._CBGinterface3p.is_compatible()):
                        # yes, a somewhat less obvious intergenic boundary
                        if not structures: structures.append(range(0, pos + 1))
                        else:
                            structures.append(
                                range(structures[-1][-1] + 1, pos + 1))
                    else:
                        pass
                else:
                    pass
        else:
            # for-else: the loop contains no `break` statement, so this clause
            # always runs once the loop is done; it closes the final structure,
            # running up to the last CBG
            if not structures: structures.append(range(0, len(self)))
            else: structures.append(range(structures[-1][-1] + 1, len(self)))

        # now deal with the results
        if len(structures) > 1:
            # hmmmm... most likely a - rare(!?) - region of microsynteny
            # in all species. Get the best structure out of these
            best_structure = self._define_best_structure(structures,
                                                         verbose=verbose)
            self._update_best_structure_in_gsg(best_structure)
            ####################################################
            if verbose:
                print "microsyntheny", structures,
                print "MAX-intron:", max_intron_nt_length
                print "BEST:", best_structure