Exemple #1
0
def interval_intersect_interval(**kwargs):
    """
    Efficient algorithm to find which intervals intersect

    Handles both unix timestamp or datetime object. Every interval is an
    indexable ``(begin, end)`` pair.

    Parameters
    ----------
    groundtruth:
        iterable of labelled (ground-truth) intervals, required keyword
    prediction:
        iterable of predicted intervals, required keyword

    Return:
    -------
    dict with the following keys

    prediction_gt:
        list with same size as prediction,
        will be 1 if there's an overlapping label
        0 if not
    recall_gt:
        list with same size as groundtruth,
        will be 1 if there's an overlapping prediction
        0 if not
    recall:
        mean of recall_gt (NaN when groundtruth is empty)
    precision:
        mean of prediction_gt (NaN when prediction is empty)

    Raises
    ------
    KeyError
        if ``groundtruth`` or ``prediction`` is not supplied
    """

    gt = kwargs['groundtruth']
    pred = kwargs['prediction']

    def _hit_flags(queries, targets):
        # One 1/0 flag per query interval: does it overlap any target?
        tree = IntervalTree()
        for segment in targets:
            tree.add(Interval(segment[0], segment[1]))
        # Slice queries return the set of overlapping intervals and, unlike
        # search(), are available in both intervaltree 2.x and 3.x.
        return [1 if tree[segment[0]:segment[1]] else 0
                for segment in queries]

    def _mean_or_nan(flags):
        # np.mean([]) is NaN as well, but emits a RuntimeWarning.
        return np.mean(flags) if flags else np.nan

    # recall: which labels are hit by at least one prediction
    recall_gt = _hit_flags(gt, pred)
    # precision: which predictions hit at least one label
    prediction_gt = _hit_flags(pred, gt)

    return {'prediction_gt': prediction_gt,
            'recall_gt': recall_gt,
            'recall': _mean_or_nan(recall_gt),
            'precision': _mean_or_nan(prediction_gt)}
Exemple #2
0
    def mergeSegments(self,segs1,segs2,ignoreInsideEnvelope=True):
        """ Given two segmentations of the same file, return the merged set of them.

        Every segment of segs1 is inserted into an interval tree; each segment
        of segs2 is then either added, merged with what it overlaps, or dropped.

        Two similar segments should be replaced by their union.
        Those that are inside another should be removed (?) or the too-large one deleted?
        If ignoreInsideEnvelope is true this is the first of those, otherwise the second.

        segs1, segs2: sequences of segments indexable as s[0] (start) and s[1] (end).
            Segments of segs2 must be mutable: their bounds are widened in place.
        ignoreInsideEnvelope: only consulted when the strict search below finds
            something; when it is False in that case, the segs2 segment is
            skipped without touching the tree.

        Returns a list of [start, end] pairs, one per interval left in the tree.
        """

        from intervaltree import Interval, IntervalTree
        t = IntervalTree()

        # Put the first set into the tree
        for s in segs1:
            t[s[0]:s[1]] = s

        # Decide whether or not to put each segment in the second set in
        for s in segs2:
            overlaps = t.search(s[0],s[1])
            # If there are no overlaps, add it
            if len(overlaps)==0:
                t[s[0]:s[1]] = s
            else:
                # Search for any enveloped, if there are remove and add the new one
                # (strict=True presumably restricts the result to intervals lying
                # completely inside [s[0], s[1]] -- confirm against the installed
                # intervaltree version)
                envelops = t.search(s[0],s[1],strict=True)
                if len(envelops) > 0:
                    if ignoreInsideEnvelope:
                        # Remove any inside the envelope of the test point
                        t.remove_envelop(s[0],s[1])
                        overlaps = t.search(s[0], s[1])
                        #print s[0], s[1], overlaps
                        # Open out the region, delete the other
                        # NOTE: s[0]/s[1] are assigned here, so the caller's
                        # segs2 entry is modified.
                        # NOTE(review): if one interval o sticks out on both sides,
                        # both branches run and t.remove(o) is called twice --
                        # verify that the second call cannot fail.
                        for o in overlaps:
                            if o.begin < s[0]:
                                s[0] = o.begin
                                t.remove(o)
                            if o.end > s[1]:
                                s[1] = o.end
                                t.remove(o)
                        t[s[0]:s[1]] = s
                else:
                    # Check for those that intersect the ends, widen them out a bit
                    # Each overlapping interval o is replaced by the span joining
                    # it with s; the stored value is a (start, end) tuple here,
                    # not the original segment object.
                    # NOTE(review): both conditions can hold for the same o
                    # (o inside s), which inserts two intervals and removes o twice.
                    for o in overlaps:
                        if o.begin > s[0]:
                            t[s[0]:o[1]] = (s[0],o[1])
                            t.remove(o)
                        if o.end < s[1]:
                            t[o[0]:s[1]] = (o[0],s[1])
                            t.remove(o)

        # Flatten the tree back into plain [start, end] lists
        segs = []
        for a in t:
            segs.append([a[0],a[1]])
        return segs
Exemple #3
0
def get_multilines(spans):
    """Yield one Multiline per distinct chunk of the given spans.

    ``spans`` is an iterable of ``(start, stop, type)`` triples. A Line is
    created for every span, each Line then receives a level from
    ``get_free_level`` based on the lines overlapping it, the intervals are
    split at their overlaps, and the resulting pieces are grouped by their
    ``(start, stop)`` bounds. Groups are yielded in sorted order of their
    bounds, with the lines of a group ordered by level.
    """
    tree = Intervals()
    created = []
    for begin, end, kind in spans:
        item = Line(begin, end, kind, level=None)
        tree.addi(begin, end, item)
        created.append(item)

    # levels are assigned only after every line is in the tree
    for item in created:
        item.level = get_free_level(tree.search(item.start, item.stop))

    # cut the intervals at every overlap boundary
    tree.split_overlaps()

    # collect the lines that share the same chunk
    buckets = defaultdict(list)
    for begin, end, item in tree:
        buckets[begin, end].append(item)

    for bounds in sorted(buckets):
        begin, end = bounds
        ordered = sorted(buckets[bounds], key=lambda entry: entry.level)
        yield Multiline(begin, end, ordered)
def point_intersect_interval(points, df_interval):
    """Flag which points fall inside an interval and which intervals are hit.

    Parameters
    ----------
    points:
        sequence of points (same comparable type as the interval bounds)
    df_interval:
        dataframe with columns 'start' and 'end', one interval per row

    Return:
    -------
    dict with the keys

    points_gt:
        boolean array shaped like ``points``, True where the point lies
        inside at least one interval
    interval_gt:
        list of bool, one per row of ``df_interval``, True where the
        interval contains at least one point
    """
    n_intervals = df_interval.shape[0]
    starts = df_interval['start']
    ends = df_interval['end']

    # store the row position of each interval as the value of the interval;
    # .iloc keeps this positional even when the dataframe index is not 0..n-1
    tree = IntervalTree()
    for i in range(n_intervals):
        tree[starts.iloc[i]:ends.iloc[i]] = i

    points_gt = np.zeros_like(points).astype(bool)
    interval_gt = [False] * n_intervals

    # enumerate() walks the points positionally, matching points_gt
    for i, point in enumerate(points):
        # tree[point] is the point query available in intervaltree 2.x and
        # 3.x (search() no longer exists in 3.x)
        intersection = tree[point]
        if intersection:
            points_gt[i] = True
            for segment in intersection:
                interval_gt[segment.data] = True

    return {'points_gt': points_gt,
            'interval_gt': interval_gt}
def interval_intersect_interval(**kwargs):
    """Determine label of each segmentation based on
    intersection with labels

    Parameters
    ----------
    groundtruth:      dataframe containing columns 'Start', 'End', 'Label'
    segmentation:     dataframe containing columns 'Start', 'End'

    Return:
    -------
    label_segmentation:   list of labels, one per segment that overlaps at
        least one groundtruth interval (segments without any overlap are
        skipped, so this can be shorter than segmentation)
    index_covered:        list of row positions in segmentation, parallel to
        label_segmentation, telling which segment each label belongs to

    """

    gt = kwargs['groundtruth']
    segmentation = kwargs['segmentation']

    # the row position of each label is stored as the interval's data;
    # .iloc keeps it positional, matching the .iloc lookups below
    tree = IntervalTree()
    for i in range(gt.shape[0]):
        tree.add(Interval(gt['Start'].iloc[i], gt['End'].iloc[i], i))

    label_segmentation = []
    index_covered = []
    n_segments = segmentation.shape[0]

    for i in range(n_segments):
        interval = (segmentation.Start.iloc[i], segmentation.End.iloc[i])

        # slice query: works in intervaltree 2.x and 3.x
        overlapping_labels = sorted(tree[interval[0]:interval[1]])

        if not overlapping_labels:
            logger.debug("A segment does not have overlapping groundtruth")
            continue

        if len(overlapping_labels) == 1:
            # only one overlapping label
            best = overlapping_labels[0]
        else:
            # majority voting:
            # if there are multiple overlapping labels,
            # select the one with largest overlap
            overlap_time = [_get_overlap(label, interval)
                            for label in overlapping_labels]
            best = overlapping_labels[int(np.argmax(np.array(overlap_time)))]

        label_segmentation.append(gt['Label'].iloc[best.data])
        index_covered.append(i)

    # The share of unlabelled segments is relative to ALL segments; dividing
    # by len(label_segmentation) always gave 0% (or a ZeroDivisionError when
    # nothing was covered).
    if n_segments:
        uncovered = 100 * (1 - len(index_covered) / float(n_segments))
    else:
        uncovered = 0.0
    logger.info("Percentage of segments that do not have label: {}%".format(
        uncovered))

    return label_segmentation, index_covered
Exemple #6
0
  def get_merged_variants(self, variants, key=None):
    # type: (List[vcfio.Variant], str) -> Iterable[vcfio.Variant]
    """Yields the merged variants followed by the split non-variant records.

    Every input variant is first passed to ``_align_with_window``. Records
    for which ``_is_non_variant`` is true are collected in an interval tree;
    all others are grouped by their first merge key. Both collections are
    merged, each merged variant absorbs the calls of one overlapping
    non-variant record (if any), and finally the non-variant records are
    split around the variants that absorbed calls.
    """
    reference_blocks = IntervalTree()
    by_merge_key = collections.defaultdict(list)
    for variant in variants:
      self._align_with_window(variant, key)
      if self._is_non_variant(variant):
        reference_blocks.addi(variant.start, variant.end, variant)
        continue
      merge_key = next(self._move_to_calls.get_merge_keys(variant))
      by_merge_key[merge_key].append(variant)

    merged_non_variants = self._merge_non_variants(reference_blocks)
    merged_variants = self._merge_variants(by_merge_key)

    # Rebuild the tree so that it only holds the merged non-variant records.
    reference_blocks.clear()
    for block in merged_non_variants:
      reference_blocks.addi(block.start, block.end, block)

    splits = IntervalTree()
    for variant in merged_variants:
      hits = reference_blocks.search(variant.start, variant.end)
      if hits:
        # Only one (arbitrary) overlapping record donates its calls.
        donor = next(iter(hits)).data
        variant.calls.extend(donor.calls)
        variant.calls = sorted(variant.calls)
        self._update_splits(splits, variant)
      yield variant

    for remainder in self._split_non_variants(reference_blocks, splits):
      yield remainder
Exemple #7
0
def test_brackets_vs_search():
    """Slice lookup and search() must return the same set for every interval."""
    tree = IntervalTree()
    for begin, end, label in ((1, 3, "dude"), (2, 4, "sweet"), (6, 9, "rad")):
        tree.addi(begin, end, label)

    for interval in tree:
        via_slice = tree[interval.begin:interval.end]
        via_search = tree.search(interval.begin, interval.end)
        assert via_slice == via_search
Exemple #8
0
def point_intersect_interval(points, df_interval):
    """
    Mark the points lying inside an interval and the intervals holding a point.

    Expect both points and df_interval to be datetime object.
    df_interval provides the columns 'start' and 'end'.

    Returns a dict with 'points_gt' (boolean array shaped like points) and
    'interval_gt' (list of bool, one entry per row of df_interval).
    """
    n_intervals = df_interval.shape[0]

    # each interval carries its own row position as payload
    tree = IntervalTree()
    for row in range(n_intervals):
        begin = df_interval['start'].iloc[row]
        end = df_interval['end'].iloc[row]
        tree[begin:end] = row

    interval_gt = [False] * n_intervals
    points_gt = np.zeros_like(points).astype(bool)

    for k in range(len(points)):
        hits = tree.search(points[k])
        points_gt[k] = len(hits) != 0
        for hit in hits:
            interval_gt[hit.data] = True

    return {'points_gt': points_gt,
            'interval_gt': interval_gt}
Exemple #9
0
    def countIdealOverlaps(self, nodes):
        """Record on every node which nodes overlap its ideal extent.

        Each node's span from idealLeft() to idealRight() is put into an
        interval tree; afterwards every node gets ``overlaps`` (the nodes
        found for its own span) and ``overlapCount`` (how many were found).
        """
        tree = IntervalTree()
        for current in nodes:
            tree.addi(current.idealLeft(), current.idealRight(), data=current)

        for current in nodes:
            left = current.idealLeft()
            right = current.idealRight()
            hits = tree.search(left, right)
            current.overlaps = [hit.data for hit in hits]
            current.overlapCount = len(hits)
Exemple #10
0
    def countIdealOverlaps(self, nodes):
        """Annotate each node with the nodes sharing its ideal horizontal span.

        All spans (idealLeft() to idealRight()) are indexed in an interval
        tree first; each node then stores the query result for its own span
        in ``overlaps`` and the number of results in ``overlapCount``.
        """
        index = IntervalTree()
        for item in nodes:
            index.addi(item.idealLeft(), item.idealRight(), data=item)

        for item in nodes:
            found = index.search(item.idealLeft(), item.idealRight())
            item.overlaps = [interval.data for interval in found]
            item.overlapCount = len(found)
Exemple #11
0
def original_print():
    """Print the results of a tuple-indexed lookup next to search().

    The first loop indexes the tree with a ``(begin, end)`` tuple instead of
    a slice on purpose, to show the faulty call; the second loop prints what
    search() returns for the same bounds.
    """
    tree = IntervalTree()
    for begin, end, label in ((1, 3, "dude"), (2, 4, "sweet"), (6, 9, "rad")):
        tree.addi(begin, end, label)

    for interval in tree:
        print(tree[interval.begin, interval.end])  # set(), should be using :

    for interval in tree:
        print(tree.search(interval.begin, interval.end))
Exemple #12
0
def find_overlapping_intervals(
        intervals: t.List[Interval]) -> t.List[Interval]:
    """
    Return any (but possibly not all) overlapping intervals.

    The intervals are loaded into a tree and each one is used as a query in
    turn. The first query that finds more than one interval (i.e. more than
    the queried interval itself) ends the search and its result is returned
    as-is, which is the set produced by the tree. An empty list is returned
    when no such query exists.
    """
    tree = IntervalTree(intervals)
    for candidate in tree:
        hits = tree.search(candidate)
        if len(hits) <= 1:
            continue
        return hits
    return []
Exemple #13
0
def get_overlapping_intervals(ranges_a, ranges_b):
    """
    Return a sorted list of the intersections between two sets of ranges.

    The longer of the two inputs is indexed in an interval tree (ranges_a
    when both have the same length); every range of the other input is used
    as a query, and each hit is combined with the query through _intersect.
    """
    if len(ranges_a) >= len(ranges_b):
        indexed, probes = ranges_a, ranges_b
    else:
        indexed, probes = ranges_b, ranges_a

    tree = IntervalTree()
    for bounds in indexed:
        tree.add(Interval(bounds[0], bounds[1]))

    found = []
    for probe in probes:
        for hit in tree.search(probe[0], probe[1]):
            found.append(_intersect(hit, probe))

    return sorted(found)
Exemple #14
0
def get_multilines(spans):
    """Yield one Multiline per distinct chunk of the given spans.

    ``spans`` is an iterable of ``(start, stop, type)`` triples and is
    processed in sorted order: for each span the lines already in the tree
    that overlap it are handed to ``get_free_level``, and a Line carrying the
    returned level is inserted. The intervals are then split at their
    overlaps and grouped by ``(start, stop)``; groups are yielded in sorted
    order of their bounds with their lines sorted.
    """
    tree = Intervals()
    for begin, end, kind in sorted(spans):
        occupied = tree.search(begin, end)
        free_level = get_free_level(occupied)
        tree.addi(begin, end, Line(begin, end, kind, free_level))

    # cut the intervals at every overlap boundary
    tree.split_overlaps()

    # collect the lines that share the same chunk
    buckets = defaultdict(list)
    for begin, end, item in tree:
        buckets[begin, end].append(item)

    for bounds in sorted(buckets):
        begin, end = bounds
        yield Multiline(begin, end, sorted(buckets[bounds]))
Exemple #15
0
def read_rttm(input_path):
    """Read a RTTM file indicating gold diarization

    Every non-blank line must consist of exactly nine whitespace separated
    fields:

        SPEAKER fname 1 onset duration <NA> <NA> spkr <NA>

    Zero-length intervals are dropped silently, negative durations are
    reported on stdout and dropped. Each remaining interval is passed,
    together with the already stored intervals it overlaps, to
    ``remove_overlap``; the intervals that call returns are stored unless
    they are empty.

    :param input_path: path of the RTTM file
    :return: tuple ``(sad, fname)`` with the interval tree of the stored
        intervals and the file name field of the last parsed line ("" when
        the file has no content lines)
    :raises ValueError: if a non-blank line does not have nine fields or its
        onset/duration is not a number
    """
    sad = IntervalTree()
    fname = ""
    with open(input_path, 'r') as fin:
        for line in fin:
            if not line.strip():
                # a blank line (typically a trailing one) carries no
                # interval and would break the nine-field unpacking below
                continue

            _, fname, _, onset, dur, _, _, _, _ = line.strip('\n').split()
            onset = float(onset)
            dur = float(dur)
            if dur == 0:
                # Remove empty intervals
                continue
            elif dur < 0:
                print(
                    "{} shows an interval with negative duration."
                    " Please inspect file, this shouldn't happen".format(line))
                continue

            interval = Interval(onset, onset + dur)

            # Search for intervals already added that overlap with current
            # interval. If we find some, then we truncate the current
            # interval to remove all overlaps.
            # (slice query: available in intervaltree 2.x and 3.x)
            ov = sad[interval.begin:interval.end]
            interval, other_intervals = remove_overlap(ov, interval)
            if interval[0] == interval[1]:
                # continue if interval was removed
                continue

            sad.add(interval)

            # if other_intervals is not empty, add these intervals to tree
            for new_interv in other_intervals:
                if new_interv[0] == new_interv[1]:
                    # continue if interval was removed
                    continue

                sad.add(new_interv)

    return sad, fname
Exemple #16
0
class GenomeAnnotation(object):
    """
    represents a genbank file
    and allows to efficiently annotate
    positions of interest
    """
    COLUMNS = [
        "type", "name", "locus", "product", 'protein_id', "strand", "start",
        "end"
    ]

    def __init__(self, genbank_file):
        """
        initializes the GenomeAnnotation object

        Parses the genbank file into the interval tree and the lookup
        dictionaries, then builds a sorted index of all CDS/gene boundaries
        that find_closest_gene() searches.

        :param genbank_file: a path to a genbank file
        """
        self.genome_tree = IntervalTree()
        self.gene_dic = {}
        self.locus_dic = {}
        self.type_dic = {}
        self.genome_id = None
        self.length = None
        self.__read_genbank(genbank_file)

        # internal data structure for quick internal nearest gene search if position is not annotated
        # .get() instead of [...]: a file may hold no CDS at all, and "gene"
        # features only end up in type_dic when no other feature shares their
        # locus, so either key can legitimately be missing.
        boundaries = []
        for v in (self.type_dic.get("CDS", []) + self.type_dic.get("gene", [])):
            boundaries.extend([(v.start, v), (v.end, v)])

        # two parallel lists: sorted boundary positions and the entry owning
        # each boundary
        boundaries.sort(key=lambda x: x[0])
        self.__index_list = [pos for pos, _ in boundaries]
        self.__cds_list = [cds for _, cds in boundaries]

    def __read_genbank(self, genbank_file):
        """
        reads the genbank file and stores its content in a interval tree
        and other searchable containers for efficient querying

        The set of feature types to collect is taken from the "Fe..." line(s)
        of the COMMENT block. In the FEATURES block every line that starts
        with one of those types and carries a "a..b" location opens a new
        feature; its /locus..., /product, /gene and /protein_id qualifiers
        are gathered from the following lines. A feature is stored only if a
        locus was found for it. Features of type "gene" are stored last and
        only when no other feature uses the same locus.

        :param genbank_file: a path to a genbank file
        """
        pseudogenes = []
        my_type = name = locus = product = product_id = None
        strand = start = end = None

        def flush_pending():
            # Turn the qualifiers gathered so far into a GenomeEntry and
            # register it. Shared by the "next feature starts" and the
            # "end of file" case so that both store entries identically.
            entry = GenomeEntry(my_type, name, locus, product, product_id,
                                strand, start, end)
            # gene annotations are parked and processed after parsing
            if my_type == "gene":
                pseudogenes.append(entry)
                return

            if start > end:
                # feature wraps around the origin: store it as two pieces
                self.genome_tree.addi(start, self.length, entry)
                self.genome_tree.addi(0, end, entry)
            else:
                self.genome_tree.addi(start, end, entry)

            self.locus_dic[locus] = entry
            self.type_dic.setdefault(my_type, []).append(entry)

            if name is not None:
                self.gene_dic[name] = entry

        with open(genbank_file, "r") as f:
            annotated_features = set()

            # states
            gathering = False
            comment_block = False
            annotation_block = False
            for l in f:
                # skip empty lines
                if l.strip() == "":
                    continue

                splits = l.split()

                if splits[0].startswith("LOCUS"):
                    self.genome_id = splits[1].strip()
                    self.length = int(splits[2].strip())

                # are we at the end of the annotation block?
                if splits[0].startswith("ORIGIN"):
                    break

                # check for parsing stage
                if splits[0].startswith("COMMENT"):
                    comment_block = True

                if splits[0].startswith("FEATURES"):
                    annotation_block = True
                    comment_block = False

                # COMMENT block feature annotation
                if comment_block and splits[0].startswith("Fe"):

                    gathering = True
                    for an in splits[3:]:
                        if not an.startswith("Gene"):
                            annotated_features.add(an.split(";")[0])
                        else:
                            annotated_features.add("gene")

                # FEATURES Block here we found an entry that we want to gather
                if annotation_block and splits[
                        0] in annotated_features and ".." in splits[1]:

                    # first add already gathered entry into data structures;
                    # the gathered state is reset only when an entry was
                    # actually stored
                    if locus is not None:
                        flush_pending()
                        my_type = name = locus = product = product_id = None
                        strand = start = end = None

                    gathering = True
                    my_type = splits[0]
                    # determine strand, start and end

                    if splits[1].startswith('comp'):
                        # strip() removes the characters of "complement()"
                        # from both ends, leaving "a..b"
                        interval = splits[1].strip('complement()')
                        strand = '-'
                    else:
                        interval = splits[1]
                        strand = '+'
                    # genbank positions are 1-based
                    start, end = (int(x) - 1 for x in interval.split('..'))
                    # TODO: this has to be fixed in the genbank file
                    if start == end:
                        end += 1

                # gather annotated elements
                if gathering:

                    # if we are in the comment block then we are gathering annotated features
                    if comment_block:
                        if "::" in splits:
                            gathering = False
                        else:
                            for s in splits:
                                annotated_features.add(s.split(";")[0])

                    # if we are in the annotation block then we gather infos distributed over multiple lines
                    if annotation_block:
                        if splits[0].startswith("/locus"):
                            locus = l.split("=")[-1].replace('"', '').replace(
                                "_", "").strip()
                        elif splits[0].startswith("/product"):
                            product = l.split("=")[-1].replace('"', '').strip()
                        elif splits[0].startswith("/gene"):
                            name = l.split("=")[-1].replace('"', '').strip()
                        elif splits[0].startswith("/protein_id"):
                            product_id = l.split("=")[-1].replace('"',
                                                                  '').strip()

        # end of file: the last gathered feature is still pending. It is
        # stored under its own feature type (my_type), like every other one.
        if locus is not None:
            flush_pending()

        for p in pseudogenes:
            # if this is true gene did not have another entry
            if p.locus not in self.locus_dic:
                self.locus_dic[p.locus] = p
                self.type_dic.setdefault(p.type, []).append(p)
                self.genome_tree.addi(p.start, p.end, p)
                if p.name is not None:
                    self.gene_dic[p.name] = p

    def _read_genbank2(self, genbank_file):
        """
        alternative genbank reader based on Biopython's SeqIO

        Fills the same containers as the line based reader: every feature
        except "source" becomes a GenomeEntry; "gene" features are stored
        last and only when no other feature uses the same locus.

        :param genbank_file: a path to a genbank file holding one record
        """
        gene_tmp = []
        nop = [None]
        with open(genbank_file, "r") as gbk:
            anno = SeqIO.read(gbk, "genbank")
        self.genome_id = anno.id
        self.length = len(anno)

        for rec in anno.features:
            if rec.type == "source":
                continue

            # The strand is +1, -1, 0 or None. Testing its truthiness would
            # label reverse-strand features (-1) as "+", so compare
            # explicitly; anything but -1 counts as forward.
            strand = "-" if rec.location.strand == -1 else "+"
            # NOTE(review): Biopython locations presumably already use a
            # 0-based start; the extra -1 on start may therefore be off by
            # one compared with the line based reader -- confirm.
            entry = GenomeEntry(
                rec.type,
                rec.qualifiers.get("gene", nop)[0],
                rec.qualifiers.get("locus_tag", nop)[0],
                rec.qualifiers.get("product", nop)[0],
                rec.qualifiers.get("protein_id", nop)[0],
                strand,
                int(rec.location.start) - 1,
                int(rec.location.end) - 1)
            if entry.type == "gene":
                gene_tmp.append(entry)
                continue

            start = entry.start
            end = entry.end
            if start > end:
                # feature wraps around the origin: store it as two pieces
                self.genome_tree.addi(start, self.length, entry)
                self.genome_tree.addi(0, end, entry)
            else:
                self.genome_tree.addi(start, end, entry)

            self.locus_dic[entry.locus] = entry
            self.type_dic.setdefault(entry.type, []).append(entry)
            if entry.name is not None:
                self.gene_dic[entry.name] = entry

        for p in gene_tmp:
            # if this is true gene did not have another entry
            if p.locus not in self.locus_dic:
                self.locus_dic[p.locus] = p
                self.type_dic.setdefault(p.type, []).append(p)
                self.genome_tree.addi(p.start, p.end, p)
                if p.name is not None:
                    self.gene_dic[p.name] = p

    def __str__(self):
        return pd.DataFrame.from_records(list(self.locus_dic.values()),
                                         columns=self.COLUMNS).to_string()

    def annotate_positions(self, idx, aggregate=False):
        """
        annotates a list of positions with their associated genomic entries
        and returns a pandas dataframe with the columns:

        pos, type, locus, name, product, protein_id, strand, closest,
        distance, start, end, protein_pos, codon_pos

        A position covered by several entries yields one row per entry. A
        position covered by none yields a single row of type "?" whose
        ``closest``/``distance`` describe the nearest CDS/gene boundary.
        Duplicate positions are collapsed; the row order is not defined.

        :param idx: a single position or an iterable of positions
        :param aggregate: if True, the rows of one position are merged into
                          a single row whose cells are the ';'-joined string
                          values of the merged rows
        :return: pandas dataframe
        :raises ValueError: if a covering entry has a strand other than
                            '+' or '-'
        """
        import numbers  # local import: only needed for the scalar test

        # a single position may be a plain int or a numpy integer scalar
        if isinstance(idx, numbers.Integral):
            idx = [idx]
        else:
            idx = list(set(idx))

        unknown = GenomeEntry("?", None, None, None, None, None, None, None)
        entries = []
        closest = []
        distance = []
        index = []

        protein_position = []
        codon_position = []

        for i in idx:
            # point query; tree[point] exists in intervaltree 2.x and 3.x
            data = self.genome_tree[i]
            if data:
                # possible overlap of gene entries?
                for p in data:
                    entry = p.data
                    # distance from the first coding base of the entry,
                    # measured along its strand
                    if entry.strand == '+':
                        offset = i - entry.start
                    elif entry.strand == '-':
                        offset = entry.end - i
                    else:
                        raise ValueError(
                            "strand annotation is invalid for gene, {}".format(
                                entry.locus))
                    index.append(i)
                    entries.append(entry)
                    closest.append(None)
                    distance.append(None)
                    # position within protein and codon position (1-indexed);
                    # int() rounds towards zero
                    protein_position.append(int(offset / 3) + 1)
                    codon_position.append((offset % 3) + 1)
            else:
                # position is not annotated in GenomeAnnotation
                # find closest annotated CDS
                index.append(i)
                entries.append(unknown)
                i_clos = self.find_closest_gene(i)
                closest.append(i_clos.locus)
                distance.append(min(abs(i - i_clos.start),
                                    abs(i - i_clos.end)))
                protein_position.append(None)
                codon_position.append(None)

        anno_df = pd.DataFrame.from_records(entries, columns=self.COLUMNS)

        anno_df["pos"] = index
        anno_df["closest"] = closest
        anno_df["distance"] = distance

        anno_df["protein_pos"] = protein_position
        anno_df["codon_pos"] = codon_position

        if aggregate:
            anno_df = anno_df.groupby("pos").agg(
                lambda col: ';'.join(map(str, col)))
            anno_df.reset_index(inplace=True)
        return anno_df[[
            "pos", "type", "locus", "name", "product", "protein_id", "strand",
            "closest", "distance", "start", "end", "protein_pos", "codon_pos"
        ]]

    def find_closest_gene(self, pos):
        """
        Returns closest value to pos.
        If two numbers are equally close, return the smallest number.

        :param pos: the genome position
        :return: GenomeEntry
        """
        idx = bisect_left(self.__index_list, pos)
        if idx == 0:
            return self.__cds_list[0]
        if idx == len(self.__index_list):
            return self.__cds_list[-1]
        before = self.__index_list[idx - 1]
        after = self.__index_list[idx]
        if after - pos < pos - before:
            return self.__cds_list[idx]
        else:
            return self.__cds_list[idx - 1]

    def annotate_genes(self, genes):
        """
        annotates a list of gene and returns a pandas dataframe
        with the following columns:

        type name locus product strand start end

        :param genes: list of genes names
        :return: pandas dataframe
        """
        if isinstance(genes, str):
            genes = [genes]

        entries = [self.gene_dic[g] for g in genes if g in self.gene_dic]
        return pd.DataFrame.from_records(entries, columns=self.COLUMNS)

    def annotate_loci(self, loci):
        """
        annotates a list of loci tags and returns a pandas dataframe
        with the following columns:

        type name locus product strand start end

        :param loci: list of locus names
        :return: pandas dataframe
        """
        if isinstance(loci, str):
            loci = [loci]

        entries = [self.locus_dic[g] for g in loci if g in self.locus_dic]
        return pd.DataFrame.from_records(entries, columns=self.COLUMNS)

    def annotate_type(self, types):
        """
        annotates a list of types  and returns a pandas dataframe
        with the following columns:

        type name locus product strand start end

        :param types: list of types
        :return: pandas dataframe
        """
        if isinstance(types, str):
            types = [types]

        entries = []
        for g in types:
            if g in self.type_dic:
                for e in self.type_dic[g]:
                    entries.append(e)
        return pd.DataFrame.from_records(entries, columns=self.COLUMNS)

    def annotate_dataframe(self,
                           df,
                           column,
                           suffix=("_x", "_y"),
                           aggregate=False):
        """
        annotate an existing dataframe

        The distinct values of ``column`` are annotated with
        annotate_positions() and the result is inner-joined onto ``df``; the
        helper column "pos" is removed from the merged frame.

        :param df: data frame to which annotation is added
        :param column: specifies the genome position column
        :param suffix: tuple of suffix that is added overlapping column names (default: (_x, _y))
        :param aggregate: determines whether duplicated entry are aggregated as a semicolon separated string
        :return: pandas dataframe
        """
        positions = set(df[column])
        annotation = self.annotate_positions(positions, aggregate=aggregate)

        merged = df.merge(annotation,
                          left_on=column,
                          right_on="pos",
                          how="inner",
                          suffixes=suffix)
        merged.drop("pos", axis=1, inplace=True)
        return merged
Exemple #17
0
                if bbint_last == bbint_curr: return False
            print("\nPCs: "),
            for pc in pcs:
                print ("%x " % pc),
            print "\nBB:"

            for instr in md.disasm(bbs[bbint_last], bbint_last.begin):
                if instr.address in pcs:
                    print("T "),
                else:
                    print("  "),
                print "0x%x:\t%s\t%s" %(instr.address, instr.mnemonic, instr.op_str)            
            return True
        while i<ls:
            pc = seq[l][i]
            bbint_curr = ct.search(pc)
            bbint_best = None
            for bbint in bbint_curr:
                if bbint_best is None:
                    bbint_best = bbint
                if bbint.contains_interval(bbint_best):
                    bbint_best = bbint
            if (spit(bbint_last, bbint_best, pcs)):
                pcs = []
            pcs.append(pc)
            i=i+1
            if i == ls:
                spit(bbint_best, None, pcs)
            bbint_last = bbint_best

    else:
Exemple #18
0
class virtual:
    def __init__(self):
        #mapea id drawables con su respectivo drawable
        self.idToDrawable = {}

        self.idToInterval= {}
        self.tags = {}

        #contine pares (intervaloX,idDrawable) que representan helperBoxs de elementos en espacio virtual
        self.intervalTreeX = IntervalTree()

        self.vista = None
        self.currentLocalId = 0


        self.stringTofunction = {}
        self.drawableInMemory=None

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler('virtualScreen.log')
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)


    def setCommandString(self,command,function):
        self.logger.info('Adding new command %s for file recovery ',command)
        self.stringTofunction[command] = function


    def setView(self,vista):
        self.logger.info('Setting new view ')

        self.vista = vista
        self.setCommandString('setTag',lambda args : self.setTagLast(*args) )
        self.setCommandString('SETID',lambda args : self.placeDrawable(self.drawableInMemory,args[0]) )
        self.setCommandString('setViewWidthHeight',lambda args : self.vista.vistaSetWidthHeight(*args) )
        self.setCommandString('placeView',lambda args : self.vista.placeView(*args) )
        self.setCommandString('setViewScaleXY',lambda args : self.vista.setFactorXY(*args) )


        self.setCommandString('createRectangle',lambda args : self.setLastDrawableInMemory(self.createRectangle(*args,createId=False)) )
        self.setCommandString('createLine',lambda args : self.setLastDrawableInMemory(self.createLine(*args,createId=False)) )
        self.setCommandString('createGroup',lambda args : self.setLastDrawableInMemory(self.createGroup(*args,createId=False)) )
        self.setCommandString('createText', lambda args :self.setLastDrawableInMemory(self.createText(*args,createId=False)) )
        self.setCommandString('createPointDraw', lambda args : self.setLastDrawableInMemory(self.createPointDraw(*args,createId=False)) )

    def isVisible(self,drawable,intervalosView):
        viewIntervalX = intervalosView[0]
        viewIntervalY = intervalosView[1]

        intervaloQueryX= tuple([point[0] for point in drawable.calcHelperBox()])
        intervaloQueryY= tuple([point[1] for point in drawable.calcHelperBox()])

        return self.envision(intervaloQueryX,viewIntervalX) and self.envision(intervaloQueryY,viewIntervalY)

    def envision(self,queryInter,visInterval):
        #tres casos dentro de vision 0---1---1----0  o el caso 1-----0-------0-----1 o el caso 1------0------1
        #sean los 1 el cuadro de vision
        objetoContieneVista = lambda queryInter,visInterval : min(queryInter) <= min(visInterval) and max(visInterval) <= max(queryInter)
        vistaContieneObjeto =  lambda queryInter,visInterval  : (min(visInterval)  <= queryInter[0] <= max(visInterval)) or (min(visInterval)  <= queryInter[1] <= max(visInterval))

        return objetoContieneVista(queryInter,visInterval) or vistaContieneObjeto(queryInter,visInterval)


    def winfo_height(self):
        """Return the height of the current view (its ``heigth`` attribute)."""
        return self.vista.heigth
    def winfo_width(self):
        """Return the width of the current view (its ``width`` attribute)."""
        return self.vista.width

    def setLastDrawableInMemory(self,drawable):
        """Remember ``drawable`` as the pending drawable held in memory.

        The 'SETID' command registered in ``setView`` later places this object.
        """
        self.drawableInMemory=drawable

    #consigue todos los elementos en cuadrado
    def getSquare(self,p0,pf,tags=None):

        temp = []

        #consigue lista con intervalos en X dentro del cuadrado (o que pasen por este)
        #Debe ser siempre begin < end
        listaIntervalos = self.intervalTreeX.search(min(p0[0],pf[0]),max(p0[0],pf[0]))

        #esto te entrega lista tuplas ((x2,x2),idDrawable)
        for tupla in listaIntervalos:
            drawable= self.idToDrawable[tupla[2]]
            #Ahora descarta los que no sean consistentes respecto al intervalo Y
            intervaloY = tuple([point[1] for point in drawable.calcHelperBox()])
            if self.envision(intervaloY,(p0[1],pf[1])):
                temp.append(drawable)
        # print 'Elem without Filter ',str(temp)
        if not tags is None:
            return [elem for elem in temp if not self.getTagdrawable(elem) in tags]

        return temp



    """
    ---------------Funciones de creacion ------------------------------
    """
    def createLine(self,p0,pf,createId=True):
        """Build a Line from p0 to pf; place it in the virtual space when ``createId`` is true."""
        self.logger.info('Creating line in %s %s',p0,pf)
        newLine = Line(self,self.vista,p0,pf)
        if not createId:
            return newLine
        self.placeDrawable(newLine)
        return newLine

    def createRectangle(self,p0,pf,createId=True):
        """Build a Rectangle from p0 to pf; place it in the virtual space when ``createId`` is true."""
        self.logger.info('Creating rectangle in %s %s',p0,pf)
        newRect = Rectangle(self,self.vista,p0,pf)
        if not createId:
            return newRect
        self.placeDrawable(newRect)
        return newRect

    def createGroup(self,listaId=None,createId=True):
        """Build a Group, filling it with the drawables whose ids are in ``listaId``.

        The group is placed in the virtual space when ``createId`` is true.
        """
        self.logger.info('Creating Group from list %s',listaId)
        newGroup = Group(self,self.vista)

        memberIds = () if listaId is None else listaId
        for memberId in memberIds:
            newGroup.add(self.idToDrawable[memberId])

        if not createId:
            return newGroup
        self.placeDrawable(newGroup)
        return newGroup

    def createText(self,p0,texto,createId=True):
        """Build a TextDrawable showing ``texto`` at p0; place it when ``createId`` is true."""
        self.logger.info('Creating Text %s in %s',texto,p0)
        textDrawable = TextDrawable(self,self.vista,p0,texto)
        if not createId:
            return textDrawable
        self.placeDrawable(textDrawable)
        return textDrawable

    def createPointDraw(self,idGroup=None,createId=True):
        """Build a pointDraw, optionally fed from the group stored under ``idGroup``.

        The result is placed in the virtual space when ``createId`` is true.
        """
        self.logger.info('Creating poinDraw from group %s',idGroup)
        newPointDraw = pointDraw(self,self.vista)

        if idGroup is not None:
            newPointDraw.addFromGroup(self.idToDrawable[idGroup])

        if not createId:
            return newPointDraw
        self.placeDrawable(newPointDraw)
        return newPointDraw

    def placeDrawable(self,drawable,id=None):
        """Assign an id to ``drawable``, draw it and index it in the virtual space.

        A fresh id is generated when ``id`` is None. The X extent of the
        helper box is stored in the interval tree and in ``idToInterval``.
        """
        self.logger.info('Placing drawable %s',str(drawable))
        drawable.uniqueId = self.__getNewId() if id is None else id
        drawable.draw()

        # Make sure the helper box is built correctly: its first two points give the X extent.
        box = drawable.calcHelperBox()
        xBegin = box[0][0]
        xEnd = box[1][0]
        key = drawable.uniqueId

        self.intervalTreeX.addi(xBegin, xEnd, key)
        stored = Interval(xBegin, xEnd, key)
        self.idToInterval[key] = stored

        assert(stored == drawable.calcInterval())
        self.idToDrawable[key] = drawable


    def updatePosition(self,drawable):
        if self.idToDrawable.has_key(drawable.uniqueId):
            self.logger.info('Updating %s drawable %s ',drawable.uniqueId,str(drawable))
            try:
                self.intervalTreeX.remove(self.idToInterval[drawable.uniqueId])
            except Exception,e:
                print 'Error en borrar intervalo'
                self.logger.error('Cant remove interval %s exception %s',self.idToInterval[drawable.uniqueId],str(e))

            self.idToInterval.pop(drawable.uniqueId)

            helperBoxCords = drawable.calcHelperBox()
            self.intervalTreeX.addi(helperBoxCords[0][0],helperBoxCords[1][0],drawable.uniqueId)
            self.idToInterval[drawable.uniqueId] = Interval(helperBoxCords[0][0],helperBoxCords[1][0],drawable.uniqueId)
            assert(self.idToInterval[drawable.uniqueId] == drawable.calcInterval())

            self.logger.debug('New drawable interval %s %s %s ',helperBoxCords[0][0],helperBoxCords[1][0],drawable.uniqueId)

        else:
Exemple #19
0
    def filterOverlaps(self, overlapPercCutoff=70):
        """Filter out amplicons that substantially overlap.

        Among amplicons that substantially overlap, the one with the highest
        PPC will be kept. The MFEprimerRes attribute must be set; it is
        replaced in place by the table filtered of overlaps.

        Parameters
        ----------
        overlapPercCutoff : float
            percent of overlap to consider 'substantially' overlapping

        Raises
        ------
        AttributeError
            if MFEprimerRes has not been set
        TypeError
            if a row has BindingStart greater than BindingStop
        """
        if self.MFEprimerRes is None:
            msg = 'genome object does not have MFEprimerRes attribute.' + \
                  ' Run MFEprimer() first'
            # call form is valid on both Python 2 and Python 3
            raise AttributeError(msg)

        # making interval tree
        tree = IntervalTree()

        # loading intervals; the payload is [row label, PPC, Size]
        for count, row in self.MFEprimerRes.iterrows():
            # sanity check for + strand
            if row['BindingStart'] > row['BindingStop']:
                raise TypeError('MFEprimer binding start-stop is not + strand')
            tree.addi(row['BindingStart'], row['BindingStop'],
                      [count, row['PPC'], row['Size']])

        # finding all that substantially overlap; keeping one with > PPC
        # (tree stays untouched for querying, removals happen on the copy)
        tree2 = tree.copy()
        for iv1 in tree.iter():
            # skipping if already removed from tree2
            if iv1 not in tree2:
                continue

            overlaps = tree.search(iv1.begin, iv1.end)

            # skipping those that poorly overlap
            lowOverlap = set()
            for iv2 in overlaps:
                if iv1.overlaps(iv2):
                    percOverlaps = self._calcPercOverlap(iv1, iv2)
                    if percOverlaps[0] < overlapPercCutoff:
                        lowOverlap.add(iv2)
            overlaps = overlaps - lowOverlap  # just the substantially overlapping

            # skipping those that have been already removed
            prevRm = set(x for x in overlaps if x not in tree2)
            overlaps = overlaps - prevRm

            # removing all substantially overlapping intervals with lower PPC
            if len(overlaps) > 1:
                overlaps = sorted(overlaps,
                                  key=lambda x: x.data[1],
                                  reverse=True)
                for o in overlaps[1:]:
                    if o in tree2:
                        tree2.remove(o)

        # selecting rows: iterrows() yields index *labels*, so select by
        # label; positional iloc only agrees with them on a 0..n-1 index
        iv_idx = [x.data[0] for x in tree2.iter()]
        self.MFEprimerRes = self.MFEprimerRes.loc[iv_idx]
Exemple #20
0
class BloodSugarSimulator:
    """Simulate one day of blood sugar and glycation from an activity log.

    The activity log holds whitespace-separated lines of the form
    ``<MM-DD-YYYY> <HH:MM> <F|E> <id>`` where ``F`` marks food and ``E``
    exercise, and ``id`` is looked up in the food / exercise CSV databases.
    Each activity becomes an interval in an interval tree whose payload is
    the per-minute change it applies to the blood sugar count.
    """

    def __init__(self, input_file, fooddb_filename, exerdb_filename,
                 duration=1800):
        """Store the file names and reset the simulation state.

        ``duration`` is the sampling period, in seconds, of the points that
        are recorded for plotting (default 1800 s, i.e. 30 minutes).
        """
        self.input_file = input_file
        self.fooddb_filename = fooddb_filename
        self.exerdb_filename = exerdb_filename

        # Blood sugar starts at, and decays back towards, 80.
        self.blood_sugar_count = 80
        self.glycation_threshold = 150
        self.int_tree = None
        # Start / end of the simulated day as epoch seconds.
        self.ts1 = 0.0
        self.ts2 = 0.0
        # glycation
        self.glycation = 0
        # sampling interval for plotting the graphs
        self.duration = duration
        # Plot series: (x, y) is blood sugar, (x1, y1) is glycation.
        self.x = []
        self.y = []
        self.x1 = []
        self.y1 = []
        self.food_dict = dict()
        self.exer_dict = dict()
        self.food_db = None
        self.exer_db = None

    @staticmethod
    def get_date_time_ts(dt, tm):
        """Convert a 'MM-DD-YYYY' date and 'HH:MM' time to local epoch seconds."""
        d_fields = dt.split('-')
        tm_fields = tm.split(':')
        t = datetime.datetime(int(d_fields[2]), int(d_fields[0]),
                              int(d_fields[1]),
                              int(tm_fields[0]), int(tm_fields[1]))
        return float(time.mktime(t.timetuple()))

    @staticmethod
    def get_date_time_hhmm(ts):
        """Convert epoch seconds to a local ``datetime`` object."""
        return datetime.datetime.fromtimestamp(ts)

    @staticmethod
    def load_db_dict (filename):
        """Load a three-column CSV file into ``{col0: [col1, col2]}``.

        Prints a message and exits with status 1 when the file cannot be
        opened, is empty, or contains a row without exactly three fields.
        """
        dbdict = dict()
        try:
            with open(filename, 'r') as f:
                db = tuple(csv.reader(f))
            if not db:
                raise Exception("No data found in file %s" % filename)

            # build hash table
            for tup in db:
                if len(tup) != 3:
                    raise Exception("CSV file %s have missing fields." % filename)
                dbdict[tup[0]] = [tup[1], tup[2]]

        except Exception as e:
            if isinstance(e, IOError):
                print("Failed to open the input file %s" % filename)
            else:
                print("Exception occured %s" % str(e))
            sys.exit(1)

        return dbdict

    def load_food_exer_files(self):
        """Load the food and exercise databases into their dictionaries."""
        self.food_dict = BloodSugarSimulator.load_db_dict(self.fooddb_filename)
        # get exercise data
        self.exer_dict = BloodSugarSimulator.load_db_dict(self.exerdb_filename)

    def create_int_tree(self):
        """Build the interval tree from the activity log.

        Food raises blood sugar for 2 hours after it is eaten, exercise
        lowers it for 1 hour. All entries must belong to the same day; the
        simulated day runs from 9:00 to 19:00 of that date. The first point
        of both plot series is recorded here.

        Prints a message and exits with status 1 on any invalid input.
        """
        self.int_tree = IntervalTree()
        currentdate = None
        try:
            # ``with`` guarantees the log file is closed, also on errors
            with open(self.input_file, 'r') as fd:
                for line in fd:
                    fields = line.split()
                    # tolerate blank lines (e.g. a trailing newline)
                    if not fields:
                        continue

                    if currentdate:
                        # input can be only for same day
                        if currentdate != fields[0]:
                            print("Usage: Enter data for same day only")
                            sys.exit(1)
                    else:
                        # initialise current date
                        currentdate = fields[0]
                        # initialise ts1 to beginning of day 9 am
                        self.ts1 = BloodSugarSimulator.get_date_time_ts(fields[0], '9:00')
                        # initialise ts2 to end of day 7 pm
                        self.ts2 = BloodSugarSimulator.get_date_time_ts(fields[0], '19:00')

                    # check input is Food or Exercise
                    if fields[2] == 'F':
                        begin = BloodSugarSimulator.get_date_time_ts(fields[0], fields[1])
                        # food acts for 2 hours
                        end = begin + 7200
                        glycemicIndex = self.food_dict[fields[3]][1]
                        # index spread over 120 minutes -> change per minute
                        data = round((float(glycemicIndex) / 120.0), 2)
                        self.int_tree[begin:end] = data
                    elif fields[2] == 'E':
                        begin = BloodSugarSimulator.get_date_time_ts(fields[0], fields[1])
                        # exercise acts for 1 hour
                        end = begin + 3600
                        exerIndex = self.exer_dict[fields[3]][1]
                        # index spread over 60 minutes -> change per minute
                        data = round((float(exerIndex) / 60.0), 2)
                        self.int_tree[begin:end] = -data
                    else:
                        print("Usage: Input can only be of type F or E")
                        sys.exit(1)

            # first point for blood sugar graph
            self.y.append(self.blood_sugar_count)
            customdate = BloodSugarSimulator.get_date_time_hhmm(self.ts1)
            self.x.append(customdate)
            print(customdate)

            # first point for glycation graph
            self.y1.append(self.glycation)
            self.x1.append(customdate)

        except Exception as e:
            # exactly one message per failure
            if isinstance(e, IOError):
                print("Failed to open the input file %s" % self.input_file)
            elif isinstance(e, KeyError):
                print("Invalid ID  %s" % str(e))
            else:
                print("Exception occured %s" % str(e))
            sys.exit(1)

    def compute_values(self):
        """Step through the day minute by minute and record the plot series.

        With no active interval the count moves one unit per minute back
        towards the baseline; otherwise every active interval adds its
        per-minute change. Glycation grows by one for each minute the count
        is above the threshold. A point is recorded every ``duration``
        seconds. Prints a message and exits with status 1 on any error.
        """
        # seconds elapsed since the start of the day
        secs = 0
        curr_ts = self.ts1
        blood_sugar_count = self.blood_sugar_count
        glycation = self.glycation
        try:
            # while not end of day
            while curr_ts < self.ts2:
                # look up intervals for current timestamp
                ivs = self.int_tree.search(curr_ts)
                if not ivs:
                    if (blood_sugar_count - 1) > self.blood_sugar_count:
                        blood_sugar_count -= 1
                    elif (blood_sugar_count + 1) < self.blood_sugar_count:
                        blood_sugar_count += 1
                    else:
                        blood_sugar_count = self.blood_sugar_count
                else:
                    for iv in ivs:
                        blood_sugar_count += iv.data
                # compute glycation
                if blood_sugar_count > self.glycation_threshold:
                    glycation = glycation + 1

                # record a point every ``duration`` seconds for the graphs
                if secs % self.duration == 0:
                    customdate = BloodSugarSimulator.get_date_time_hhmm(curr_ts)
                    # blood sugar
                    self.y.append(blood_sugar_count)
                    self.x.append(customdate)
                    # glycation
                    self.y1.append(glycation)
                    self.x1.append(customdate)

                # compute values every minute
                curr_ts += 60
                secs += 60
        except Exception as e:
            print("Exception occured %s" % str(e))
            sys.exit(1)

    def plot_graph_blood_sugar(self):
        """Plot the recorded blood sugar series in figure 1."""
        fig = plt.figure(1)
        formatter = DateFormatter('%H:%M')
        plt.plot(self.x, self.y)
        plt.gcf().axes[0].xaxis.set_major_locator(MinuteLocator(interval=30))
        plt.gcf().axes[0].xaxis.set_major_formatter(formatter)
        plt.xlabel('Time')
        plt.ylabel('Blood Sugar')
        plt.title('Blood Sugar graph for 1 day')
        plt.xticks(rotation='vertical')
        fig.show()

    def plot_graph_glycation(self):
        """Plot the recorded glycation series in figure 2."""
        fig = plt.figure(2)
        formatter = DateFormatter('%H:%M')
        plt.plot(self.x1, self.y1)
        plt.gcf().axes[0].xaxis.set_major_locator(MinuteLocator(interval=30))
        plt.gcf().axes[0].xaxis.set_major_formatter(formatter)
        plt.xlabel('Time')
        plt.ylabel('Glycation Index')
        plt.title('Glycation Index graph for 1 day')
        plt.xticks(rotation='vertical')
        fig.show()
# Index every alignment block by the interval of its first record, labelled
# with the id of its second record.
tree = IntervalTree()
seq_dict = {}
for multiple_alignment in AlignIO.parse(maf_file, "maf"):
    multiple_alignment = list(multiple_alignment)
    id = multiple_alignment[1].id
    start = multiple_alignment[0].annotations["start"]
    end = start + multiple_alignment[0].annotations["size"]
    tree[start:end] = id
    seq_dict[id] = (start, end)

# Collect the selected records and report every pair of sequences whose
# intervals share at least `upper_overlap` positions.
fasta_output = []
with open(new_overlap_txt, "w") as fout:
    overlap_dict = {}
    for seq_id in seq_list:
        fasta_output.append(record_dict[seq_id])
        seq_start, seq_end = seq_dict[seq_id]
        for overlap_rec in tree.search(seq_start, seq_end):
            if overlap_rec.data == seq_id:
                continue
            target_id = overlap_rec.data
            # Length of the intersection of two half-open integer ranges;
            # same value as counting the shared positions, without
            # materialising both position sets.
            overlap_len = max(
                0,
                min(seq_end, overlap_rec.end) - max(seq_start, overlap_rec.begin))
            if overlap_len >= upper_overlap:
                overlap_dict[(seq_id, target_id)] = True
                print("{}\t{}\t{}".format(seq_id, target_id, overlap_len),
                      file=fout)

SeqIO.write(fasta_output, save_fasta, "fasta")
            begin = get_date_time_ts(fields[0], fields[1])
            # end time for food is 2 hours.
            end = begin + 7200
            # check formula
            data = round((float(fields[3]) / 120.0), 2)
            int_tree[begin:end] = data
        elif fields[2] == 'E':
            begin = get_date_time_ts(fields[0], fields[1])
            # end time for exercise is 1 hour.
            end = begin + 3600
            data = round((float(fields[3]) / 60.0), 2)
            int_tree[begin:end] = -data
    #print int_tree
    print "Blood Sugar Graph:"
    while ts1 < ts2:
        ivs = int_tree.search(ts1)
        if not ivs:
            if (blood_sugar_count - 1) > 80:
                blood_sugar_count -= 1
            elif (blood_sugar_count + 1) < 80:
                blood_sugar_count += 1
            else:
                blood_sugar_count = 80
        else:
            for iv in ivs:
                blood_sugar_count += iv.data

        if blood_sugar_count > 150:
            glycation += 1

        if (count % 1800 == 0):
Exemple #23
0
                deletiontree.removei(curr_deletion['start'],
                                     curr_deletion['end'], curr_deletion)
                curr_deletion['end'] = end
                deletiontree[curr_deletion['start']:
                             curr_deletion['end']] = curr_deletion
            else:
                curr_deletion['start'] = start
                curr_deletion['end'] = end
                curr_deletion['part'] = block
                deletiontree[curr_deletion['start']:
                             curr_deletion['end']] = curr_deletion

with open(flank5k, 'w') as flanking_outfile:
    with open(exact, 'w') as exact_outfile:
        for Iobj in sorted(insertiontree):
            Dset = sorted(deletiontree.search(Iobj.begin, Iobj.end))
            closeset = sorted(
                deletiontree.search(Iobj.begin - flanking,
                                    Iobj.end + flanking))
            for closeobj in closeset:
                if closeobj in Dset:
                    maf_iterate.print_block(Iobj.data['part1'],
                                            flanking_outfile)
                    maf_iterate.print_block(Iobj.data['part2'],
                                            flanking_outfile)
                    maf_iterate.print_block(closeobj.data['part'],
                                            flanking_outfile)
                else:
                    maf_iterate.print_block(Iobj.data['part1'], exact_outfile)
                    maf_iterate.print_block(Iobj.data['part2'], exact_outfile)
                    maf_iterate.print_block(closeobj.data['part'],
Exemple #24
0
    def _remove_overlaps(self, position_idy: IntervalTree, percents: dict):
        """Consume ``position_idy`` and accumulate its non-overlapping pieces.

        Intervals are popped one at a time and compared with the intervals
        still in the tree that overlap them. The ``data`` of each interval
        is compared with ``<`` / ``==``; where two intervals share a region,
        the one with the larger ``data`` keeps it and the other is trimmed,
        split or dropped. Equal ``data`` values are merged into one interval.

        ``has_overlap`` is set when the popped item is still covered by, or
        has been re-inserted into, the tree; such an item is not counted in
        this pass. Otherwise the item is passed to ``self._add_percents``.

        :param position_idy: interval tree to process; it is emptied
        :param percents: accumulator handed to ``self._add_percents``
        :return: the accumulator returned by the last ``_add_percents`` call
        """
        while len(position_idy) > 0:
            item = position_idy.pop()
            start = item.begin
            end = item.end
            cat = item.data
            overlaps = position_idy.search(start, end)
            if len(overlaps) > 0:
                has_overlap = False
                for overlap in overlaps:
                    # Stop at the first overlap that re-queues or covers the item.
                    if has_overlap:
                        break
                    o_start = overlap.begin
                    o_end = overlap.end
                    o_cat = overlap.data
                    # Skip overlaps already discarded earlier in this loop.
                    if not position_idy.containsi(o_start, o_end, o_cat):
                        continue
                    if start < o_start:
                        if end <= o_end:
                            # cccccccccccccc*******
                            # *****ooooooooo[ooooooo]
                            if o_cat < cat:
                                if end < o_end:
                                    # No overlap with the current item, we stay has_overlap as False
                                    position_idy.discard(overlap)
                                    position_idy[end:o_end] = o_cat
                                else:
                                    position_idy.discard(
                                        overlap)  # No kept overlap
                            elif o_cat == cat:
                                if end < o_end:
                                    has_overlap = True
                                    position_idy.discard(overlap)
                                    position_idy[start:o_end] = cat
                                else:
                                    position_idy.discard(
                                        overlap)  # No kept overlap
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                        else:  # end > o_end
                            # ccccccccccccccccccc
                            # *****oooooooooo****
                            if o_cat <= cat:
                                position_idy.discard(
                                    overlap)  # No kept overlap
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    elif start == o_start:
                        if end < o_end:
                            # cccccccccccc*******
                            # ooooooooooooooooooo
                            if o_cat < cat:
                                # No overlap with the current item, we stay has_overlap as False
                                position_idy.discard(overlap)
                                position_idy[end:o_end] = o_cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_end] = cat
                            else:  # o_cat > cat
                                # The overlap just contains current item
                                has_overlap = True
                        elif end == o_end:
                            # ***cccccccccccccccc***
                            # ***oooooooooooooooo***
                            if o_cat <= cat:
                                position_idy.discard(
                                    overlap)  # No kept overlap
                            else:
                                # The overlap just contains current item
                                has_overlap = True
                        else:  # end > o_end
                            # ccccccccccccccccccccccccccccc
                            # oooooooooooooooooooo*********
                            if o_cat <= cat:
                                # current item just contains the overlap
                                position_idy.discard(overlap)
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    else:  # start > o_start
                        if end <= o_end:
                            # ******ccccccccc*******
                            # ooooooooooooooo[ooooooo]
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                                if end < o_end:
                                    position_idy[end:o_end] = o_cat
                            else:  # o_cat >= cat
                                # Overlap just contains the item
                                has_overlap = True
                        else:  # end > o_end
                            # ******ccccccccccccccccccccc
                            # ooooooooooooooooo**********
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:end] = cat
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy[o_end:end] = cat
                # The item survived every comparison untouched: count it now.
                if not has_overlap:
                    percents = self._add_percents(percents, item)

            else:
                # Nothing overlaps the item: count it directly.
                percents = self._add_percents(percents, item)

        return percents
Exemple #25
0
class MemoryCache(object):
    ##
    # @brief Caches target memory contents in front of a parent debug context.
    #
    # Data read from or written to cacheable regions is kept in an IntervalTree keyed by
    # address range; each interval carries a bytearray in its @a data attribute. The whole
    # cache is discarded whenever the core is running or its run token differs from the one
    # seen on the previous access (see _check_cache()).
    def __init__(self, context):
        self._context = context
        self._run_token = -1
        self._log = logging.getLogger('memcache')
        self._reset_cache()

    ##
    # @brief Discards all cached intervals and starts a fresh metrics object.
    def _reset_cache(self):
        self._cache = IntervalTree()
        self._metrics = CacheMetrics()

    ##
    # @brief Invalidates the cache if appropriate.
    def _check_cache(self):
        if self._context.core.is_running():
            self._log.debug("core is running; invalidating cache")
            self._reset_cache()
        elif self._run_token != self._context.core.run_token:
            self._dump_metrics()
            self._log.debug("out of date run token; invalidating cache")
            self._reset_cache()
            self._run_token = self._context.core.run_token

    ##
    # @brief Splits a memory address range into cached and uncached subranges.
    # @return Returns a 2-tuple with the first element being a set of Interval objects for each
    #   of the cached subranges. The second element is a set of Interval objects for each of the
    #   non-cached subranges.
    def _get_ranges(self, addr, count):
        cached = self._cache.search(addr, addr + count)
        uncached = {Interval(addr, addr + count)}
        # Carve every cached interval out of the set of still-uncached ranges.
        for cachedIv in cached:
            newUncachedSet = set()
            for uncachedIv in uncached:

                # No overlap.
                if cachedIv.end < uncachedIv.begin or cachedIv.begin > uncachedIv.end:
                    newUncachedSet.add(uncachedIv)
                    continue

                # Begin segment.
                if cachedIv.begin - uncachedIv.begin > 0:
                    newUncachedSet.add(
                        Interval(uncachedIv.begin, cachedIv.begin))

                # End segment.
                if uncachedIv.end - cachedIv.end > 0:
                    newUncachedSet.add(Interval(cachedIv.end, uncachedIv.end))
            uncached = newUncachedSet
        return cached, uncached

    ##
    # @brief Reads uncached memory ranges and updates the cache.
    # @return A list of Interval objects is returned. Each Interval has its @a data attribute set
    #   to a bytearray of the data read from target memory.
    def _read_uncached(self, uncached):
        uncachedData = []
        for uncachedIv in uncached:
            data = self._context.read_memory_block8(
                uncachedIv.begin, uncachedIv.end - uncachedIv.begin)
            iv = Interval(uncachedIv.begin, uncachedIv.end, bytearray(data))
            self._cache.add(iv)  # TODO merge contiguous cached intervals
            uncachedData.append(iv)
        return uncachedData

    ##
    # @brief Accounts one read of [addr, addr + size) in the hit/miss metrics.
    #
    # Cached intervals are clipped to the requested range so that only the bytes actually
    # requested are counted as hits.
    def _update_metrics(self, cached, uncached, addr, size):
        cachedSize = 0
        for iv in cached:
            begin = iv.begin
            end = iv.end
            if iv.begin < addr:
                begin = addr
            if iv.end > addr + size:
                end = addr + size
            cachedSize += end - begin

        uncachedSize = sum((iv.end - iv.begin) for iv in uncached)

        self._metrics.reads += 1
        self._metrics.hits += cachedSize
        self._metrics.misses += uncachedSize

    ##
    # @brief Logs the accumulated metrics at debug level.
    def _dump_metrics(self):
        if self._metrics.total > 0:
            self._log.debug(
                "%d reads, %d bytes [%d%% hits, %d bytes]; %d bytes written",
                self._metrics.reads, self._metrics.total,
                self._metrics.percent_hit, self._metrics.hits,
                self._metrics.writes)
        else:
            self._log.debug("no reads")

    ##
    # @brief Performs a cached read operation of an address range.
    # @return A list of Interval objects sorted by address.
    def _read(self, addr, size):
        # Get the cached and uncached subranges of the requested read.
        cached, uncached = self._get_ranges(addr, size)
        self._update_metrics(cached, uncached, addr, size)

        # Read any uncached ranges.
        uncachedData = self._read_uncached(uncached)

        # Merged cached with data we just read
        combined = list(cached) + uncachedData
        combined.sort(key=lambda x: x.begin)
        return combined

    ##
    # @brief Extracts data from the intersection of an address range across a list of interval objects.
    #
    # The range represented by @a addr and @a size are assumed to overlap the intervals. Any
    # interval may have ragged edges not fully contained in the address range, in which case
    # only the intersecting slice of that interval is extracted. This includes the case of a
    # single interval that extends past both ends of the requested range.
    #
    # @param self
    # @param combined List of Interval objects forming a contiguous range, sorted by address.
    #   The @a data attribute of each interval must be a bytearray whose length equals the
    #   interval's length.
    # @param addr Start address. Must be within the range of the first interval.
    # @param size Number of bytes. (@a addr + @a size) must be within the range of the last interval.
    # @return A single bytearray object with all data from the intervals that intersects the address
    #   range.
    def _merge_data(self, combined, addr, size):
        result = bytearray()
        end = addr + size

        for iv in combined:
            # Clip the interval to the requested range and convert to offsets into iv.data.
            # For intervals fully inside the range this selects the entire bytearray.
            sliceBegin = max(addr, iv.begin) - iv.begin
            sliceEnd = min(end, iv.end) - iv.begin
            result += iv.data[sliceBegin:sliceEnd]

        return result

    ##
    # @brief Replaces the cached intervals overlapping a write with a single merged interval.
    #
    # Bytes of the first and last overlapping intervals that lie outside the written range are
    # preserved and joined with the new data.
    #
    # @param self
    # @param cached Non-empty list of cached Interval objects overlapping the write, sorted by
    #   address.
    # @param addr Start address of the write.
    # @param value bytearray of the data written.
    def _update_contiguous(self, cached, addr, value):
        size = len(value)
        end = addr + size
        leadBegin = addr
        leadData = bytearray()
        trailData = bytearray()
        trailEnd = end

        # Keep the part of the first interval that precedes the write.
        if cached[0].begin < addr and cached[0].end > addr:
            offset = addr - cached[0].begin
            leadData = cached[0].data[:offset]
            leadBegin = cached[0].begin
        # Keep the part of the last interval that follows the write.
        if cached[-1].begin < end and cached[-1].end > end:
            offset = end - cached[-1].begin
            trailData = cached[-1].data[offset:]
            trailEnd = cached[-1].end

        self._cache.remove_overlap(addr, end)

        data = leadData + value + trailData
        self._cache.addi(leadBegin, trailEnd, data)

    ##
    # @return A bool indicating whether the given address range is fully contained within
    #       one known memory region, and that region is cacheable.
    # @exception MemoryAccessError Raised if the access is not entirely contained within a single region.
    def _check_regions(self, addr, count):
        regions = self._context.core.memory_map.get_intersecting_regions(
            addr, length=count)

        # If no regions matched, then allow an uncached operation.
        if len(regions) == 0:
            return False

        # Raise if not fully contained within one region.
        if len(regions) > 1 or not regions[0].contains_range(addr,
                                                             length=count):
            raise MemoryAccessError(
                "individual memory accesses must not cross memory region boundaries"
            )

        # Otherwise return whether the region is cacheable.
        return regions[0].is_cacheable

    ##
    # @brief Reads one 8-, 16- or 32-bit little-endian value through the cache.
    # @param self
    # @param addr Address to read.
    # @param transfer_size Width of the value in bits: 8, 16 or 32.
    # @param now If true the value is returned; otherwise a callable returning the (already
    #   read) value is returned.
    # @exception ValueError Raised for any other @a transfer_size.
    def read_memory(self, addr, transfer_size=32, now=True):
        # TODO use more optimal underlying read_memory call
        if transfer_size == 8:
            data = self.read_memory_block8(addr, 1)[0]
        elif transfer_size == 16:
            data = conversion.byte_list_to_u16le_list(
                self.read_memory_block8(addr, 2))[0]
        elif transfer_size == 32:
            data = conversion.byte_list_to_u32le_list(
                self.read_memory_block8(addr, 4))[0]
        else:
            # Fail explicitly instead of hitting an unbound local below.
            raise ValueError("invalid transfer_size (%d)" % transfer_size)

        if now:
            return data
        else:

            def read_cb():
                return data

            return read_cb

    ##
    # @brief Reads @a size bytes starting at @a addr, using cached data where available.
    # @return A list of byte values. Empty if @a size is not positive.
    def read_memory_block8(self, addr, size):
        if size <= 0:
            return []

        self._check_cache()

        # Validate memory regions.
        if not self._check_regions(addr, size):
            self._log.debug("range [%x:%x] is not cacheable", addr,
                            addr + size)
            return self._context.read_memory_block8(addr, size)

        # Get the cached and uncached subranges of the requested read.
        combined = self._read(addr, size)

        # Extract data out of combined intervals.
        result = list(self._merge_data(combined, addr, size))
        return result

    ##
    # @brief Reads @a size 32-bit little-endian words starting at @a addr.
    def read_memory_block32(self, addr, size):
        return conversion.byte_list_to_u32le_list(
            self.read_memory_block8(addr, size * 4))

    ##
    # @brief Writes one 8-, 16- or 32-bit little-endian value.
    #
    # Other values of @a transfer_size are ignored and None is returned.
    def write_memory(self, addr, value, transfer_size=32):
        if transfer_size == 8:
            return self.write_memory_block8(addr, [value])
        elif transfer_size == 16:
            return self.write_memory_block8(
                addr, conversion.u16le_list_to_byte_list([value]))
        elif transfer_size == 32:
            return self.write_memory_block8(
                addr, conversion.u32le_list_to_byte_list([value]))

    ##
    # @brief Writes a list of bytes to the target and mirrors them into the cache.
    # @return Whatever the parent context's write_memory_block8() returns, or None when
    #   @a value is empty.
    def write_memory_block8(self, addr, value):
        if len(value) <= 0:
            return

        self._check_cache()

        # Validate memory regions.
        cacheable = self._check_regions(addr, len(value))

        # Write to the target first, so if it fails we don't update the cache.
        result = self._context.write_memory_block8(addr, value)

        if cacheable:
            size = len(value)
            end = addr + size
            cached = sorted(self._cache.search(addr, end),
                            key=lambda x: x.begin)
            self._metrics.writes += size

            if len(cached):
                # Write data is entirely within cached data.
                if addr >= cached[0].begin and end <= cached[0].end:
                    beginOffset = addr - cached[0].begin
                    # The end offset must be derived from the start. An offset relative to
                    # the interval end is 0 when the write finishes exactly at the interval
                    # end, which would turn the slice assignment into an insertion and grow
                    # the cached bytearray.
                    endOffset = beginOffset + size
                    cached[0].data[beginOffset:endOffset] = value

                else:
                    self._update_contiguous(cached, addr, bytearray(value))
            else:
                # No cached data in this range, so just add the entire interval.
                self._cache.addi(addr, end, bytearray(value))

        return result

    ##
    # @brief Writes a list of 32-bit little-endian words starting at @a addr.
    def write_memory_block32(self, addr, data):
        return self.write_memory_block8(
            addr, conversion.u32le_list_to_byte_list(data))

    ##
    # @brief Unconditionally discards all cached data.
    def invalidate(self):
        self._reset_cache()
Exemple #26
0
            #print(ichr)
            start = int(L[1]) + gstart[ichr]
            end = int(L[2]) + gstart[ichr]
            tr.addi(start, end, 100)
f.close()
t2 = time.time()
print("Tree build time: ", t2 - t1)

# Query phase: for every accepted line of qfile, count how many intervals of the tree
# overlap the (offset-shifted) query range and accumulate the counts in Total.
Total = 0
with open(qfile) as f:
    for line in f:
        L = line.strip().split()
        # Only names shorter than 6 characters whose 4th character is not 'M' are counted.
        if len(L[0]) >= 6 or L[0][3] == 'M':
            continue
        # 'X' and 'Y' map to fixed slots; anything else is parsed as a 1-based number.
        # NOTE(review): gstart looks like a per-chromosome offset table — confirm.
        tag = L[0][3]
        ichr = 22 if tag == 'X' else 23 if tag == 'Y' else int(L[0][3:]) - 1
        start = int(L[1]) + gstart[ichr]
        end = int(L[2]) + gstart[ichr]
        ols = tr.search(start, end)
        # Adding len(ols) unconditionally equals guarding on a non-empty result.
        Total += len(ols)
f.close()
print("Total: ", Total)
t3 = time.time()
print("Tree search time: ", t3 - t2)
Exemple #27
0
class FlashReaderContext(DebugContext):
    ##
    # @brief Debug context that serves reads of flash-resident ELF sections from the ELF data.
    #
    # Reads that fall entirely inside exactly one flash section of the ELF are answered from
    # the section's data; every other read, and all writes, are forwarded to the parent
    # context.
    def __init__(self, parentContext, elf):
        super(FlashReaderContext, self).__init__(parentContext.core)
        self._parent = parentContext
        self._elf = elf
        self._log = logging.getLogger('flashreadercontext')

        self._build_regions()

    ##
    # @brief Builds the interval tree mapping address ranges to flash ELF sections.
    def _build_regions(self):
        self._tree = IntervalTree()
        for sect in self._elf.sections:
            if not (sect.region and sect.region.is_flash):
                continue
            start = sect.start
            length = sect.length
            sect.data  # Go ahead and read the data from the file.
            self._tree.addi(start, start + length, sect)
            self._log.debug("created flash section [%x:%x] for section %s",
                            start, start + length, sect.name)

    ##
    # @brief Finds the single flash section that fully contains an address range.
    # @return The ELF section, or None if the range touches zero or several sections, or
    #   extends beyond the one section it touches. A partially overlapping range must not be
    #   served from section data: the offset into the data would be negative or the slice
    #   would come back short.
    def _find_section(self, addr, length):
        matches = self._tree.search(addr, addr + length)
        # Must match only one interval (ELF section).
        if len(matches) != 1:
            return None
        iv = next(iter(matches))
        if addr < iv.begin or addr + length > iv.end:
            return None
        return iv.data

    ##
    # @brief Reads one 8-, 16- or 32-bit little-endian value.
    # @param self
    # @param addr Address to read.
    # @param transfer_size Width of the value in bits.
    # @param now If true the value is returned; otherwise a callable producing it is returned.
    # @exception ValueError Raised (when the value is produced) if the read is served from
    #   ELF data and @a transfer_size is not 8, 16 or 32.
    def read_memory(self, addr, transfer_size=32, now=True):
        length = transfer_size // 8
        section = self._find_section(addr, length)
        if section is None:
            return self._parent.read_memory(addr, transfer_size, now)
        # From here on addr is an offset into the section's data.
        addr -= section.start

        def read_memory_cb():
            self._log.debug("read flash data [%x:%x] from section %s",
                            section.start + addr,
                            section.start + addr + length, section.name)
            data = section.data[addr:addr + length]
            if transfer_size == 8:
                return data[0]
            elif transfer_size == 16:
                return conversion.byte_list_to_u16le_list(data)[0]
            elif transfer_size == 32:
                return conversion.byte_list_to_u32le_list(data)[0]
            else:
                raise ValueError("invalid transfer_size (%d)" % transfer_size)

        if now:
            return read_memory_cb()
        else:
            return read_memory_cb

    ##
    # @brief Reads @a size bytes starting at @a addr.
    # @return A list of byte values.
    def read_memory_block8(self, addr, size):
        section = self._find_section(addr, size)
        if section is None:
            return self._parent.read_memory_block8(addr, size)
        # From here on addr is an offset into the section's data.
        addr -= section.start
        data = section.data[addr:addr + size]
        self._log.debug("read flash data [%x:%x]", section.start + addr,
                        section.start + addr + size)
        return list(data)

    ##
    # @brief Reads @a size 32-bit little-endian words starting at @a addr.
    def read_memory_block32(self, addr, size):
        # size counts words, so four times as many bytes have to be fetched.
        return conversion.byte_list_to_u32le_list(
            self.read_memory_block8(addr, size * 4))

    ##
    # @brief Forwards the write to the parent context.
    def write_memory(self, addr, value, transfer_size=32):
        return self._parent.write_memory(addr, value, transfer_size)

    ##
    # @brief Forwards the write to the parent context.
    def write_memory_block8(self, addr, value):
        return self._parent.write_memory_block8(addr, value)

    ##
    # @brief Forwards the write to the parent context.
    def write_memory_block32(self, addr, data):
        return self._parent.write_memory_block32(addr, data)
                # create packet ranges
                for db_packet in db_packets:
                    movie_body_sizes.add(db_packet[0])

            # aggregate
            for body_size in movie_body_sizes:
                start = (1 - epsilon) * body_size
                end = (1 + epsilon) * body_size

                if start == end:
                    if start not in body_sizes_dict:
                        body_sizes_dict[start] = 0
                    body_sizes_dict[start] += 1
                else:
                    res = body_sizes_tree.search(3, 5, strict=True)
                    elem = None
                    if len(res) == 0:
                        elem = Interval(start, end, IntContainer())
                        body_sizes_tree.add(elem)
                    else:
                        for entry in res:
                            elem = entry.data

                    elem.data.private += 1

        # prepare collision dict
        collisions = {}

        # sum up collisions in tree
        for interval in body_sizes_tree:
Exemple #29
0
class FeatureSet(object):
    """
    An ordered collection of :class:`SeqFeature` objects.

    Features are stored in an interval tree keyed by their start and end positions; iteration
    yields them in sorted interval order.

    :param type feature_class: type of the features stored in the collection; defaults to :class:`SeqFeature` and must
        inherit from it.
    """
    def __init__(self, feature_class=None):
        if feature_class is None:
            feature_class = SeqFeature
        elif not issubclass(feature_class, SeqFeature):
            raise RuntimeError(
                "FeatureSet expects a feature class that inherits from SeqFeature"
            )

        self._features = IntervalTree()
        self._feature_class = feature_class

    def __or__(self, other):
        # ``a | b`` is the union of both collections, as for built-in sets.
        return self.union(other)

    def __sub__(self, other):
        # ``a - b`` is the set difference, as for built-in sets.
        return self.difference(other)

    def __len__(self):
        return len(self._features)

    def __iter__(self):
        for f in sorted(self._features):
            yield f.data

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, list(self))

    def _wrap_feature(self, feature):
        """
        Wraps a feature in an :class:`Interval` spanning its start and end, with the feature as interval data.

        :raises ValueError: if ``feature`` is of an unsupported type
        """
        if isinstance(feature, SeqFeature):
            return Interval(feature.location.start, feature.location.end,
                            feature)
        elif isinstance(feature, (self._feature_class, Feature)):
            return Interval(feature.start, feature.end, feature)
        else:
            raise ValueError(
                "feature must be one of Bio.SeqFeature, co.Feature, %s" %
                self._feature_class)

    def copy(self):
        """
        :returns: a copy of this collection
        :rtype: :class:`FeatureSet`
        """
        fs = FeatureSet(feature_class=self._feature_class)
        fs._features = self._features.copy()
        return fs

    def add(self, *args, **kwargs):
        """
        Creates a feature object from the given ``args`` and ``kwargs`` and adds it to the collection.

        :rtype: :class:`SeqFeature`
        """
        feature = self._feature_class(*args, **kwargs)

        self._features.add(self._wrap_feature(feature))
        return feature

    def remove(self, feature):
        """
        Removes the given feature from the collection
        """
        self._features.remove(self._wrap_feature(feature))

    def find(self,
             between_start=None,
             between_end=None,
             type=None,
             id=None,
             strand=None,
             **qualifiers):
        """
        Iterate over all features matching the search parameters.

        - ``between_start`` and ``between_end`` can be used to restrict the search range. A missing bound defaults to
          ``0`` and ``sys.maxsize`` respectively; an explicit ``0`` is honoured as a real bound.
        - ``type``, ``id``, and ``strand`` each restrict the search to features that match on these attributes
        - ``qualifiers`` is an arbitrary group of keyword arguments that will be matched to the qualifier keys of
          each feature. Each key must be present and have the same value as in the search parameters.

        """

        # Compare against None rather than relying on truthiness: position 0 is a valid bound.
        if between_start is not None or between_end is not None:
            range_start = 0 if between_start is None else between_start
            range_end = sys.maxsize if between_end is None else between_end
            it = self.overlap(range_start, range_end)
        else:
            it = iter(self)

        attrs = [(k, v)
                 for k, v in (('type', type), ('id', id), ('strand', strand))
                 if v is not None]

        for feature in it:
            if any(getattr(feature, key) != value for key, value in attrs):
                continue
            if any(
                    feature.qualifiers.get(key) != value
                    for key, value in qualifiers.items()):
                continue
            yield feature

    def overlap(self, start, end):
        """
        Returns an iterator over all features in the collection that overlap the given range.

        :param int start: overlap region start
        :param int end: overlap region end (inclusive)
        :raises RuntimeError: if ``start`` is larger than ``end``
        """
        if start > end:
            raise RuntimeError("start cannot be larger than end.")

        # The tree is queried with end + 1 so that ``end`` itself is part of the range.
        for f in sorted(self._features.search(start, end + 1)):
            yield f.data

    def difference(self, other):
        """
        :returns: a new collection with the features of this collection that are not in ``other``
        :rtype: :class:`FeatureSet`
        """
        fs = FeatureSet(feature_class=self._feature_class)
        fs._features = self._features - other._features
        return fs

    def union(self, other):
        """
        :returns: a new collection with the features of both this collection and ``other``
        :rtype: :class:`FeatureSet`
        """
        fs = FeatureSet(feature_class=self._feature_class)
        fs._features = self._features | other._features
        return fs
def test_all():
    """
    Exercise an IntervalTree end to end on a small fixed set of intervals.

    Covers, in order: point/range queries, membership and ``overlaps``, insertion
    (including duplicates), intervals that differ only in data, copying and casting,
    removal (``remove``, ``discard``, ``remove_overlap``), emptying the tree, and finally
    ``split_overlaps``. The tree is mutated as the test progresses, so the sections depend
    on the state left behind by the preceding ones. Failures surface as AssertionError
    (or whatever ``verify()`` raises).
    """
    from intervaltree import Interval, IntervalTree
    from pprint import pprint
    from operator import attrgetter
    
    # Build an Interval from a [begin, end] pair, labelled "begin-end" so that assertions
    # can refer to intervals by a readable string.
    def makeinterval(lst):
        return Interval(
            lst[0], 
            lst[1], 
            "{}-{}".format(*lst)
            )
    
    ivs = list(map(makeinterval, [
        [1,2],
        [4,7],
        [5,9],
        [6,10],
        [8,10],
        [8,15],
        [10,12],
        [12,14],
        [14,15],
        ]))
    t = IntervalTree(ivs)
    t.verify()
    
    # Reduce a collection of intervals to the set of their data labels.
    def data(s): 
        return set(map(attrgetter('data'), s))
    
    # Query tests
    print('Query tests...')
    assert data(t[4])          == set(['4-7'])
    assert data(t[4:5])        == set(['4-7'])
    assert data(t[4:6])        == set(['4-7', '5-9'])
    assert data(t[9])          == set(['6-10', '8-10', '8-15'])
    # 15 is the end of '8-15' and '14-15' yet matches nothing: the end point is excluded.
    assert data(t[15])         == set()
    assert data(t.search(5))   == set(['4-7', '5-9'])
    # strict=True only returns intervals completely inside the queried range.
    assert data(t.search(6, 11, strict = True)) == set(['6-10', '8-10'])
    
    print('    passed')
    
    # Membership tests
    print('Membership tests...')
    assert ivs[1] in t
    assert Interval(1,3, '1-3') not in t
    assert t.overlaps(4)
    assert t.overlaps(9)
    assert not t.overlaps(15)
    assert t.overlaps(0,4)
    assert t.overlaps(1,2)
    assert t.overlaps(1,3)
    assert t.overlaps(8,15)
    assert not t.overlaps(15, 16)
    assert not t.overlaps(-1, 0)
    assert not t.overlaps(2,4)
    print('    passed')
    
    # Insertion tests
    print('Insertion tests...')
    t.add( makeinterval([1,2]) )  # adding duplicate should do nothing
    assert data(t[1])        == set(['1-2'])
    
    t[1:2] = '1-2'                # adding duplicate should do nothing
    assert data(t[1])        == set(['1-2'])
    
    t.add(makeinterval([2,4]))
    assert data(t[2])        == set(['2-4'])
    t.verify()
    
    t[13:15] = '13-15'
    assert data(t[14])       == set(['8-15', '13-15', '14-15'])
    t.verify()
    print('    passed')
    
    # Duplication tests
    # Same range as '14-15' but different data: stored as a separate interval.
    print('Interval duplication tests...')
    t.add(Interval(14,15,'14-15####'))
    assert data(t[14])        == set(['8-15', '13-15', '14-15', '14-15####'])
    t.verify()
    print('    passed')
    
    # Copying and casting
    print('Tree copying and casting...')
    tcopy = IntervalTree(t)
    tcopy.verify()
    assert t == tcopy
    
    tlist = list(t)
    for iv in tlist:
        assert iv in t
    for iv in t:
        assert iv in tlist
    
    tset = set(t)
    assert tset == t.items()
    print('    passed')
    
    # Deletion tests
    # remove() of an interval that is not in the tree must raise ValueError ...
    print('Deletion tests...')
    try:
        t.remove(
            Interval(1,3, "Doesn't exist")
            )
    except ValueError:
        pass
    else:
        raise AssertionError("Expected ValueError")
    
    try:
        t.remove(
            Interval(500, 1000, "Doesn't exist")
            )
    except ValueError:
        pass
    else:
        raise AssertionError("Expected ValueError")
    
    # ... whereas discard() of a missing interval must not raise.
    # NOTE(review): orig is never read afterwards; presumably it was meant to be compared
    # with the structure after the two discards — confirm.
    orig = t.print_structure(True)
    t.discard( Interval(1,3, "Doesn't exist") )
    t.discard( Interval(500, 1000, "Doesn't exist") )
    
    assert data(t[14])        == set(['8-15', '13-15', '14-15', '14-15####'])
    t.remove( Interval(14,15,'14-15####') )
    assert data(t[14])        == set(['8-15', '13-15', '14-15'])
    t.verify()
    
    assert data(t[2])        == set(['2-4'])
    t.discard( makeinterval([2,4]) )
    assert data(t[2])        == set()
    t.verify()
    
    assert t[14]
    t.remove_overlap(14)
    t.verify()
    assert not t[14]
    
    # Emptying the tree
    # Iterate over a sorted snapshot because the tree is modified inside the loop.
    #t.print_structure()
    for iv in sorted(iter(t)):
        #print('### Removing '+str(iv)+'... ###')
        t.remove(iv)
        #t.print_structure()
        t.verify()
        #print('')
    assert len(t) == 0
    assert t.is_empty()
    assert not t
    
    # Rebuild from the original intervals and remove by overlap point.
    t = IntervalTree(ivs)
    #t.print_structure()
    t.remove_overlap(1)
    #t.print_structure()
    t.verify()
    
    t.remove_overlap(8)
    #t.print_structure()    
    print('    passed')
    
    # Print the tree before and after split_overlaps() for visual inspection; nothing is
    # asserted here.
    t = IntervalTree(ivs)
    pprint(t)
    t.split_overlaps()
    pprint(t)
    #import cPickle as pickle
    #p = pickle.dumps(t)
    #print(p)
    
from datetime import datetime, date
from intervaltree import IntervalTree

class ScheduleItem:
    """A scheduled course: its number plus start and end times given as clock strings."""

    def __init__(self, course_number, start_time, end_time):
        self.course_number, self.start_time, self.end_time = (
            course_number, start_time, end_time)

    def get_begin(self):
        """Return the start time as minutes from midnight."""
        begin = minutes_from_midnight(self.start_time)
        return begin

    def get_end(self):
        """Return the end time as minutes from midnight."""
        finish = minutes_from_midnight(self.end_time)
        return finish

    def __repr__(self):
        fields = (self.course_number, self.start_time, self.end_time)
        return "{ScheduleItem: " + str(fields) + "}"

def minutes_from_midnight(time):
    """Return the number of whole minutes between midnight and a clock time.

    ``time`` is a 12-hour clock string in ``%I:%M%p`` format, e.g. ``"9:00AM"``.
    ``datetime.strptime`` raises ValueError if the string does not match.

    The result is computed directly from the parsed hour and minute, so it does not
    depend on the current date or time.
    """
    parsed = datetime.strptime(time, '%I:%M%p')
    return parsed.hour * 60 + parsed.minute

# Index every schedule item by its begin/end in minutes from midnight, keeping the item
# itself as the interval's data. The tree stores intervals, not arbitrary objects, so the
# items are converted to (begin, end, data) tuples first.
T = IntervalTree.from_tuples(
    (item.get_begin(), item.get_end(), item)
    for item in [ScheduleItem(28374, "9:00AM", "10:00AM"),
                 ScheduleItem(43564, "8:00AM", "12:00PM"),
                 ScheduleItem(53453, "1:00AM", "2:00AM")])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(T.search(minutes_from_midnight("9:00PM"), minutes_from_midnight("10:00PM")))
Exemple #32
0
def interval_intersect_interval(**kwargs):
    '''
    Efficient algorithm to find which intervals intersect

    Handles both unix timestamp or datetime object

    Parameters (keyword only):
    --------------------------

    groundtruth:
        sequence of label segments; element 0 is the start and element 1 the end
    prediction:
        sequence of predicted segments, same layout

    Return:
    -------

    dict with the keys

    prediction_gt:
        list with same size as prediction,
        will be 1 if there's an overlapping label
        0 if not
    recall:
        fraction of labels overlapped by at least one prediction
        (nan if there are no labels)
    precision:
        mean of prediction_gt
    overlap:
        how much overlap between label and prediction, summed over all
        overlapping pairs; None if nothing overlaps
    missed:
        total groundtruth minus the total overlap
    '''

    gt = kwargs['groundtruth']
    pred = kwargs['prediction']

    # calculate recall: index the predictions, then probe with every label
    tree = IntervalTree()
    for segment in pred:
        tree.add(Interval(segment[0], segment[1]))

    TP = sum(1 for segment in gt
             if len(tree.search(segment[0], segment[1])) != 0)

    # float() keeps this a true division on Python 2 as well; an empty
    # groundtruth yields nan instead of a ZeroDivisionError.
    recall = float(TP) / len(gt) if len(gt) else float('nan')

    # calculate precision: index the labels, then probe with every prediction
    tree = IntervalTree()
    for segment in gt:
        tree.add(Interval(segment[0], segment[1]))

    # The running total starts as None rather than 0 because the overlap may
    # be a number or a duration, depending on the segment type.
    total_overlap = None
    prediction_gt = []
    for segment in pred:
        overlap = tree.search(segment[0], segment[1])

        for label in overlap:
            amount = get_overlap(label, segment)
            if total_overlap is None:
                total_overlap = amount
            else:
                total_overlap += amount

        prediction_gt.append(1 if len(overlap) != 0 else 0)

    total_groundtruth = _get_sum(gt)

    result = {}
    result['prediction_gt'] = prediction_gt
    result['recall'] = recall
    result['precision'] = np.mean(prediction_gt)

    result['overlap'] = total_overlap
    if total_overlap is None:
        # Nothing overlapped, so the whole groundtruth was missed. Subtracting
        # None would raise a TypeError.
        result['missed'] = total_groundtruth
    else:
        result['missed'] = total_groundtruth - total_overlap

    return result
Exemple #33
0
    def _vj_handshakes(self):
        """
        Merge V-only and J-only sequences that overlap into joined sequences.

        All distinct V sequences (most frequent first) are joined with ``|`` into one text
        that is indexed by a suffix tree; an interval tree maps each position of that text
        back to the V sequence it belongs to. Every distinct J sequence is then looked up in
        the suffix tree. When the lookup is terminal, the overlap is at least
        ``self._settings.overlapLen`` long and both sequences share a chain type, the two
        are merged and ``self.cdr3_dict``, ``self.just_v_dict``, ``self.just_j_dict`` and
        ``self.pSeq_read_map`` are updated for the merged sequence.

        Returns a list holding each merged sequence once per occurrence of its V and J
        parts (count of V plus count of J).
        """
        handshakes = []
        just_v = Counter(self.just_v)
        just_j = Counter(self.just_j)

        itree = IntervalTree()

        # Must be a real list: it is iterated twice below (interval tree and join). A lazy
        # map() would be exhausted after the first pass on Python 3, leaving the joined
        # text empty.
        just_v_keys = [seq for seq, _ in
                       sorted(just_v.items(), key=lambda z: z[1], reverse=True)]

        # Each V sequence owns the positions of its characters plus the following '|'.
        start = 0
        for v in just_v_keys:
            end = start + len(v) + 1
            itree.addi(start, end, v)
            start = end

        all_v_suf = "|".join(just_v_keys)
        stree = IgorSuffixTree(all_v_suf)

        for j, countJ in just_j.items():
            overlap, index, terminal = stree.search_stree(j)
            if not terminal or len(j[:overlap]) < self._settings.overlapLen:
                continue

            # V sequence that owns the matched position of the joined text.
            v_seq = list(itree.search(index))[0].data

            v_chains = self.pSeq_read_map[v_seq]["chain_type"]
            j_chains = self.pSeq_read_map[j]["chain_type"]
            common_chains = set(v_chains.keys()) & set(j_chains.keys())
            if not common_chains:
                continue

            # Collect gene types per side and the chains of both sides per shared key.
            v_t = []
            j_t = []
            chtype = {}
            for chains, gene_types in ((v_chains, v_t), (j_chains, j_t)):
                for key, ch in chains.items():
                    if key in common_chains:
                        gene_types.extend(map(getGeneType, ch))
                        if key not in chtype:
                            chtype[key] = []
                        chtype[key].extend(ch)

            j_tail = j[overlap:]
            if len(j_tail) > 0:
                # J extends past the overlap: append the remainder to V.
                newly_born_cdr3 = v_seq + j_tail
            else:
                # Nothing of J is left after the overlap: cut V right after the last
                # occurrence of J.
                position_of_j_in_v = v_seq.rfind(j)
                newly_born_cdr3 = v_seq[:position_of_j_in_v + len(j)]

            # Move the reads of both parts over to the merged sequence.
            if newly_born_cdr3 not in self.cdr3_dict:
                self.cdr3_dict[newly_born_cdr3] = []
            if v_seq in self.just_v_dict:
                self.cdr3_dict[newly_born_cdr3].extend(self.just_v_dict[v_seq])
            if j in self.just_j_dict:
                self.cdr3_dict[newly_born_cdr3].extend(self.just_j_dict[j])
            if v_seq in self.just_v_dict:
                del self.just_v_dict[v_seq]
            if j in self.just_j_dict:
                del self.just_j_dict[j]

            countVJ = just_v[v_seq] + countJ
            handshakes.extend([newly_born_cdr3] * countVJ)
            self.pSeq_read_map[newly_born_cdr3] = {"v": v_t, "j": j_t, "chain_type": chtype, "overlap": overlap}
        return handshakes
Exemple #34
0
class IntervalTreeSet(AbstractDataset):
    """Dataset of IPv4 ranges stored in an interval tree.

    Records are staged in a temporary list by ``reload_start`` and
    ``reload_line``; the live tree queried by ``get`` is only replaced
    when ``reload_end`` is called.
    """

    def __init__(self, filename):
        AbstractDataset.__init__(self, filename)
        # Live tree queried by get(); replaced wholesale in reload_end().
        self.backend = IntervalTree([])
        self.nodecount = 0

        # Reload staging state.
        # ``intervals`` is not read by any method of this class; it is kept
        # so existing code that touches the attribute keeps working.
        self.intervals = None
        # Initialised here so the attribute always exists, even before the
        # first reload_start() call.
        self.tmpintervals = None
        self.tmpcount = 0

    def reload_start(self, defaults):
        """Begin a reload by resetting the staging list and its counter."""
        self.tmpintervals = []
        self.tmpcount = 0

    def reload_line(self, line, defaults):
        """Parse one input line and stage its IPv4 range for the next tree.

        Ranges spanning more than ``defaults.maxrange4`` addresses are
        logged and dropped when that limit is set.
        """
        value, data = self.create_default_datarecord(line, defaults)

        # TODO: how do we initialize default TTL from command line

        lower, upper = ip4range(value)
        lowerlong = ip2long(lower)
        upperlong = ip2long(upper)

        too_wide = (defaults.maxrange4 is not None
                    and upperlong - lowerlong > defaults.maxrange4)
        if too_wide:
            # logging.warn is a deprecated alias; the message text is unchanged.
            logging.warning("MAXRANGE4 prohobits adding %s in %s",
                            value, self.filename)
            return

        interval = Interval(lowerlong, upperlong)
        interval.data = data
        self.tmpintervals.append(interval)
        self.tmpcount += 1

    def reload_end(self, defaults):
        """Build a tree from the staged intervals and make it the live one."""
        self.backend = IntervalTree(self.tmpintervals)
        self.nodecount = self.tmpcount
        # Drop the staging state; the new tree now owns the intervals.
        self.tmpintervals = None
        self.tmpcount = 0

    def get_record_count(self):
        """Return the number of intervals loaded by the last reload."""
        return self.nodecount

    def get(self, query):
        """Look up ``query`` and return the data of the first matching range.

        Returns ``None`` when no range matches or when any matching range
        carries a truthy ``'excluded'`` entry. When the matched record has a
        ``'TXT'`` entry, a shallow copy with the rendered text is returned
        so the record stored in the tree keeps its original value.
        """
        import copy

        # NOTE(review): presumably the query arrives with reversed octets
        # and ipreverse() restores address order - confirm against caller.
        query = ipreverse(query)
        q = ip2long(query)
        res = self.backend.search(q)

        # An exclusion on any matching range vetoes the whole lookup.
        for r in res:
            try:
                if r.data['excluded']:
                    return None
            except KeyError:
                continue

        if not res:
            return None

        # No exclusions: answer with the first match.
        data = res[0].data
        if 'TXT' in data:
            # Render into a copy: writing the result back into the shared
            # record would overwrite the stored TXT value, and every later
            # query hitting this range would start from the rendered text.
            data = copy.copy(data)
            data['TXT'] = self.apply_txt_template(
                data['TXT'], query, data['A'], self.defaults)
        return data
                m4_records[m4_record.id] = m4_record

# Index every record in `tree` by its target span, storing the record id as
# the payload so overlap queries can map a hit back to its record.
# NOTE(review): `tree` is created outside this excerpt; slice assignment
# suggests an intervaltree-style container - confirm.
m4_lists = m4_records.keys()
for m4_id in m4_lists:
    m4_record = m4_records[m4_id]
    # Key is the [target_start, target_end) slice; value is the record id.
    tree[m4_record.target_start:m4_record.target_end] = m4_id

overlap_dict = {}
for m4_id in m4_lists:
    m4_record = m4_records[m4_id]
    length = m4_record.query_len
    large = []
    medium = []
    small = []
    overlap_list = list(
        tree.search(m4_record.target_start, m4_record.target_end))
    # print overlap_list
    for overlap_rec in overlap_list:
        if overlap_rec.data != m4_id:
            x = range(m4_record.target_start, m4_record.target_end)
            y = range(overlap_rec.begin, overlap_rec.end)
            # print x
            # print y
            # print set(x) & set(y)
            ovelap_len = len(set(x) & set(y))
            overlap_frac = float(ovelap_len) / length
            if overlap_frac >= 0.5:
                large.append(overlap_rec.data)
            elif overlap_frac >= 0.25:
                medium.append(overlap_rec.data)
            elif overlap_frac > 0: