def __init__(self, root, train=True, normalize=True, window=16384, translate=False, pitch_shift=0, jitter=0., stride=512):
    """Load the whale-call dataset: resample every referenced WAV to 44.1 kHz
    and index positive call intervals per file in an IntervalTree.

    :param root: dataset root; audio/labels live under whois/train_data_09222019/
    :param train: if True read train.tsv, else dev.tsv
    :param normalize, window, translate, pitch_shift, jitter: augmentation
        settings stored for later use (not applied here)
    :param stride: hop size in samples; one "datapoint" per stride of audio
    """
    self.normalize = normalize
    self.window = window
    self.pitch_shift = pitch_shift
    self.jitter = jitter
    self.translate = translate
    self.stride = stride
    self.root = os.path.join(os.path.expanduser(root), 'whois/train_data_09222019/')
    if train:
        labelfile = os.path.join(self.root, 'train.tsv')
    else:
        labelfile = os.path.join(self.root, 'dev.tsv')
    # Target sample rate (Hz); all label times below are converted with it.
    fs = 44100.
    self.size = 0
    self.data = dict()          # wav name -> resampled float32 waveform
    self.labels = dict()        # wav name -> IntervalTree of positive spans (in samples)
    self._base_idx = dict()     # wav name -> first global datapoint index of that file
    self._cumsize = dict()      # first global datapoint index -> wav name (reverse map)
    with open(labelfile) as f:
        for i, (wav, start, dur, loc, date, master) in enumerate(csv.reader(f, delimiter='\t')):
            if i == 0:
                # Skip the TSV header row.
                continue
            if wav not in self.data.keys():
                # First time we see this file: read it and linearly resample
                # from its native rate xfs to fs via np.interp on scaled axes.
                xfs, x = wavfile.read(os.path.join(self.root, 'wav', wav))
                xp = np.arange((fs / xfs) * len(x), dtype=np.float32)
                x = np.interp((xfs / fs) * xp, np.arange(len(x), dtype=np.float32), x).astype(np.float32)
                self.data[wav] = x
                self.labels[wav] = IntervalTree()
                self._base_idx[wav] = self.size
                self._cumsize[self.size] = wav
                # Each stride of samples contributes one datapoint.
                self.size += len(x) // self.stride
            if float(dur) < 0.1:
                # Ignore labels shorter than 100 ms.
                continue
            # Mark the labelled span (converted from seconds to sample indices).
            self.labels[wav][int(float(start) * fs):int((float(start) + float(dur)) * fs)] = 1
    self._sorted_base = sorted(self._cumsize.keys())
    print('Loaded dataset with {} datapoints'.format(self.size))
def make_gaps_tree(in_file):
    """Return a dict mapping each FASTA header to an IntervalTree of its gap
    coordinates, where every interval's data is the (start, end) tuple."""
    trees = dict()
    reader = SeqReader(in_file)
    for header, sequence in reader.parse_fasta():
        # Strip the leading '>' and keep only the first space-delimited token.
        name = header[1:].split(' ')[0]
        tree = IntervalTree()
        for match in GapSequence(sequence).get_gap_coords():
            span = (match.start(0), match.end(0))
            tree[span[0]:span[1]] = span
        trees[name] = tree
    return trees
def get_unique_loci(intervals):
    """Merge overlapping (genome, seqid, start, end) records per sequence and
    return the merged loci as a flat list of 4-tuples."""
    by_sequence = defaultdict(list)
    for genome, seqid, begin, finish in intervals:
        by_sequence[(genome, seqid)].append(Interval(begin, finish))

    merged = []
    for (genome, seqid), ivs in by_sequence.items():
        tree = IntervalTree(ivs)
        tree.merge_overlaps()
        merged.extend((genome, seqid, iv.begin, iv.end) for iv in tree)
    return merged
def make_intervaltree(df: pd.DataFrame) -> IntervalTree:
    """Build an IntervalTree from a dataframe of operations.

    Each row contributes an interval [offset - response_size, offset) whose
    data is (kind, row index). Zero-length rows (response_size == 0) are
    skipped, since an empty interval cannot be stored.

    :raises Exception: if the dataframe is empty.
    """
    if df.empty:
        raise Exception(
            "Error! Try to make intervaltree from empty dataframe.")
    intervals = []
    for idx, entry in df.iterrows():
        start = entry['offset'] - entry['response_size']
        if start == entry['offset']:
            # response_size == 0 -> empty interval; nothing to index.
            # (fixed typo: was "Emtpy interval! .. skip!")
            print("Empty interval! .. skip!")
            continue
        intervals.append(Interval(start, entry['offset'], (entry['kind'], idx)))
    return IntervalTree(intervals)
def __init__(self, *args, **kwargs) -> None: """Initialize the EdgeCollection object.""" # initialize the base class super().__init__(*args, **kwargs) # indicator whether the network has multi-edges self._multiple: bool = kwargs.pop('multiedges', False) # initialize an intervaltree to save events self._events = IntervalTree() # class of objects self._default_class: Any = TemporalEdge
def find_clusters(index_tree):
    """
    Define a root cluster for each smoothing maximum

    Args:
        index_tree (IntervalTree): data are lists of tuples of 4 elements
            (min or max, cds region, genomic position, smoothing score).
            A first element of 1 marks a maximum.

    Returns:
        clusters_tree (IntervalTree): data are dict of dict; each cluster dict
            holds 'max', 'left_m' and 'right_m' (cds, position, score) tuples.
    """
    clusters_tree = IntervalTree()
    for interval in index_tree:
        clusters = defaultdict(dict)
        j = 0  # running cluster id within this interval
        indexes = interval.data
        # Iterate through all maximum and generate a cluster per maximum
        generator_maxs = (i for i in indexes if i[0] == 1)
        for maximum in generator_maxs:
            # NOTE(review): list.index returns the FIRST occurrence; if two
            # identical maximum tuples exist this could mis-locate — confirm
            # tuples are unique within indexes.
            i = indexes.index(maximum)
            # Add maximum
            clusters[j]['max'] = (maximum[1], maximum[2], maximum[3])
            # Add margins
            # if maximum not in first nor last position
            if i != 0 and i != len(indexes) - 1:
                # if no contiguous left max, the neighbour is the left margin
                if indexes[i - 1][0] != 1:
                    clusters[j]['left_m'] = (indexes[i - 1][1], indexes[i - 1][2], indexes[i - 1][3])
                # otherwise the maximum is its own left margin
                else:
                    clusters[j]['left_m'] = (maximum[1], maximum[2], maximum[3])
                # if no contiguous right max, the neighbour is the right margin
                if indexes[i + 1][0] != 1:
                    clusters[j]['right_m'] = (indexes[i + 1][1], indexes[i + 1][2], indexes[i + 1][3])
                # otherwise the maximum is its own right margin
                else:
                    clusters[j]['right_m'] = (maximum[1], maximum[2], maximum[3])
            # if first position: no left neighbour exists
            elif i == 0:
                clusters[j]['left_m'] = (maximum[1], maximum[2], maximum[3])
                clusters[j]['right_m'] = (indexes[i + 1][1], indexes[i + 1][2], indexes[i + 1][3])
            # if last position: no right neighbour exists
            else:
                clusters[j]['left_m'] = (indexes[i - 1][1], indexes[i - 1][2], indexes[i - 1][3])
                clusters[j]['right_m'] = (maximum[1], maximum[2], maximum[3])
            j += 1
        clusters_tree.addi(interval[0], interval[1], clusters)
    return clusters_tree
def _augment(self, source, times=1, gain=-8):
    """Overlay background audio from *source* onto every sample in the buffer.

    Builds a virtual timeline (IntervalTree, milliseconds) of the augmentation
    samples, then for each buffered sample mixes in the overlapping background
    segments *times* passes, attenuated by *gain* dB relative to the original.

    :param source: where to load augmentation samples from (see _load_samples)
    :param times: number of overlay passes per sample
    :param gain: dB offset applied to the background mix (negative = quieter)
    """
    aug_samples = self._load_samples(source)
    tree = IntervalTree()
    # Durations in whole milliseconds, rounded up.
    aug_durs = self._map(
        aug_samples, lambda s: int(math.ceil(s.file.duration * 1000.0)))
    total_aug_dur = sum(aug_durs)
    position = 0
    # Lay the augmentation samples end-to-end on a millisecond timeline.
    for i, sample in enumerate(aug_samples):
        duration = aug_durs[i]
        tree[position:position + duration] = sample
        position += duration

    def prepare_sample(s):
        # Flush the sample to disk and report its duration in ms.
        s.write()
        return int(math.ceil(s.file.duration * 1000.0))

    orig_durs = self._map(self.samples, prepare_sample)
    total_orig_dur = sum(orig_durs)
    positions = []
    position = 0
    # Lay the buffered samples on the same kind of end-to-end timeline.
    for i, sample in enumerate(self.samples):
        duration = orig_durs[i]
        positions.append((position, sample))
        position += duration

    def augment_sample(pos_sample):
        position, sample = pos_sample
        orig_seg = sample.read_audio_segment()
        orig_dur = len(orig_seg)
        # Start from silence and accumulate the background overlays.
        aug_seg = AudioSegment.silent(duration=orig_dur)
        sub_pos = position
        for i in range(times):
            # Background samples overlapping this sample's window.
            inters = tree[sub_pos:sub_pos + orig_dur]
            for inter in inters:
                seg = inter.data.read_audio_segment()
                offset = inter.begin - sub_pos
                if offset < 0:
                    # Background started before our window: trim its head.
                    seg = seg[-offset:]
                    offset = 0
                aug_seg = aug_seg.overlay(seg, position=offset)
            # Shift the window each pass; wrap around the background timeline.
            sub_pos = (sub_pos + total_orig_dur) % total_aug_dur
        # Level-match the background to the original, then apply gain (dB).
        aug_seg = aug_seg + (orig_seg.dBFS - aug_seg.dBFS + gain)
        orig_seg = orig_seg.overlay(aug_seg)
        sample.write_audio_segment(orig_seg)

    self._map(positions, augment_sample)
    print('Augmented %d samples in buffer.' % len(self.samples))
def get_exon_locs(protfile=PROTFILE, chrom=CHROMOSOMES):
    """
    Parse an Ensembl protein FASTA and index exon locations per chromosome.

    :param protfile: path to the (optionally gzipped) protein FASTA whose
        headers embed 'chromosome:<build>:<chrom>:<exon spans>' and 'gene:<id>'
    :param chrom: iterable of chromosome IDs (with 'chr' prefix) to keep
    :return: dict of chromosome ID -> IntervalTree; each interval's data is
        '<geneID>_<start>-<end>'. Returns empty trees if protfile is missing.
    """
    genelocs = {chromID: IntervalTree() for chromID in chrom}
    if not os.path.isfile(protfile):
        sys.stderr.write('Could not open ' + protfile + '\n')
        return genelocs
    # Transparently handle gzipped input.
    x = gzip.open(protfile) if protfile.endswith('gz') else open(protfile)
    locstart = 'chromosome:' + GENOME_BUILD + ':'
    geneintervals = {}
    for l in x:
        if l.startswith('>'):
            # Location token looks like '<chrom>:join(12..34,56..78)'.
            loc = l[l.find(locstart) + len(locstart):].split()[0]
            chromID = loc.split(':')[0]
            # Normalise to a 'chr' prefix before filtering.
            chromID = ('' if chromID.startswith('chr') else 'chr') + chromID
            if chromID not in chrom:
                continue
            # Strip join()/complement() wrappers, leaving 'a..b' spans.
            exons = loc.split(':')[1].replace('join(', '').replace('complement(', '').replace(')', '').split(',')
            # Sort each span so start <= end regardless of strand notation.
            exons = [tuple(sorted([int(a.split('..')[0]), int(a.split('..')[1])])) for a in exons]
            geneID = l[l.find('gene:') + 5:].split()[0]  # e.g., ENSG00000000457.9
            if chromID not in geneintervals:
                geneintervals[chromID] = {}
            if geneID not in geneintervals[chromID]:
                geneintervals[chromID][geneID] = set()
            # Add each POSITIVE exon SEPARATELY!
            for exon_start, exon_end in exons:
                # these are 1-indexed, but we need 0-indexed...
                if exon_start > -1 and exon_end > -1:
                    geneintervals[chromID][geneID].add((exon_start, exon_end))
    x.close()
    # Store all the information as interval trees:
    for chromID in geneintervals.keys():
        for geneID in geneintervals[chromID].keys():
            for exon_start, exon_end in geneintervals[chromID][geneID]:
                # Widen zero-length exons by 1 so the interval is non-empty.
                genelocs[chromID].addi(exon_start, exon_end + (1 if exon_end == exon_start else 0), geneID + '_' + str(exon_start) + '-' + str(exon_end))
    return genelocs
def score(clusters_tree, regions, mutations_element):
    """
    Score clusters with fraction of mutations formula and number of cluster's mutations

    Args:
        clusters_tree( IntervalTree): genomic regions are intervals, data are trimmed clusters (dict of dict)
        regions (IntervalTree): IntervalTree where intervals are genomic positions of an element
        mutations_element (int): number of mutations in the element

    Returns:
        score_clusters_tree (IntervalTree): genomic regions are intervals, data are scored clusters (dict of dict)
    """
    score_clusters_tree = IntervalTree()
    # Score decays by a factor of sqrt(2) per base of distance to the maximum.
    root = m.sqrt(2)
    for interval in clusters_tree:
        clusters = interval.data.copy()
        for cluster, values in clusters.items():
            score_ = 0
            mutated_positions_d = defaultdict(int)
            # Get number of mutations on each mutated position
            for mutation in values['mutations']:
                mutated_positions_d[mutation.position] += 1
            # Map mutated position and smoothing maximum to region
            for position, count in mutated_positions_d.items():
                map_mut_pos = set()
                map_smo_max = set()
                if regions[position]:
                    # NOTE(review): these loops rebind map_mut_pos/map_smo_max
                    # to the LAST overlapping Interval (not a set, despite the
                    # set() init above); [0]/[1] below then read its begin/end.
                    # Confirm regions never yields multiple overlaps here.
                    for i in regions[position]:
                        map_mut_pos = i
                    for i in regions[values['max'][1]]:
                        map_smo_max = i
                    # Calculate distance of position to smoothing maximum
                    # Same region: plain genomic distance.
                    if map_mut_pos[0] == map_smo_max[0]:
                        distance_to_max = abs(position - values['max'][1])
                    # Mutation region upstream of maximum region: hop the gap.
                    elif map_mut_pos[0] < map_smo_max[0]:
                        distance_to_max = (map_mut_pos[1] - position) + (values['max'][1] - map_smo_max[0])
                    # Mutation region downstream of maximum region.
                    else:
                        distance_to_max = (map_smo_max[1] - values['max'][1]) + (position - map_mut_pos[0])
                    # Calculate fraction of mutations
                    numerator = (count / mutations_element) * 100
                    # Calculate cluster score
                    denominator = m.pow(root, distance_to_max)
                    score_ += (numerator / denominator)
            # Update: final score is weighted by the cluster's mutation count.
            clusters[cluster]['score'] = score_ * len(values['mutations'])
        score_clusters_tree.addi(interval[0], interval[1], clusters)
    return score_clusters_tree
def __init__(self, interval_tuples:Iterator[Tuple[Chrom,int,int,GeneName]]):
    '''interval_tuples is like [('22', 12321, 12345, 'APOL1'), ...]'''
    # Per-chromosome interval tree plus sorted start/end lookup tables.
    self._its: Dict[Chrom,IntervalTree] = {}
    starts_by_chrom: Dict[Chrom,List[Tuple[int,GeneName]]] = {}
    ends_by_chrom: Dict[Chrom,List[Tuple[int,GeneName]]] = {}
    for chrom, pos_start, pos_end, gene_name in interval_tuples:
        tree = self._its.setdefault(chrom, IntervalTree())
        tree.add(Interval(pos_start, pos_end, gene_name))
        starts_by_chrom.setdefault(chrom, []).append((pos_start, gene_name))
        ends_by_chrom.setdefault(chrom, []).append((pos_end, gene_name))
    # BisectFinders give O(log n) nearest-gene lookups by start/end position.
    self._gene_starts = {c: BisectFinder(t) for c, t in starts_by_chrom.items()}
    self._gene_ends = {c: BisectFinder(t) for c, t in ends_by_chrom.items()}
def get_interval_cu(self, cu_id):
    """Collect cycle counts and interval trees for one compute unit, split
    into memory-load, memory-store and other instruction categories."""
    def query(condition):
        # One DB query -> (cycle count, IntervalTree of instruction spans).
        cycles, raw_intervals = self.get_interval_cu_cond(cu_id, condition)
        return cycles, IntervalTree(Interval(*iv) for iv in raw_intervals)

    mem_ld_cycle, mem_ld_tree = query('LIKE "%MEM LD%"')
    mem_st_cycle, mem_st_tree = query('LIKE "%MEM ST%"')
    other_cycle, other_tree = query('NOT LIKE "%MEM LD%"')

    # Overall cycle span of this CU.
    cycle = self.get_max('inst', 'start + length', ' WHERE cu=' + str(cu_id))

    return {
        'mem_ld': mem_ld_tree,
        'mem_st': mem_st_tree,
        'other': other_tree,
        'cycle_all': cycle,
        'cycle_mem_ld': mem_ld_cycle,
        'cycle_mem_st': mem_st_cycle,
        'cycle_other': other_cycle,
    }
def generate_interval_tree(peak_properties):
    """Construct an interval tree containing the elution windows of the analytes.

    Args:
        peak_properties (dict): analyte key -> dict with 'scan_start_time'
            and 'peak_width' entries.

    Returns:
        IntervalTree: intervals span each analyte's elution window; the
        interval data is the analyte key.
    """
    window_tree = IntervalTree()
    for analyte_key, props in peak_properties.items():
        begin = props["scan_start_time"]
        window_tree[begin:begin + props["peak_width"]] = analyte_key
    return window_tree
def scan_tree(intervals):
    """Construct an interval tree from the supplied genomic intervals, check
    every interval against the tree, and return the data of all intervals that
    hit 2 or more intervals (i.e. itself + at least 1 other).

    :param intervals: iterable of (begin, end[, data]) tuples
    :return: set of interval data values involved in an overlap
    """
    tree = IntervalTree(Interval(*iv) for iv in intervals)
    overlapping = set()
    for g in intervals:
        # Query once per interval (the original queried twice) and reuse.
        hits = tree.overlap(g[0], g[1])
        if len(hits) > 1:
            for hit in hits:
                overlapping.add(hit.data)
    return overlapping
def test_copy_cast():
    # Casting one IntervalTree to another must yield an equal, valid copy.
    original = trees['ivs1']()
    duplicate = IntervalTree(original)
    duplicate.verify()
    assert original == duplicate
    # Round-tripping through a list preserves membership in both directions.
    as_list = list(original)
    assert all(iv in original for iv in as_list)
    assert all(iv in as_list for iv in original)
    # Casting to a set matches the tree's own item set.
    assert set(original) == original.items()
def calc_current_cnv_lineage(self, start, end, cluster_num, phylogeny):
    """Return (paternal_tree, maternal_tree): CNV intervals inside
    [start, end) restricted to the lineage of *cluster_num*, with overlaps
    split and re-merged via self.sum_levels.

    :param start: window start position
    :param end: window end position
    :param cluster_num: cluster whose lineage to keep
    :param phylogeny: object providing get_lineage(cluster_num)
    """
    lineage_clusters, _ = phylogeny.get_lineage(cluster_num)
    # The paternal and maternal computations were identical copy-paste
    # blocks; both now share one helper.
    pat_tree = self._lineage_subtree(self.paternal_tree, start, end, lineage_clusters)
    mat_tree = self._lineage_subtree(self.maternal_tree, start, end, lineage_clusters)
    return pat_tree, mat_tree

def _lineage_subtree(self, source_tree, start, end, lineage_clusters):
    """Slice *source_tree* at the window edges, keep intervals whose data
    belongs to *lineage_clusters*, then split and merge overlaps."""
    intervals = source_tree.copy()
    intervals.slice(start)
    intervals.slice(end)
    tree = IntervalTree()
    for iv in intervals.envelop(start, end):
        if iv.data.cluster_num in lineage_clusters:
            tree.add(iv)
    tree.split_overlaps()
    tree.merge_overlaps(data_reducer=self.sum_levels)
    return tree
def generate_phase_switching(self):
    """Simulate phase switching per chromosome: tile each chromosome with
    exponentially-sized segments (mean 1 Mb) of alternating phase and return
    a dict of chromosome -> IntervalTree keyed by the phase flag."""
    switches = {}
    for chrom, size in self.csize.items():
        tree = IntervalTree()
        pos = 1
        phase_ok = True
        while pos < size:
            seg_len = np.floor(np.random.exponential(1e6))
            tree[pos:pos + seg_len] = phase_ok
            # Flip phase for the next segment.
            phase_ok = not phase_ok
            pos += seg_len
        switches[chrom] = tree
    return switches
def read_targets(in_bed, targets):
    """Read a BED file and record each target region in *targets*
    (chrom -> IntervalTree), converting BED's 0-based half-open coordinates
    to 1-based positions."""
    with open(in_bed, 'rt') as handle:
        for raw_line in handle:
            cols = raw_line.rstrip().split()
            if len(cols) < 3:
                # Malformed/short line: ignore.
                continue
            name = cols[0][3:] if cols[0].startswith('chr') else cols[0]
            # bed encodes first chromosomal position as 0
            begin = int(cols[1]) + 1
            # bed stores open intervals, so no need to add 1
            finish = int(cols[2])
            tree = targets.setdefault(name, IntervalTree())
            # IntervalTree stores open end intevals, so we need to add 1 to stop.
            tree.addi(begin, finish + 1)
def test_original_sequence():
    # Replay a recorded add/remove script in its original order.
    script = [
        ('add', 17.89, 21.89),
        ('add', 11.53, 16.53),
        ('remove', 11.53, 16.53),
        ('remove', 17.89, 21.89),
        ('add', -0.62, 4.38),
        ('add', 9.24, 14.24),
        ('add', 4.0, 9.0),
        ('remove', -0.62, 4.38),
        ('remove', 9.24, 14.24),
        ('remove', 4.0, 9.0),
        ('add', 12.86, 17.86),
        ('add', 16.65, 21.65),
        ('remove', 12.86, 17.86),
    ]
    t = IntervalTree()
    for op, begin, end in script:
        if op == 'add':
            t.addi(begin, end)
        else:
            t.removei(begin, end)
def get_single_iv_tree(curr_path):
    """Read a snapshot bitmap file and return an IntervalTree of its LBA
    ranges (populated via add_by_lba_cb); raises on read failure."""
    log_info_msg("[get_single_iv_tree] enter")
    curr_tree = IntervalTree()
    # add_by_lba_cb inserts each bitmap run into curr_tree as it is parsed.
    retval = read_snapshot_bitmap(curr_path, add_by_lba_cb, curr_tree)
    if not retval:
        # log_err_msg("[get_single_iv_tree] read_snapshot_bitmap failed")
        # NOTE(review): raise_and_logging_error presumably raises, which
        # would make the return None below unreachable — confirm.
        xlogging.raise_and_logging_error(
            r'读取位图文件失败',
            r'[get_single_iv_tree] get read_snapshot_bitmap failed')
        return None
    count = len(curr_tree)
    log_dbg_msg("[get_single_iv_tree] count={}".format(count))
    return curr_tree
def test_chop():
    # Each case chops [0, 10) and states the exact surviving intervals.
    cases = [
        ((3, 7), [Interval(0, 3), Interval(7, 10)]),   # middle -> two pieces
        ((0, 7), [Interval(7, 10)]),                   # left-aligned
        ((5, 10), [Interval(0, 5)]),                   # right-aligned
        ((-5, 15), []),                                # superset -> empty
        ((0, 10), []),                                 # exact -> empty
    ]
    for (begin, end), expected in cases:
        t = IntervalTree([Interval(0, 10)])
        t.chop(begin, end)
        assert len(t) == len(expected)
        assert sorted(t) == expected
def __init__(self, scheduler: Scheduler, name: str, id: int, resources_list: Resources = None, capacity_bytes: int = 0):
    """Initialize a shared, byte-capacity-limited resource.

    :param scheduler: owning scheduler passed to the base class
    :param name: resource name
    :param id: resource identifier
    :param resources_list: optional resources handed to the base class
    :param capacity_bytes: total byte capacity for concurrent allocations
    """
    # resource_sharing=True: multiple jobs may hold allocations at once.
    super().__init__(scheduler, name, id, resources_list, resource_sharing=True)
    self.capacity = capacity_bytes
    # NOTE(review): the original inline comment said job_id ->
    # [(start, end, num_bytes)], but the annotation is a single Interval per
    # job — confirm which is accurate against the allocation methods.
    self._job_allocations: Dict[JobId, Interval] = {
    }  # job_id -> [(start, end, num_bytes)]
    # Tree of all live allocation intervals, for overlap queries.
    self._interval_tree = IntervalTree()
def test_update():
    # update() from a set adds its single interval.
    tree = IntervalTree()
    first = Interval(0, 1)
    tree.update(set([first]))
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 1
    assert set(tree).pop() == first
    # update() from a list appends a second, later interval.
    second = Interval(2, 3)
    tree.update([second])
    assert isinstance(tree, IntervalTree)
    assert len(tree) == 2
    assert sorted(tree)[1] == second
def find_remaining(
    itrees: Mapping[str, IntervalTree],
    nstretches: Mapping[str, IntervalTree],
    scaffolds: Mapping[str, SeqRecord],
) -> None:
    """For each scaffold, add to itrees[scaffold] (in place) the intervals of
    the sequence NOT covered by either its contigs or its N-stretches.

    :param itrees: scaffold -> IntervalTree of contig intervals (mutated!)
    :param nstretches: scaffold -> IntervalTree of N-stretch intervals
    :param scaffolds: scaffold -> sequence record (only len() is used)
    """
    for scaffold, seq in scaffolds.items():
        contigs = itrees[scaffold]
        nstretch = nstretches[scaffold]
        # This is just to remove the data from the intervals.
        # Having data prevents them from being removed with difference.
        intervals = [Interval(i.begin, i.end) for i in contigs]
        intervals.extend(Interval(i.begin, i.end) for i in nstretch)
        covered = IntervalTree(intervals)
        # Strict=false means that adjacent but non-overlapping
        # will also be merged.
        covered.merge_overlaps(strict=False)
        # Union of the full-sequence span with the covered intervals, then
        # split at every boundary and subtract the covered pieces: what's
        # left is exactly the uncovered gaps.
        remaining = IntervalTree([Interval(0, len(seq))]) | covered
        remaining.split_overlaps()
        remaining.difference_update(covered)
        itrees[scaffold].update(remaining)
    return
def mouse_gene_intervals():
    """Build interval trees of protein-coding genes from the mouse (mm10)
    GENCODE annotation, one tree per (chromosome, strand) pair.

    :return: dict mapping '<seqname><strand>' (e.g. 'chr1+') -> IntervalTree
        of 0-based gene spans whose data is the gene_id
    """
    df = read_gtf_as_dataframe(GENCODE_MM10_FILE)
    # BUG FIX: '&' binds tighter than '==', so the unparenthesized
    # df.feature == 'gene' & df.feature_type == 'protein_coding'
    # evaluated ('gene' & df.feature_type) first. Parenthesize both sides.
    df = df[(df.feature == 'gene') & (df.feature_type == 'protein_coding')]
    print(len(df))
    trees = {chromosome_strand: IntervalTree()
             for chromosome_strand in product(MOUSE_CHROMOSOMES, ['+', '-'])}
    for _, row in df.iterrows():
        if row['end'] > row['start']:
            # end is included, start count at 0 instead of 1
            trees[row['seqname'] + row['strand']][row['start'] - 1:row['end']] = (row['gene_id'])
    logging.info('Built mouse exon tree with {} nodes'
                 .format(sum([len(tree) for tree in trees.values()])))
    return trees
def create_trees():
    """
    Makes a dict of callables that create the trees named.

    Each value is a zero-argument callable: either the module's own `tree`
    factory, or the bound `copy` method of a prebuilt IntervalTree (so each
    call yields a fresh copy).
    """
    pbar = ProgressBar(len(intervals.ivs))
    print('Creating trees from interval lists...')
    trees = {}
    for name, ivs in intervals.ivs.items():
        pbar()
        module = from_import('test.data', name)
        if hasattr(module, 'tree'):
            trees[name] = module.tree
        else:
            # Note: stores the bound .copy method (a callable), not a tree.
            trees[name] = IntervalTree(ivs).copy
    return trees
def create_interval_tree():
    """Return an IntervalTree that buckets ratings into labelled 250-point
    bands, with a single open-ended '2501+' band at the top."""
    bands = [
        (0, 250, '0:250'),
        (250, 500, '251:500'),
        (500, 750, '501:750'),
        (750, 1000, '751:1000'),
        (1000, 1250, '1001:1250'),
        (1250, 1500, '1251:1500'),
        (1500, 1750, '1501:1750'),
        (1750, 2000, '1751:2000'),
        (2000, 2250, '2001:2250'),
        (2250, 2500, '2251:2500'),
        (2500, 4000, '2501+'),
    ]
    rating_intervals = IntervalTree()
    for low, high, label in bands:
        rating_intervals[low:high] = label
    return rating_intervals
def load_coverage_df(exon_padding, tx_accession, samples):
    """Load per-sample coverage for one transcript and return a dataframe
    with chrom/pos/exon_no columns followed by one coverage column per sample,
    sorted by position.

    :param exon_padding: NOTE(review) — unused in this body; confirm whether
        it should be applied to the exon intervals below.
    :param tx_accession: transcript accession to look up
    :param samples: iterable of sample identifiers passed to load_coverage
    """
    transcript = genes.load_transcripts()[tx_accession]
    # Exon spans of the transcript as a queryable interval tree.
    tree = IntervalTree(
        [Interval(exon.begin, exon.end) for exon in transcript.exons])
    ds = [
        load_coverage(sample, transcript.chrom, tree, transcript)
        for sample in samples
    ]
    # Shared coordinate columns come from the first sample; column 3 of each
    # per-sample frame holds that sample's coverage values.
    df_coverage = pd.concat(
        [ds[0]["chrom"], ds[0]["pos"], ds[0]["exon_no"]] +
        [d.iloc[:, 3] for d in ds],
        axis="columns",
    )
    df_coverage.sort_values("pos", inplace=True)
    return df_coverage
def test_copy_cast():
    # A tree built from another tree is an equal, structurally valid copy.
    source = IntervalTree.from_tuples(data.ivs1.data)
    clone = IntervalTree(source)
    clone.verify()
    assert source == clone
    # list/tree membership agrees in both directions.
    members = list(source)
    assert all(iv in source for iv in members)
    assert all(iv in members for iv in source)
    # set cast matches the item set.
    assert set(source) == source.items()
def _build_regions(self):
    """Index every non-empty flash section of the ELF in an interval tree
    keyed by its address range."""
    self._tree = IntervalTree()
    flash_sections = (
        s for s in self._elf.sections if s.region and s.region.is_flash
    )
    for sect in flash_sections:
        begin = sect.start
        size = sect.length
        # Skip empty sections.
        if size == 0:
            continue
        sect.data  # Go ahead and read the data from the file.
        self._tree.addi(begin, begin + size, sect)
        LOG.debug("created flash section [%x:%x] for section %s",
                  begin, begin + size, sect.name)
def read(self, length, offset, fh):
    """
    Read data from this GhostFile.

    Bytes inside rewritten intervals are read from the backing fd *fh*;
    every other byte in the requested range is returned as zero-fill. The
    result is truncated at the virtual file size.

    :param length: number of bytes requested
    :param offset: starting byte offset of the read
    :param fh: OS-level file descriptor for the backing file
    :return: bytes object of at most *length* bytes
    """
    if offset >= self.__filesize or length == 0:
        return b''
    data = b''
    # Restrict the rewritten intervals to the requested window and trim
    # them exactly at its edges.
    intervals = IntervalTree(self.__rewritten_intervals[offset:offset+length])
    intervals.merge_overlaps()
    intervals.slice(offset)
    intervals.slice(offset + length)
    intervals = sorted(intervals[offset:offset+length])
    assert offset < self.__filesize
    assert intervals[0].begin >= offset and intervals[-1].end <= offset + length if len(intervals) > 0 else True
    if len(intervals) == 0:
        # Nothing rewritten in range: pure zero-fill up to EOF.
        return b'\x00' * min(length, self.__filesize - offset)
    assert len(intervals) > 0
    # Used to fill any hole at the start of the read range
    end_prev_interval = offset
    # Read the data
    for interv in intervals:
        # Fill any hole before this interval
        data += b'\x00' * (interv.begin - end_prev_interval)
        os.lseek(fh, interv.begin, os.SEEK_SET)
        data += os.read(fh, interv.length())
        end_prev_interval = interv.end
    # Fill any hole at the end of the read range
    data += b'\x00' * (offset + length - intervals[-1].end)
    # Never return bytes past the virtual end of file.
    if offset + length > self.__filesize:
        data = data[0:self.__filesize-offset]
    assert len(data) <= length
    assert offset + len(data) <= self.__filesize
    return data