def test_idx(self):
    """Check the default, constructor-supplied and reassigned values of ``Segment.idx``."""
    endpoints = dict(start_position=self.position1, end_position=self.position2)

    # Without an explicit idx the private field stays unset and the
    # public property yields the coordinate-based string.
    unnamed = Segment(**endpoints)
    self.assertIsNone(unnamed._idx)
    self.assertEqual(unnamed.idx, "chr1:1-2")

    # An idx passed to the constructor is returned as is ...
    named = Segment(idx="idx", **endpoints)
    self.assertEqual(named.idx, "idx")

    # ... and can be overwritten through the property setter.
    named.idx = "idx2"
    self.assertEqual(named.idx, "idx2")
def test_creation(self):
    """Check that ``Segment`` rejects the listed end positions and accepts ``position2``."""
    rejected_ends = (
        Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD),
        Position(chromosome="chr2", coordinate=2, strand=Strand.FORWARD),
        Position(chromosome="chr1", coordinate=0, strand=Strand.FORWARD),
    )
    for rejected_end in rejected_ends:
        with self.assertRaises(ValueError):
            Segment(start_position=self.position1, end_position=rejected_end)

    # The regular fixture pair must construct without raising.
    Segment(start_position=self.position1, end_position=self.position2)
def get_scnt_from_ginkgo_source(source, sample_name, dummy_clone="1", separator="\t", chr_strip=True):
    """Read segments and one clone's copy numbers from a Ginkgo-style delimited source.

    Every row yields one segment; the integer found in the column named
    ``sample_name`` is recorded as the haplotype-A copy number of that segment.

    Args:
        source: Iterable of text lines (e.g. an open file) with a header row.
        sample_name: Header of the column holding the copy numbers to read.
        dummy_clone: Key under which the single resulting profile is returned.
        separator: Column delimiter.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.

    Returns:
        A tuple ``(segments, scnt)`` where ``segments`` is the list of parsed
        segments in file order and ``scnt`` is ``{dummy_clone: profile}``.

    Raises:
        IOError: If a row has no column named ``sample_name``.
    """
    scnp = SegmentCopyNumberProfile()
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[GINKGO_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start = int(row[GINKGO_START_POSITION])
        end = int(row[GINKGO_END_POSITION])
        try:
            raw_cn = row[sample_name]
        except KeyError as err:
            # The placeholder was previously never filled in, so the message
            # did not tell the user which sample name failed to match.
            raise IOError(
                "Could not obtain a segment copy value for sample {sample}. Make sure that --sample-name matches (including case) to the column header in the Ginkgo file".format(
                    sample=sample_name)
            ) from err
        cn = int(raw_cn)
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start, end=end)
        sid = segment.stable_id_non_hap
        segments.append(segment)
        scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn)
    scnt = {dummy_clone: scnp}
    return segments, scnt
def get_segments_from_gff_file(file_name, chr_strip=True, chr_mapping=None, chr_mapping_missing_strategy="keep"):
    """Convert every record of a GFF file into a ``Segment``.

    Args:
        file_name: Path handed to ``gffutils.DataIterator``.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.
        chr_mapping: Optional dict used to rename chromosomes; names absent
            from the mapping are kept unchanged unless skipped (see below).
        chr_mapping_missing_strategy: With ``"skip"``, records whose chromosome
            is absent from ``chr_mapping`` are dropped; any other value keeps
            them.

    Returns:
        List of segments in record order. Each segment's ``extra`` dict is
        updated with the record attributes, single-element lists being
        unwrapped to their only value.
    """
    drop_unmapped = chr_mapping is not None and chr_mapping_missing_strategy == "skip"
    segments = []
    for record in gffutils.DataIterator(file_name):
        chromosome = record.chrom
        if drop_unmapped and chromosome not in chr_mapping:
            continue
        if chr_mapping is not None:
            chromosome = chr_mapping.get(chromosome, chromosome)
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        attributes = {
            key: value[0] if isinstance(value, list) and len(value) == 1 else value
            for key, value in dict(record.attributes).items()
        }
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=record.start, end=record.end)
        segment.extra.update(attributes)
        segments.append(segment)
    return segments
def get_scnt_from_remixt_source(source, separator="\t", chr_strip=True):
    """Read segments and two clones' allele-specific copy numbers from a ReMixT-style source.

    Args:
        source: Iterable of text lines (e.g. an open file) with a header row.
        separator: Column delimiter.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.

    Returns:
        A tuple ``(segments, scnt)`` where ``segments`` is the list of parsed
        segments in file order and ``scnt`` maps the clone ids ``"1"`` and
        ``"2"`` to their copy number profiles.
    """
    segments = []
    clone1_id = "1"
    clone2_id = "2"
    scnt = {
        clone1_id: SegmentCopyNumberProfile(),
        clone2_id: SegmentCopyNumberProfile()
    }
    clone1_scnp = scnt[clone1_id]
    clone2_scnp = scnt[clone2_id]
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[REMIXT_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start_coordinate = int(row[REMIXT_START_POSITION])
        # NOTE(review): the end is shifted by one, presumably because the
        # input uses an exclusive end coordinate -- confirm against the format.
        end_coordinate = int(row[REMIXT_END_POSITION]) - 1
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coordinate, end=end_coordinate)
        segments.append(segment)
        sid = segment.stable_id_non_hap
        clone_1_cn_a = int(row[REMIXT_CLONE1_CN_A])
        clone_1_cn_b = int(row[REMIXT_CLONE1_CN_B])
        clone_2_cn_a = int(row[REMIXT_CLONE2_CN_A])
        # Previously read from the clone-2 A column, which made haplotype B
        # of clone 2 a copy of haplotype A.
        clone_2_cn_b = int(row[REMIXT_CLONE2_CN_B])
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_1_cn_a)
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_1_cn_b)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_2_cn_a)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_2_cn_b)
    return segments, scnt
def get_circa_adj_cnt(adjacencies, window_size=10000000, chr_sizes=None, element="breakend", adj_full_cnt=True):
    """
    Count breakends or adjacencies per fixed-size window along each chromosome.

    :param adjacencies: iterable of adjacencies; entries sharing a ``stable_id_non_phased`` are collapsed to one
    :param window_size: step used to place window boundaries, starting at 0
    :param chr_sizes: optional dict chromosome -> end of the last window; chromosomes missing from it
        fall back to the ``position2`` coordinate of the last adjacency in that chromosome's sorted list
    :param element: "breakend" counts individual positions lying on the chromosome, "adj" counts adjacencies;
        any other value produces no elements and hence an empty result
    :param adj_full_cnt: for "adj", require both positions (True) or at least one position (False) to lie
        on the chromosome; also forwarded to the window-membership helpers
    :return: defaultdict mapping window ``Segment`` -> number of elements counted in it
        (windows without counted elements are absent)
    """
    result = defaultdict(int)
    adjacencies_ids_by_chrs = defaultdict(set)
    # keying by stable id keeps a single adjacency per non-phased id
    adjacencies_by_ids = {adj.stable_id_non_phased: adj for adj in adjacencies}
    # an adjacency is registered under the chromosome of each of its two positions
    for adj in adjacencies_by_ids.values():
        adjacencies_ids_by_chrs[adj.position1.chromosome].add(adj.stable_id_non_phased)
        adjacencies_ids_by_chrs[adj.position2.chromosome].add(adj.stable_id_non_phased)
    adjacencies_by_chr = defaultdict(list)
    for chr_name in list(adjacencies_ids_by_chrs.keys()):
        sorted_chr_adjacencies = sorted([adjacencies_by_ids[aid] for aid in adjacencies_ids_by_chrs[chr_name]], key=lambda adj: (adj.position1.coordinate, adj.position2.coordinate))
        adjacencies_by_chr[chr_name] = sorted_chr_adjacencies
    windows_by_chr = defaultdict(list)
    if chr_sizes is None:
        chr_sizes = {}
    for chr_name in set(chr_sizes.keys()) | set(adjacencies_by_chr.keys()):
        start = 0
        # NOTE(review): the fallback takes position2 of the last sorted adjacency; that position may lie on a
        # different chromosome and need not be the largest coordinate on this one -- confirm this is intended
        default = adjacencies_by_chr[chr_name][-1].position2.coordinate if chr_name in adjacencies_by_chr else 0
        end = chr_sizes.get(chr_name, default)
        windows_boundaries = list(range(start, end, window_size))
        # NOTE(review): when end <= 0 the range is empty and the [-1] lookup below raises IndexError
        if windows_boundaries[-1] != end:
            windows_boundaries.append(end)
        # consecutive boundaries (lb, rb) become the window covering lb + 1 .. rb
        for lb, rb in zip(windows_boundaries[:-1], windows_boundaries[1:]):
            segment = Segment.from_chromosome_coordinates(chromosome=chr_name, start=lb + 1, end=rb)
            windows_by_chr[chr_name].append(segment)
    # NOTE: a former de-duplication of counted entries (a `counted_entries` set keyed by stable id) is disabled
    for chr_name in windows_by_chr.keys():
        chr_windows = iter(windows_by_chr[chr_name])
        current_window = next(chr_windows, None)
        elements = []
        if element == "breakend":
            # positions are appended adjacency by adjacency (position1 then position2)
            # NOTE(review): the resulting list is therefore not necessarily sorted by coordinate, while the
            # sweep below only ever moves forward through the windows -- confirm
            for adj in adjacencies_by_chr[chr_name]:
                for p in [adj.position1, adj.position2]:
                    if p.chromosome == chr_name:
                        elements.append(p)
        elif element == "adj":
            for adj in adjacencies_by_chr[chr_name]:
                p1_in = adj.position1.chromosome == chr_name
                p2_in = adj.position2.chromosome == chr_name
                if adj_full_cnt:
                    cnt = p1_in and p2_in
                else:
                    cnt = p1_in or p2_in
                if cnt:
                    elements.append(adj)
        # single forward sweep over the elements and the chromosome's windows
        for entry in elements:
            if current_window is None:
                # windows exhausted; remaining entries are not counted
                break
            if element_before_window(current_window, entry, adj_full_cnt=adj_full_cnt):
                continue
            elif element_in_window(current_window, entry, adj_full_cnt=adj_full_cnt):
                result[current_window] += 1
            else:  # after window
                # NOTE(review): the window advances by one, but the current entry is not re-tested against the
                # new window, so it is never counted -- verify whether that is intended
                current_window = next(chr_windows, None)
    return result
def setUp(self):
    """Create positions ``p1``-``p12`` and segments ``s1``-``s6`` on chromosomes "1" and "2".

    Each chromosome gets coordinates 1-6; odd coordinates are REVERSE-stranded
    and even ones FORWARD-stranded. Segment ``sK`` spans the consecutive
    position pair ``p(2K-1)``/``p(2K)``.
    """
    position_number = 0
    for chromosome in ("1", "2"):
        for coordinate in range(1, 7):
            position_number += 1
            strand = Strand.REVERSE if coordinate % 2 == 1 else Strand.FORWARD
            position = Position(chromosome=chromosome, coordinate=coordinate, strand=strand)
            setattr(self, "p{}".format(position_number), position)

    for segment_number in range(1, 7):
        start_position = getattr(self, "p{}".format(2 * segment_number - 1))
        end_position = getattr(self, "p{}".format(2 * segment_number))
        segment = Segment(start_position=start_position, end_position=end_position)
        setattr(self, "s{}".format(segment_number), segment)
def get_scnt_from_titan_source(source, sample_name, clone_ids, separator="\t", corrected_cn_fix="None", chr_strip=True):
    """Read segments and per-clone allele-specific copy numbers from a TITAN-style source.

    Rows whose sample column differs from ``sample_name`` are skipped. For each
    remaining row, the clone named in the row receives the (major, minor) copy
    numbers on haplotypes (A, B); every other clone in ``clone_ids`` receives
    copy number 1 on both haplotypes.

    Args:
        source: Iterable of text lines (e.g. an open file) with a header row.
        sample_name: Sample whose rows are read.
        clone_ids: Clone identifiers to build profiles for.
        separator: Column delimiter.
        corrected_cn_fix: How to reconcile ``major + minor`` with the corrected
            total copy number when they differ: ``"None"`` leaves the values
            untouched, ``"equal"`` splits the difference evenly (the larger
            half going to the major allele), ``"relative-dist"`` splits it in
            proportion to the minor/major ratio. Unknown values change nothing.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.

    Returns:
        A tuple ``(segments, scnt)`` where ``segments`` is the list of parsed
        segments in file order and ``scnt`` maps each clone id to its profile.
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        if row[TITAN_SAMPLE_NAME] != sample_name:
            continue
        chromosome = row[TITAN_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        segment = Segment.from_chromosome_coordinates(
            chromosome=chromosome,
            start=int(row[TITAN_START_POSITION]),
            end=int(row[TITAN_END_POSITION]))
        sid = segment.stable_id_non_hap
        segments.append(segment)
        major_cn, minor_cn = int(row[TITAN_MAJOR_CN]), int(row[TITAN_MINOR_CN])
        # keep the invariant major >= minor regardless of the column order in the input
        if minor_cn > major_cn:
            minor_cn, major_cn = major_cn, minor_cn
        titan_clone_id = row[TITAN_CLONE_ID]
        corrected_cn = int(row[TITAN_CORRECTED_CN])
        for clone_id in clone_ids:
            scnp = scnt[clone_id]
            if titan_clone_id != clone_id:
                # clones the row does not describe get one copy per haplotype
                scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=1)
                scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=1)
                continue
            if corrected_cn_fix != "None" and major_cn + minor_cn != corrected_cn:
                diff = corrected_cn - major_cn - minor_cn
                # additions stay 0 when the strategy is not one of the known ones
                major_cn_addition = 0
                minor_cn_addition = 0
                if corrected_cn_fix == "equal":
                    major_cn_addition = int(math.ceil(diff / 2))
                    minor_cn_addition = diff - major_cn_addition
                elif corrected_cn_fix == "relative-dist":
                    # With major == 0 (hence minor == 0 as well) the ratio is
                    # undefined; treat the alleles as balanced instead of
                    # failing with ZeroDivisionError.
                    relative_relation = minor_cn * 1.0 / major_cn if major_cn != 0 else 1.0
                    major_cn_addition = int(math.ceil(diff / (1 + relative_relation)))
                    minor_cn_addition = diff - major_cn_addition
                major_cn += major_cn_addition
                minor_cn += minor_cn_addition
            scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=major_cn)
            scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=minor_cn)
    return segments, scnt
def get_scnt_from_battenberg_source(source, sample_name, separator="\t", chr_strip=True):
    """Read segments and two clones' allele-specific copy numbers from a Battenberg-style source.

    Rows that carry a sample column whose value differs from ``sample_name``
    are skipped; rows without that column are always read. Clone "1" values go
    through ``battenberg_force_non_negativity``; clone "2" values are obtained
    from ``battenberg_get_subclonal_cn`` given the raw column string and the
    corresponding clone "1" value.

    Args:
        source: Iterable of text lines (e.g. an open file) with a header row.
        sample_name: Sample whose rows are read.
        separator: Column delimiter.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.

    Returns:
        A tuple ``(segments, scnt)`` where ``segments`` is the list of parsed
        segments in file order and ``scnt`` maps ``"1"`` and ``"2"`` to their
        copy number profiles.
    """
    clonal_id, subclonal_id = "1", "2"
    scnt = {
        clonal_id: SegmentCopyNumberProfile(),
        subclonal_id: SegmentCopyNumberProfile(),
    }
    # (haplotype, clone-1 column, clone-2 column), processed in this order
    haplotype_columns = (
        (Haplotype.A, BATTENBERG_CLONE1_CN_A, BATTENBERG_CLONE2_CN_A),
        (Haplotype.B, BATTENBERG_CLONE1_CN_B, BATTENBERG_CLONE2_CN_B),
    )
    segments = []
    for row in csv.DictReader(source, delimiter=separator):
        if BATTENBERG_SAMPLE_NAME in row and row[BATTENBERG_SAMPLE_NAME] != sample_name:
            continue
        start = int(row[BATTENBERG_START_POSITION])
        end = int(row[BATTENBERG_END_POSITION])
        chromosome = row[BATTENBERG_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start, end=end)
        clonal_profile = scnt[clonal_id]
        subclonal_profile = scnt[subclonal_id]

        clonal_cns = [
            battenberg_force_non_negativity(int(row[clonal_column]))
            for _, clonal_column, _ in haplotype_columns
        ]
        for (haplotype, _, _), clonal_cn in zip(haplotype_columns, clonal_cns):
            clonal_profile.set_cn_record_for_segment(segment=segment, cn=clonal_cn, haplotype=haplotype)

        subclonal_cns = [
            battenberg_get_subclonal_cn(subclonal_cn_string=row[subclonal_column], clonal_cn_int=clonal_cn)
            for (_, _, subclonal_column), clonal_cn in zip(haplotype_columns, clonal_cns)
        ]
        for (haplotype, _, _), subclonal_cn in zip(haplotype_columns, subclonal_cns):
            subclonal_profile.set_cn_record_for_segment(segment=segment, cn=subclonal_cn, haplotype=haplotype)

        segments.append(segment)
    return segments, scnt
def get_scnt_from_hatchet_source(source, sample_name, clone_ids, separator="\t", chr_strip=True):
    """Read segments and per-clone allele-specific copy numbers from a HATCHet-style source.

    The first line determines how many clones the file describes (half the
    number of columns after the sixth) and which of the 1-based clone numbers
    were requested. Empty lines and lines starting with ``#`` are skipped, as
    are rows whose fourth column differs from ``sample_name``. Copy numbers are
    read from ``A|B`` strings found in every second column starting with the
    seventh.

    Args:
        source: Iterable of text lines (e.g. an open file).
        sample_name: Sample whose rows are read.
        clone_ids: Clone identifiers (strings such as ``"1"``) to build
            profiles for.
        separator: Column delimiter.
        chr_strip: When true, chromosome names are passed through ``strip_chr``.

    Returns:
        A tuple ``(segments, scnt)`` where ``segments`` is the list of parsed
        segments in file order and ``scnt`` maps each clone id to its profile.
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    cn_column_by_clone = {}
    for line_number, raw_line in enumerate(source):
        line = raw_line.strip()
        fields = line.split(separator)
        clone_fields = fields[6:]
        if line_number == 0:
            # each clone occupies two columns; clones are numbered from 1
            clone_total = len(clone_fields) // 2
            for cn_column, clone_number in enumerate(range(1, clone_total + 1)):
                candidate = str(clone_number)
                if candidate in clone_ids:
                    cn_column_by_clone[candidate] = cn_column
        if line.startswith("#") or not line:
            continue
        if fields[3] != sample_name:
            continue
        chromosome = fields[0]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start_coord = int(fields[1])
        # the end coordinate read from the file is shifted down by one
        end_coord = int(fields[2]) - 1
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coord, end=end_coord)
        segments.append(segment)
        sid = segment.stable_id_non_hap
        # the copy number strings sit in every other clone column
        cn_strings = clone_fields[::2]
        for clone_id in clone_ids:
            allele_cns = cn_strings[cn_column_by_clone[clone_id]].split("|")
            cn_a = int(allele_cns[0])
            cn_b = int(allele_cns[1])
            profile = scnt[clone_id]
            profile.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn_a)
            profile.set_cn_record(sid=sid, hap=Haplotype.B, cn=cn_b)
    return segments, scnt
def test_chromosome(self):
    """Check that ``Segment.chromosome`` equals the chromosome of both of its end positions."""
    segment = Segment(start_position=self.position1, end_position=self.position2)
    for endpoint in (self.position1, self.position2):
        self.assertEqual(segment.chromosome, endpoint.chromosome)
def test_str(self):
    """Check that ``str(segment)`` is coordinate-based by default and follows an assigned ``idx``."""
    segment = Segment(start_position=self.position1, end_position=self.position2)
    default_text = str(segment)
    self.assertEqual(default_text, "chr1:1-2")

    segment.idx = "idx"
    renamed_text = str(segment)
    self.assertEqual(renamed_text, "idx")
def get_circa_segments_cna_fractions(segments, scnt, clone_id, window_size=10000000, chr_sizes=None, cna_type="ampl", haploid=False):
    """Compute, per fixed-size window, the fraction of its length affected by a copy number aberration.

    Windows of ``window_size`` are laid out per chromosome, starting at the
    first segment's start (or at 0 for chromosomes known only from
    ``chr_sizes``) and ending at the ``chr_sizes`` entry (or at the end of the
    last segment when the chromosome is missing from ``chr_sizes``). Segments
    are then refined at the window boundaries, assigned to the window that
    contains them, and the lengths of aberrant segments are summed relative to
    the window length.

    Side effect: the first/last input segment of a chromosome may have its
    start/end coordinate changed in place so that the segments reach the outer
    window boundaries.

    Args:
        segments: Iterable of segments.
        scnt: Dict of clone id -> copy number profile for ``segments``.
        clone_id: Clone whose copy numbers are evaluated.
        window_size: Window length used to place window boundaries.
        chr_sizes: Optional dict chromosome -> end coordinate of the last window.
        cna_type: ``"ampl"`` counts segments with a copy number above the
            baseline, ``"del"`` those below it; other values count nothing.
        haploid: When true, the combined copy number is compared against 2;
            otherwise each haplotype is compared against 1.

    Returns:
        Dict mapping window ``Segment`` -> aberrant length fraction. Windows
        that contain no refined segment are absent.
    """
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    for chr_segments in segments_by_chrs.values():
        chr_segments.sort(key=lambda s: (s.start_coordinate, s.end_coordinate))

    if chr_sizes is None:
        chr_sizes = {}
    windows_by_chr = defaultdict(list)
    for chr_name in set(chr_sizes.keys()) | set(segments_by_chrs.keys()):
        if chr_name in segments_by_chrs:
            start = segments_by_chrs[chr_name][0].start_coordinate - 1
            e_default = segments_by_chrs[chr_name][-1].end_coordinate
        else:
            start, e_default = 0, 0
        end = chr_sizes.get(chr_name, e_default)
        windows_boundaries = list(range(start, end, window_size))
        # The range is empty when end <= start; guard the [-1] lookup so such
        # a chromosome simply gets no windows instead of raising IndexError.
        if not windows_boundaries or windows_boundaries[-1] != end:
            windows_boundaries.append(end)
        # consecutive boundaries (lb, rb) become the window covering lb + 1 .. rb
        for lb, rb in zip(windows_boundaries[:-1], windows_boundaries[1:]):
            window = Segment.from_chromosome_coordinates(chromosome=chr_name, start=lb + 1, end=rb)
            windows_by_chr[chr_name].append(window)

    windows = []
    for ws in windows_by_chr.values():
        chr_name = ws[0].chromosome
        if chr_name in segments_by_chrs:
            chr_segments = segments_by_chrs[chr_name]
            # stretch the outermost segments (in place) to the outer window boundaries
            if chr_segments[0].start_coordinate > ws[0].start_coordinate:
                chr_segments[0].start_position.coordinate = ws[0].start_coordinate
            if chr_segments[-1].end_coordinate < ws[-1].end_coordinate:
                chr_segments[-1].end_position.coordinate = ws[-1].end_coordinate
        windows.extend(ws)

    # window boundaries are handed over as extra break positions for the refinement
    positions = []
    for w in windows:
        if w.chromosome in segments_by_chrs:
            positions.append(w.start_position)
            positions.append(w.end_position)
    r_segments, r_scnt, _ = refined_scnt_with_adjacencies_and_telomeres(
        segments=segments, scnt=scnt, telomere_positions=positions)

    r_segments_by_chr = defaultdict(list)
    for s in r_segments:
        r_segments_by_chr[s.chromosome].append(s)

    intra_result = defaultdict(list)
    for chr_name in windows_by_chr.keys():
        chr_windows = iter(windows_by_chr[chr_name])
        chr_segments = sorted(r_segments_by_chr[chr_name],
                              key=lambda s: (s.start_coordinate, s.end_coordinate))
        current_window = next(chr_windows, None)
        for segment in chr_segments:
            # Advance through the windows until the segment is either placed
            # or found to start before the current window. Previously the
            # segment that triggered an advance was dropped, so the first
            # segment of every later window was never counted.
            while current_window is not None:
                if segment.start_coordinate < current_window.start_coordinate:
                    break
                if segment_within_segment(inner_segment=segment, outer_segment=current_window):
                    intra_result[current_window].append(segment)
                    break
                current_window = next(chr_windows, None)
            if current_window is None:
                break

    scnp: SegmentCopyNumberProfile = r_scnt[clone_id]
    result = {}
    for window, window_segments in intra_result.items():
        cna_fraction = 0
        for segment in window_segments:
            length_fraction = segment.length / window.length
            if haploid:
                cns = [scnp.get_combined_cn(sid=segment.stable_id_non_hap, default=-1)]
                base = 2
            else:
                cns = [
                    scnp.get_cn(sid=segment.stable_id_non_hap, haplotype=Haplotype.A, default=-1),
                    scnp.get_cn(sid=segment.stable_id_non_hap, haplotype=Haplotype.B, default=-1),
                ]
                base = 1
            # -1 is the lookup default, i.e. the profile has no record for the segment
            if any(cn < 0 for cn in cns):
                print("something wrong with segment {s} in window {w}".format(
                    s=str(segment), w=str(window)))
            amplified = any(cn > base for cn in cns)
            deletions = any(cn < base for cn in cns)
            if cna_type == "ampl" and amplified:
                cna_fraction += length_fraction
            elif cna_type == "del" and deletions:
                cna_fraction += length_fraction
        result[window] = cna_fraction
    return result