Example #1
0
 def test_idx(self):
     """Check the default, constructor-supplied, and reassigned values of ``Segment.idx``."""
     endpoints = dict(start_position=self.position1, end_position=self.position2)

     # Without an explicit idx the private slot stays unset, while the public
     # accessor still yields a coordinate-style string.
     unnamed = Segment(**endpoints)
     self.assertIsNone(unnamed._idx)
     self.assertEqual(unnamed.idx, "chr1:1-2")

     # An idx given at construction time is returned as is ...
     named = Segment(idx="idx", **endpoints)
     self.assertEqual(named.idx, "idx")

     # ... and can be overwritten afterwards.
     named.idx = "idx2"
     self.assertEqual(named.idx, "idx2")
Example #2
0
 def test_creation(self):
     """Check that ``Segment`` rejects invalid end positions and accepts a valid pair."""
     # Each of these end positions is expected to be refused (ValueError)
     # when paired with self.position1 as the start.
     rejected_ends = [
         Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD),
         Position(chromosome="chr2", coordinate=2, strand=Strand.FORWARD),
         Position(chromosome="chr1", coordinate=0, strand=Strand.FORWARD),
     ]
     for bad_end in rejected_ends:
         with self.assertRaises(ValueError):
             Segment(start_position=self.position1, end_position=bad_end)

     # The fixture pair must construct without raising.
     Segment(start_position=self.position1, end_position=self.position2)
Example #3
0
def get_scnt_from_ginkgo_source(source,
                                sample_name,
                                dummy_clone="1",
                                separator="\t",
                                chr_strip=True):
    """Read segments and copy numbers for one sample from a Ginkgo table.

    Every row yields one segment. The integer found in the column named
    ``sample_name`` is stored as that segment's haplotype-A copy number in a
    single profile, which is returned under the ``dummy_clone`` key.

    :param source: iterable of text lines (e.g. an open file) with a header row
    :param sample_name: header of the column holding the sample's copy numbers
    :param dummy_clone: key under which the single profile is returned
    :param separator: column delimiter
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :return: tuple ``(segments, scnt)`` where ``scnt`` is ``{dummy_clone: profile}``
    :raises IOError: if a row has no column named ``sample_name``
    """
    scnp = SegmentCopyNumberProfile()
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[GINKGO_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start = int(row[GINKGO_START_POSITION])
        end = int(row[GINKGO_END_POSITION])
        try:
            cn = int(row[sample_name])
        except KeyError:
            # The placeholder must actually be filled in, otherwise the user
            # sees a literal "{sample}" instead of the offending name.
            raise IOError(
                "Could not obtain a segment copy value for sample {sample}. Make sure that --sample-name matches (including case) to the column header in the Ginkgo file"
                .format(sample=sample_name))
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome,
                                                      start=start,
                                                      end=end)
        sid = segment.stable_id_non_hap
        segments.append(segment)
        scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn)
    scnt = {dummy_clone: scnp}
    return segments, scnt
Example #4
0
def get_segments_from_gff_file(file_name,
                               chr_strip=True,
                               chr_mapping=None,
                               chr_mapping_missing_strategy="keep"):
    """Turn every record of a GFF file into a ``Segment``.

    :param file_name: passed to ``gffutils.DataIterator``
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :param chr_mapping: optional dict used to rename chromosomes
    :param chr_mapping_missing_strategy: when ``"skip"``, records whose
        chromosome is absent from ``chr_mapping`` are dropped; any other value
        keeps them under their original name
    :return: list of segments, each with the record attributes merged into
        its ``extra`` dict
    """
    segments = []
    for gff_record in gffutils.DataIterator(file_name):
        chromosome = gff_record.chrom
        if chr_mapping is not None:
            if chromosome not in chr_mapping and chr_mapping_missing_strategy == "skip":
                continue
            # Unmapped names fall through unchanged.
            chromosome = chr_mapping.get(chromosome, chromosome)
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        # Single-element list values are unwrapped to the bare element.
        attributes = {
            name: (value[0] if isinstance(value, list) and len(value) == 1 else value)
            for name, value in dict(gff_record.attributes).items()
        }
        gff_segment = Segment.from_chromosome_coordinates(chromosome=chromosome,
                                                          start=gff_record.start,
                                                          end=gff_record.end)
        gff_segment.extra.update(attributes)
        segments.append(gff_segment)
    return segments
Example #5
0
def get_scnt_from_remixt_source(source, separator="\t", chr_strip=True):
    """Read segments and two-clone, allele-specific copy numbers from a ReMixT table.

    Every row yields one segment whose end is the row's end value minus one,
    plus haplotype A/B copy numbers for clones ``"1"`` and ``"2"``.

    :param source: iterable of text lines (e.g. an open file) with a header row
    :param separator: column delimiter
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :return: tuple ``(segments, scnt)`` where ``scnt`` maps ``"1"`` and ``"2"``
        to their copy number profiles
    """
    segments = []
    clone1_id = "1"
    clone2_id = "2"
    scnt = {
        clone1_id: SegmentCopyNumberProfile(),
        clone2_id: SegmentCopyNumberProfile()
    }
    clone1_scnp = scnt[clone1_id]
    clone2_scnp = scnt[clone2_id]
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[REMIXT_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start_coordinate = int(row[REMIXT_START_POSITION])
        end_coordinate = int(row[REMIXT_END_POSITION]) - 1
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome,
                                                      start=start_coordinate,
                                                      end=end_coordinate)
        segments.append(segment)
        sid = segment.stable_id_non_hap
        clone_1_cn_a = int(row[REMIXT_CLONE1_CN_A])
        clone_1_cn_b = int(row[REMIXT_CLONE1_CN_B])
        clone_2_cn_a = int(row[REMIXT_CLONE2_CN_A])
        # Haplotype B of clone 2 must come from its own column; reading the
        # A column here silently duplicated the A copy number.
        # NOTE(review): assumes REMIXT_CLONE2_CN_B is defined next to the
        # other REMIXT_* column constants - confirm.
        clone_2_cn_b = int(row[REMIXT_CLONE2_CN_B])
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_1_cn_a)
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_1_cn_b)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_2_cn_a)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_2_cn_b)
    return segments, scnt
Example #6
0
def get_circa_adj_cnt(adjacencies, window_size=10000000, chr_sizes=None, element="breakend", adj_full_cnt=True):
    """Count adjacency elements falling into fixed-size windows along each chromosome.

    Adjacencies are de-duplicated by ``stable_id_non_phased`` and grouped under
    every chromosome one of their two positions lies on. Each chromosome is
    tiled with windows of ``window_size`` and the selected elements are
    counted per window.

    :param adjacencies: iterable of adjacency objects exposing ``position1``,
        ``position2`` and ``stable_id_non_phased``
    :param window_size: window length used to tile each chromosome
    :param chr_sizes: optional dict of chromosome name -> length; chromosomes
        missing from it extend to the largest breakend coordinate observed on them
    :param element: ``"breakend"`` counts individual positions, ``"adj"``
        counts whole adjacencies; any other value counts nothing
    :param adj_full_cnt: for ``"adj"``, require both positions on the
        chromosome (true) or at least one (false); also forwarded to the
        window-membership helpers
    :return: ``defaultdict(int)`` mapping window segments to counts; windows
        with no counted element are absent
    """
    result = defaultdict(int)
    adjacencies_by_ids = {adj.stable_id_non_phased: adj for adj in adjacencies}
    adjacencies_ids_by_chrs = defaultdict(set)
    for adj in adjacencies_by_ids.values():
        adjacencies_ids_by_chrs[adj.position1.chromosome].add(adj.stable_id_non_phased)
        adjacencies_ids_by_chrs[adj.position2.chromosome].add(adj.stable_id_non_phased)
    adjacencies_by_chr = defaultdict(list)
    for chr_name, adj_ids in adjacencies_ids_by_chrs.items():
        adjacencies_by_chr[chr_name] = sorted((adjacencies_by_ids[aid] for aid in adj_ids),
                                              key=lambda adj: (adj.position1.coordinate, adj.position2.coordinate))
    windows_by_chr = defaultdict(list)
    if chr_sizes is None:
        chr_sizes = {}
    for chr_name in set(chr_sizes.keys()) | set(adjacencies_by_chr.keys()):
        start = 0
        # Without a known chromosome size, extend to the right-most breakend
        # that actually lies on this chromosome. The last sorted adjacency's
        # position2 is not reliable: it may sit on another chromosome or be
        # smaller than another adjacency's coordinate.
        default = max((p.coordinate
                       for adj in adjacencies_by_chr.get(chr_name, [])
                       for p in (adj.position1, adj.position2)
                       if p.chromosome == chr_name),
                      default=0)
        end = chr_sizes.get(chr_name, default)
        windows_boundaries = list(range(start, end, window_size))
        # An empty range (end <= start) must not crash on [-1]; it then simply
        # produces no windows for this chromosome.
        if not windows_boundaries or windows_boundaries[-1] != end:
            windows_boundaries.append(end)
        for lb, rb in zip(windows_boundaries[:-1], windows_boundaries[1:]):
            segment = Segment.from_chromosome_coordinates(chromosome=chr_name, start=lb + 1, end=rb)
            windows_by_chr[chr_name].append(segment)
    for chr_name in windows_by_chr.keys():
        chr_windows = iter(windows_by_chr[chr_name])
        current_window = next(chr_windows, None)
        elements = []
        if element == "breakend":
            for adj in adjacencies_by_chr[chr_name]:
                for p in [adj.position1, adj.position2]:
                    if p.chromosome == chr_name:
                        elements.append(p)
        elif element == "adj":
            for adj in adjacencies_by_chr[chr_name]:
                p1_in = adj.position1.chromosome == chr_name
                p2_in = adj.position2.chromosome == chr_name
                if adj_full_cnt:
                    cnt = p1_in and p2_in
                else:
                    cnt = p1_in or p2_in
                if cnt:
                    elements.append(adj)
        # NOTE(review): an entry that lies past the current window advances
        # the window but is itself never re-tested against the new one, and
        # breakend entries are ordered per adjacency rather than by
        # coordinate - confirm whether both are intended.
        for entry in elements:
            if current_window is None:
                break
            if element_before_window(current_window, entry, adj_full_cnt=adj_full_cnt):
                continue
            elif element_in_window(current_window, entry, adj_full_cnt=adj_full_cnt):
                result[current_window] += 1
            else:  # after window
                current_window = next(chr_windows, None)
    return result
Example #7
0
    def setUp(self):
        """Create positions p1..p12 (six each on chromosomes "1" and "2") and segments s1..s6 over consecutive pairs."""

        def make_position(chromosome, coordinate, strand):
            # Thin keyword-forwarding wrapper to keep the fixture table compact.
            return Position(chromosome=chromosome, coordinate=coordinate, strand=strand)

        reverse, forward = Strand.REVERSE, Strand.FORWARD

        # Chromosome "1": coordinates 1..6, strands alternating reverse/forward.
        self.p1 = make_position("1", 1, reverse)
        self.p2 = make_position("1", 2, forward)
        self.p3 = make_position("1", 3, reverse)
        self.p4 = make_position("1", 4, forward)
        self.p5 = make_position("1", 5, reverse)
        self.p6 = make_position("1", 6, forward)

        # Chromosome "2": same layout.
        self.p7 = make_position("2", 1, reverse)
        self.p8 = make_position("2", 2, forward)
        self.p9 = make_position("2", 3, reverse)
        self.p10 = make_position("2", 4, forward)
        self.p11 = make_position("2", 5, reverse)
        self.p12 = make_position("2", 6, forward)

        # Segments pair each reverse-strand position with the following forward one.
        self.s1 = Segment(start_position=self.p1, end_position=self.p2)
        self.s2 = Segment(start_position=self.p3, end_position=self.p4)
        self.s3 = Segment(start_position=self.p5, end_position=self.p6)

        self.s4 = Segment(start_position=self.p7, end_position=self.p8)
        self.s5 = Segment(start_position=self.p9, end_position=self.p10)
        self.s6 = Segment(start_position=self.p11, end_position=self.p12)
Example #8
0
def get_scnt_from_titan_source(source,
                               sample_name,
                               clone_ids,
                               separator="\t",
                               corrected_cn_fix="None",
                               chr_strip=True):
    """Read segments and per-clone allele-specific copy numbers from a TITAN table.

    Only rows whose sample column equals ``sample_name`` are used. For the
    clone named in a row, major/minor copy numbers are stored as haplotypes
    A/B; every other requested clone receives 1/1 for that segment.

    :param source: iterable of text lines (e.g. an open file) with a header row
    :param sample_name: sample whose rows are read
    :param clone_ids: clone identifiers to build profiles for
    :param separator: column delimiter
    :param corrected_cn_fix: how to reconcile ``major + minor`` with the
        corrected total when they differ: ``"None"`` leaves the values alone,
        ``"equal"`` splits the difference evenly (major gets the larger half),
        ``"relative-dist"`` splits it according to the minor/major ratio;
        any other value adds nothing
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :return: tuple ``(segments, scnt)`` where ``scnt`` maps each clone id to
        its copy number profile
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        if row[TITAN_SAMPLE_NAME] != sample_name:
            continue
        chromosome = row[TITAN_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        segment = Segment.from_chromosome_coordinates(
            chromosome=chromosome,
            start=int(row[TITAN_START_POSITION]),
            end=int(row[TITAN_END_POSITION]))
        sid = segment.stable_id_non_hap
        segments.append(segment)
        major_cn, minor_cn = int(row[TITAN_MAJOR_CN]), int(row[TITAN_MINOR_CN])
        # Normalise so that major is never the smaller of the two.
        if minor_cn > major_cn:
            minor_cn, major_cn = major_cn, minor_cn
        titan_clone_id = row[TITAN_CLONE_ID]
        corrected_cn = int(row[TITAN_CORRECTED_CN])
        for clone_id in clone_ids:
            scnp = scnt[clone_id]
            if titan_clone_id == clone_id:
                if major_cn + minor_cn != corrected_cn and corrected_cn_fix != "None":
                    diff = corrected_cn - major_cn - minor_cn
                    # Stay at 0 when the strategy is not "None" but matches
                    # none of the known ones.
                    major_cn_addition = 0
                    minor_cn_addition = 0
                    if corrected_cn_fix == "equal":
                        major_cn_addition = int(math.ceil(diff / 2))
                        minor_cn_addition = diff - major_cn_addition
                    elif corrected_cn_fix == "relative-dist":
                        if major_cn == 0:
                            # The minor/major ratio is undefined; fall back to
                            # an even split instead of dividing by zero.
                            major_cn_addition = int(math.ceil(diff / 2))
                        else:
                            relative_relation = minor_cn * 1.0 / major_cn
                            major_cn_addition = int(
                                math.ceil(diff / (1 + relative_relation)))
                        minor_cn_addition = diff - major_cn_addition
                    major_cn += major_cn_addition
                    minor_cn += minor_cn_addition
                scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=major_cn)
                scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=minor_cn)
            else:
                scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=1)
                scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=1)
    return segments, scnt
Example #9
0
def get_scnt_from_battenberg_source(source,
                                    sample_name,
                                    separator="\t",
                                    chr_strip=True):
    """Read segments and two-clone, allele-specific copy numbers from a Battenberg table.

    Rows that carry a sample column naming a different sample are skipped.
    Clone ``"1"`` receives the clonal values after
    ``battenberg_force_non_negativity``; clone ``"2"`` receives whatever
    ``battenberg_get_subclonal_cn`` derives from the subclonal column and the
    matching clone-1 value.

    :param source: iterable of text lines (e.g. an open file) with a header row
    :param sample_name: sample whose rows are read
    :param separator: column delimiter
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :return: tuple ``(segments, scnt)`` where ``scnt`` maps ``"1"`` and ``"2"``
        to their copy number profiles
    """
    clone1_name, clone2_name = "1", "2"
    scnt = {
        clone1_name: SegmentCopyNumberProfile(),
        clone2_name: SegmentCopyNumberProfile()
    }
    clonal_profile = scnt[clone1_name]
    subclonal_profile = scnt[clone2_name]
    segments = []
    for record in csv.DictReader(source, delimiter=separator):
        # Tables without a sample column are accepted as is.
        if BATTENBERG_SAMPLE_NAME in record:
            if record[BATTENBERG_SAMPLE_NAME] != sample_name:
                continue
        seg_start = int(record[BATTENBERG_START_POSITION])
        seg_end = int(record[BATTENBERG_END_POSITION])
        chrom = record[BATTENBERG_CHROMOSOME]
        if chr_strip:
            chrom = strip_chr(chr_string=chrom)
        segment = Segment.from_chromosome_coordinates(chromosome=chrom,
                                                      start=seg_start,
                                                      end=seg_end)

        # Clone 1: clonal copy numbers.
        clonal_a = battenberg_force_non_negativity(int(record[BATTENBERG_CLONE1_CN_A]))
        clonal_b = battenberg_force_non_negativity(int(record[BATTENBERG_CLONE1_CN_B]))
        clonal_profile.set_cn_record_for_segment(segment=segment, cn=clonal_a, haplotype=Haplotype.A)
        clonal_profile.set_cn_record_for_segment(segment=segment, cn=clonal_b, haplotype=Haplotype.B)

        # Clone 2: subclonal copy numbers, resolved against the clone-1 values.
        subclonal_a = battenberg_get_subclonal_cn(subclonal_cn_string=record[BATTENBERG_CLONE2_CN_A],
                                                  clonal_cn_int=clonal_a)
        subclonal_b = battenberg_get_subclonal_cn(subclonal_cn_string=record[BATTENBERG_CLONE2_CN_B],
                                                  clonal_cn_int=clonal_b)
        subclonal_profile.set_cn_record_for_segment(segment=segment, cn=subclonal_a, haplotype=Haplotype.A)
        subclonal_profile.set_cn_record_for_segment(segment=segment, cn=subclonal_b, haplotype=Haplotype.B)

        segments.append(segment)
    return segments, scnt
Example #10
0
def get_scnt_from_hatchet_source(source,
                                 sample_name,
                                 clone_ids,
                                 separator="\t",
                                 chr_strip=True):
    """Read segments and per-clone allele-specific copy numbers from a HATCHet table.

    The first line determines how many clones the table holds: the columns
    after the sixth are taken in pairs, and the pairs are numbered ``"1"``,
    ``"2"``, ... Lines that are empty or start with ``#`` are skipped, as are
    lines whose fourth column differs from ``sample_name``. The first member
    of each pair is an ``A|B`` copy number string. The segment end is the
    third column minus one.

    :param source: iterable of text lines (e.g. an open file)
    :param sample_name: sample whose rows are read
    :param clone_ids: clone identifiers (as strings) to build profiles for
    :param separator: column delimiter
    :param chr_strip: if true, pass chromosome names through ``strip_chr``
    :return: tuple ``(segments, scnt)`` where ``scnt`` maps each clone id to
        its copy number profile
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    pair_index_by_clone = {}
    for line_number, raw_line in enumerate(source):
        stripped = raw_line.strip()
        fields = stripped.split(separator)
        clone_fields = fields[6:]
        if line_number == 0:
            # Clone labels are 1-based; pair indices are 0-based.
            for pair_index in range(len(clone_fields) // 2):
                label = str(pair_index + 1)
                if label in clone_ids:
                    pair_index_by_clone[label] = pair_index
        if not stripped or stripped.startswith("#"):
            continue
        if fields[3] != sample_name:
            continue
        chromosome = strip_chr(chr_string=fields[0]) if chr_strip else fields[0]
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome,
                                                      start=int(fields[1]),
                                                      end=int(fields[2]) - 1)
        segments.append(segment)
        sid = segment.stable_id_non_hap
        # Every other clone column holds the "A|B" copy number string.
        cn_strings = clone_fields[::2]
        for clone_id in clone_ids:
            allele_parts = cn_strings[pair_index_by_clone[clone_id]].split("|")
            cn_a = int(allele_parts[0])
            cn_b = int(allele_parts[1])
            profile = scnt[clone_id]
            profile.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn_a)
            profile.set_cn_record(sid=sid, hap=Haplotype.B, cn=cn_b)
    return segments, scnt
Example #11
0
 def test_chromosome(self):
     """Check that a segment reports the chromosome of both of its end positions."""
     segment = Segment(start_position=self.position1, end_position=self.position2)
     for endpoint in (self.position1, self.position2):
         self.assertEqual(segment.chromosome, endpoint.chromosome)
Example #12
0
 def test_str(self):
     """Check ``str(Segment)`` before and after an explicit idx is assigned."""
     segment = Segment(start_position=self.position1, end_position=self.position2)
     # No idx set: the string form is the coordinate-style label.
     self.assertEqual(str(segment), "chr1:1-2")
     # Once an idx is assigned, the string form follows it.
     segment.idx = "idx"
     self.assertEqual(str(segment), "idx")
Example #13
0
def get_circa_segments_cna_fractions(segments,
                                     scnt,
                                     clone_id,
                                     window_size=10000000,
                                     chr_sizes=None,
                                     cna_type="ampl",
                                     haploid=False):
    """Compute, per fixed-size window, the length fraction covered by amplified or deleted segments.

    Each chromosome is tiled with windows of ``window_size``. The segments
    are refined so that window boundaries become segment boundaries, and every
    refined segment is assigned to the window containing it. A window's value
    is the summed ``segment.length / window.length`` over its segments whose
    copy number in clone ``clone_id`` qualifies for ``cna_type``.

    Note that the first and last input segment of a chromosome are modified
    in place when the windows extend beyond them.

    :param segments: iterable of segments
    :param scnt: dict of clone id -> copy number profile
    :param clone_id: clone whose copy numbers are evaluated
    :param window_size: window length used to tile each chromosome
    :param chr_sizes: optional dict of chromosome name -> end coordinate;
        chromosomes missing from it end at their last segment
    :param cna_type: ``"ampl"`` counts segments with any copy number above
        the base, ``"del"`` those with any below it
    :param haploid: if true, compare the combined copy number against a base
        of 2; otherwise compare haplotypes A and B each against a base of 1
    :return: dict of window segment -> fraction; windows that received no
        refined segment are absent
    """
    intra_result = defaultdict(list)
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    for chr_name in sorted(segments_by_chrs.keys()):
        segments_by_chrs[chr_name] = sorted(
            segments_by_chrs[chr_name],
            key=lambda s: (s.start_coordinate, s.end_coordinate))
    windows_by_chr = defaultdict(list)
    if chr_sizes is None:
        chr_sizes = {}
    for chr_name in set(chr_sizes.keys()) | set(segments_by_chrs.keys()):
        start = segments_by_chrs[chr_name][
            0].start_coordinate - 1 if chr_name in segments_by_chrs else 0
        e_default = segments_by_chrs[chr_name][
            -1].end_coordinate if chr_name in segments_by_chrs else 0
        end = chr_sizes.get(chr_name, e_default)
        windows_boundaries = list(range(start, end, window_size))
        # An empty range (end <= start) must not crash on [-1]; it then simply
        # produces no windows for this chromosome.
        if not windows_boundaries or windows_boundaries[-1] != end:
            windows_boundaries.append(end)
        for lb, rb in zip(windows_boundaries[:-1], windows_boundaries[1:]):
            segment = Segment.from_chromosome_coordinates(chromosome=chr_name,
                                                          start=lb + 1,
                                                          end=rb)
            windows_by_chr[chr_name].append(segment)
    windows = []
    for ws in windows_by_chr.values():
        chr_name = ws[0].chromosome
        # Stretch the outermost input segments to the window span (in place).
        # NOTE(review): this mutates the caller's segments - confirm intended.
        if chr_name in segments_by_chrs and segments_by_chrs[chr_name][
                0].start_coordinate > ws[0].start_coordinate:
            segments_by_chrs[chr_name][0].start_position.coordinate = ws[
                0].start_coordinate
        if chr_name in segments_by_chrs and segments_by_chrs[chr_name][
                -1].end_coordinate < ws[-1].end_coordinate:
            segments_by_chrs[chr_name][-1].end_position.coordinate = ws[
                -1].end_coordinate
        for w in ws:
            windows.append(w)
    # Window boundaries are passed as extra cut points to the refinement.
    positions = []
    for w in windows:
        if w.chromosome in segments_by_chrs:
            positions.append(w.start_position)
            positions.append(w.end_position)
    r_segments, r_scnt, _ = refined_scnt_with_adjacencies_and_telomeres(
        segments=segments, scnt=scnt, telomere_positions=positions)
    r_segments_by_chr = defaultdict(list)
    for s in r_segments:
        r_segments_by_chr[s.chromosome].append(s)
    for chr_name in windows_by_chr.keys():
        chr_windows = iter(windows_by_chr[chr_name])
        chr_segments = sorted(r_segments_by_chr[chr_name],
                              key=lambda s:
                              (s.start_coordinate, s.end_coordinate))
        current_window = next(chr_windows, None)
        for segment in chr_segments:
            # Move forward until the window can contain this segment's start.
            # Advancing BEFORE testing keeps the segment that triggers the
            # advance; it used to be dropped, so every window after the first
            # lost its first segment.
            while (current_window is not None and
                   segment.start_coordinate > current_window.end_coordinate):
                current_window = next(chr_windows, None)
            if current_window is None:
                break
            if segment.start_coordinate < current_window.start_coordinate:
                continue
            if segment_within_segment(inner_segment=segment,
                                      outer_segment=current_window):
                intra_result[current_window].append(segment)
    scnp: SegmentCopyNumberProfile = r_scnt[clone_id]
    result = {}
    for window, window_segments in intra_result.items():
        cna_fraction = 0
        for segment in window_segments:
            length_fraction = segment.length / window.length
            if haploid:
                cns = [
                    scnp.get_combined_cn(sid=segment.stable_id_non_hap,
                                         default=-1)
                ]
                base = 2
            else:
                cns = [
                    scnp.get_cn(sid=segment.stable_id_non_hap,
                                haplotype=Haplotype.A,
                                default=-1),
                    scnp.get_cn(sid=segment.stable_id_non_hap,
                                haplotype=Haplotype.B,
                                default=-1)
                ]
                base = 1
            # -1 is the lookup default, i.e. no copy number was recorded.
            if any(cn < 0 for cn in cns):
                print("something wrong with segment {s} in window {w}".format(
                    s=str(segment), w=str(window)))
            amplified = any(cn > base for cn in cns)
            deletions = any(cn < base for cn in cns)
            if cna_type == "ampl" and amplified:
                cna_fraction += length_fraction
            elif cna_type == "del" and deletions:
                cna_fraction += length_fraction
        result[window] = cna_fraction
    return result