def test_contig_coords(self):
    '''contig_coords() should report the coords of every gap-free stretch of a sequence'''
    Interval = genome_intervals.Interval
    # each case pairs a sequence with the contig intervals expected from it
    cases = [
        ('ACGT', [Interval(0, 3)]),
        ('NACGT', [Interval(1, 4)]),
        ('NNACGT', [Interval(2, 5)]),
        ('ACGTN', [Interval(0, 3)]),
        ('ACGTNN', [Interval(0, 3)]),
        ('NANNCGT', [Interval(1, 1), Interval(4, 6)]),
        ('ANNCGTNNAAAAA', [Interval(0, 0), Interval(3, 5), Interval(8, 12)]),
    ]
    for bases, expected in cases:
        found = fastn.Fasta('ID', bases).contig_coords()
        self.assertListEqual(expected, found)
def test_intersection(self):
    '''intersection() should give None for disjoint intervals, otherwise the overlap'''
    Interval = genome_intervals.Interval
    base = Interval(5, 10)
    overlapping = Interval(8, 15)
    disjoint = Interval(12, 20)
    self.assertEqual(base.intersection(disjoint), None)
    self.assertEqual(base.intersection(overlapping), Interval(8, 10))
def test_length_sum_from_list(self):
    '''length_sum_from_list() should add up the lengths of all intervals in the list'''
    Interval = genome_intervals.Interval
    intervals = [Interval(1, 2), Interval(4, 5), Interval(10, 19)]
    total = genome_intervals.length_sum_from_list(intervals)
    self.assertEqual(14, total)
def test_init(self):
    '''Interval() should raise Error for a non-int coordinate or when end < start'''
    bad_coords = [('a', 1), (1, 'a'), ('a', 'a'), (3, 2)]
    for start, end in bad_coords:
        with self.assertRaises(genome_intervals.Error):
            genome_intervals.Interval(start, end)
def test_comparisons(self):
    '''< and <= between intervals should behave as expected'''
    make = genome_intervals.Interval
    lower = (1, 2)
    upper = (2, 2)
    # a fresh Interval is built for every operand, so no instance is compared with itself
    self.assertTrue(make(*lower) < make(*upper))
    self.assertTrue(make(*lower) <= make(*upper))
    self.assertFalse(make(*upper) <= make(*lower))
    self.assertFalse(make(*upper) < make(*lower))
    self.assertFalse(make(*upper) < make(*upper))
def test_union_flll_gap(self):
    '''union_fill_gap() should span both intervals, whether or not they intersect'''
    Interval = genome_intervals.Interval
    a = Interval(5, 10)
    b = Interval(8, 15)
    c = Interval(12, 20)
    d = Interval(21, 22)
    cases = [
        (a, c, Interval(5, 20)),
        (a, b, Interval(5, 15)),
        (c, d, Interval(12, 22)),
    ]
    # the result must not depend on which interval the method is called on
    for first, second, expected in cases:
        self.assertEqual(first.union_fill_gap(second), expected)
        self.assertEqual(second.union_fill_gap(first), expected)
def gaps(self, min_length=1):
    '''Find the runs of N (either case) in self.seq that are at least min_length long.

    Returns a list of genome_intervals.Interval objects in order of position.
    Coordinates are zero-based, and the end coordinate is the last base of
    the run (inclusive).
    '''
    gaps = []
    regex = re.compile('N+', re.IGNORECASE)
    for m in regex.finditer(self.seq):
        run_start, run_end = m.span()
        # span() is half-open, so run_end - run_start is already the run
        # length; adding 1 would let runs one base too short through
        if run_end - run_start >= min_length:
            gaps.append(genome_intervals.Interval(run_start, run_end - 1))
    return gaps
def test_union(self):
    '''union() should give None or the correct union, whichever interval it is called on'''
    Interval = genome_intervals.Interval
    a = Interval(5, 10)
    b = Interval(8, 15)
    c = Interval(12, 20)
    d = Interval(21, 22)
    cases = [
        (a, c, None),
        (a, b, Interval(5, 15)),
        (c, d, Interval(12, 22)),
    ]
    for first, second, expected in cases:
        self.assertEqual(first.union(second), expected)
        self.assertEqual(second.union(first), expected)
def test_merge_overlapping_in_list(self):
    '''merge_overlapping_in_list() should merge the list in place correctly'''
    Interval = genome_intervals.Interval
    unmerged_coords = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]
    merged_coords = [(1, 2), (10, 60), (65, 70)]
    intervals = [Interval(start, end) for start, end in unmerged_coords]
    expected = [Interval(start, end) for start, end in merged_coords]
    genome_intervals.merge_overlapping_in_list(intervals)
    self.assertSequenceEqual(intervals, expected)
def contig_coords(self):
    '''Get the coords of all contigs, i.e. every stretch that self.gaps() does not report as a gap.

    Returns a list of genome_intervals.Interval objects using the same
    zero-based coordinates as the gaps. A sequence with no gaps gives one
    interval covering positions 0 to len(self) - 1.
    '''
    # contigs are the opposite of gaps, so work out the coords from the gap coords
    gaps = self.gaps()
    if not gaps:
        return [genome_intervals.Interval(0, len(self) - 1)]

    # NOTE(review): this walk assumes self.gaps() returns non-overlapping gaps
    # in ascending order of position - confirm against gaps()
    seq_length = len(self)
    contigs = []
    contig_start = 0
    for gap in gaps:
        # nothing precedes a gap that begins exactly where the contig would
        # (only possible for a gap at the very start of the sequence)
        if gap.start > contig_start:
            contigs.append(genome_intervals.Interval(contig_start, gap.start - 1))
        contig_start = gap.end + 1

    # whatever follows the last gap is the final contig; testing the start
    # position itself keeps a trailing contig that is only one base long
    if contig_start < seq_length:
        contigs.append(genome_intervals.Interval(contig_start, seq_length - 1))
    return contigs
def get_nucmer_hits(coords_file):
    '''Read the hits from a nucmer coords file and return them as merged intervals.

    Returns a tuple (ref_hits, qry_hits). Each is a dict mapping a sequence
    name to its list of genome_intervals.Interval objects, after the list has
    been passed to merge_overlapping_in_list().
    '''
    ref_hits = {}
    qry_hits = {}

    for hit in nucmer.file_reader(coords_file):
        # nucmer coords are 1-based, so shift them to the 0-based coords used
        # in here; sorting puts the smaller coordinate first
        ref_start, ref_end = sorted([hit.ref_start - 1, hit.ref_end - 1])
        ref_hits.setdefault(hit.ref_name, []).append(
            genome_intervals.Interval(ref_start, ref_end))

        qry_start, qry_end = sorted([hit.qry_start - 1, hit.qry_end - 1])
        qry_hits.setdefault(hit.qry_name, []).append(
            genome_intervals.Interval(qry_start, qry_end))

    for hits_by_name in (ref_hits, qry_hits):
        for intervals in hits_by_name.values():
            genome_intervals.merge_overlapping_in_list(intervals)

    return ref_hits, qry_hits
def file2regions(fname):
    '''Load regions from a whitespace-delimited file.

    Lines starting with '#' are skipped. Every other line must split into
    exactly three fields: sequence name, start and end.

    Returns a dict mapping each sequence name to a list of
    genome_intervals.Interval objects, in the order they appear in the file.

    Raises ValueError (from the unpacking) if a line does not have exactly
    three fields. The file is closed even when that happens.
    '''
    regions = {}
    f = utils.open_file_read(fname)
    try:
        for line in f:
            if line.startswith('#'):
                continue

            name, start, end = line.rstrip().split()
            # NOTE(review): start and end are handed over as the raw strings
            # from the file; presumably Interval converts them - confirm
            regions.setdefault(name, []).append(genome_intervals.Interval(start, end))
    finally:
        # close the handle even if a malformed line raised above
        utils.close(f)

    return regions
def test_intersects(self):
    '''intersects() should be False for disjoint intervals and True for overlapping ones'''
    Interval = genome_intervals.Interval
    query = Interval(5, 10)
    disjoint = [Interval(3, 4), Interval(11, 20)]
    overlapping = [
        Interval(3, 5),
        Interval(3, 6),
        Interval(9, 12),
        Interval(10, 12),
        Interval(6, 7),
        Interval(1, 20),
    ]

    for other in disjoint:
        result = query.intersects(other)
        message = 'shouldn\'t intersect: ' + str(query) + ', ' + str(other)
        self.assertFalse(result, message)

    for other in overlapping:
        result = query.intersects(other)
        message = 'should intersect: ' + str(query) + ', ' + str(other)
        self.assertTrue(result, message)
def test_gaps(self):
    '''gaps() should report the coords of every run of Ns in a sequence'''
    Interval = genome_intervals.Interval
    # each case pairs a sequence with the gap intervals expected from it
    cases = [
        ('ACGT', []),
        ('NACGT', [Interval(0, 0)]),
        ('NACGTN', [Interval(0, 0), Interval(5, 5)]),
        ('ANNCGT', [Interval(1, 2)]),
        ('NANNCGTNN', [Interval(0, 0), Interval(2, 3), Interval(7, 8)]),
    ]
    for bases, expected in cases:
        found = fastn.Fasta('ID', bases).gaps()
        self.assertListEqual(expected, found)
if not sam_record.is_forward_strand(): sam_record.cigar.reverse() hit_start = 1 hit_end = len(sam_record.seq) if sam_record.cigar.operations[0].operator == 'S': hit_start = sam_record.cigar.operations[0].number if sam_record.cigar.operations[-1].operator == 'S': hit_end = len(sam_record.seq) - sam_record.cigar.operations[-1].number if sam_record.id not in read_hit_coords: read_hit_coords[sam_record.id] = [] read_hit_coords[sam_record.id].append(genome_intervals.Interval(hit_start - 1, hit_end - 1)) external_progs.bwa_index_clean(bwa_index) os.unlink(bwa_sam) seq_reader = fastn.file_reader(options.reads_in) f_fa = utils.open_file_write(options.outprefix + '.fq') f_log = utils.open_file_write(options.outprefix + '.log') for seq in seq_reader: if seq.id not in read_hit_coords: print(seq, file=f_fa) print(seq.id, 'no hit', sep='\t', file=f_log) else: hits = read_hit_coords[seq.id]
d['step'] = int(step) return d delete_range = range2dic(options.delete_range) insert_range = range2dic(options.insert_range) # convert the -d regions into sequence name, start and end coords to_delete = {} if options.delete: for s in options.delete: id, coords = s.rsplit(':') start, end = [int(x) - 1 for x in coords.split('-')] if id not in to_delete: to_delete[id] = [] to_delete[id].append(genome_intervals.Interval(start, end)) to_insert = {} if options.insert: for s in options.insert: id, pos, bases = s.rsplit(':', 2) pos = int(pos) - 1 bases = int(bases) if id not in to_insert: to_insert[id] = [] to_insert[id].append((pos, bases)) assert len(to_delete) * len(to_insert) == 0 # merge overlapping regions to be deleted for l in to_delete.values():
def test_contains(self):
    '''contains() should be True only for intervals lying entirely inside the interval'''
    Interval = genome_intervals.Interval
    outer = Interval(5, 10)
    outside_coords = [(1, 2), (4, 5), (4, 10), (4, 11), (5, 11), (1, 2), (9, 11), (10, 11), (11, 20)]
    inside_coords = [(5, 5), (5, 10), (6, 7), (6, 10), (10, 10)]

    for start, end in outside_coords:
        other = Interval(start, end)
        result = outer.contains(other)
        message = 'shouldn\'t contain: ' + str(outer) + ', ' + str(other)
        self.assertFalse(result, message)

    for start, end in inside_coords:
        other = Interval(start, end)
        result = outer.contains(other)
        message = 'should contain: ' + str(outer) + ', ' + str(other)
        self.assertTrue(result, message)
def test_len(self):
    '''len() of an Interval should count both its start and end positions'''
    cases = [(1, 2, 2), (1, 1, 1), (10, 20, 11)]
    for start, end, expected in cases:
        self.assertEqual(len(genome_intervals.Interval(start, end)), expected)
# load hits into hash. key=ref_name, value=another hash with key=qry_name, value=list of hit positions in that ref seq nucmer_hits = {} contigs_to_print = {} nucmer_reader = nucmer.file_reader(nucmer_out_coords) for hit in nucmer_reader: if hit.ref_name not in nucmer_hits: nucmer_hits[hit.ref_name] = {} if hit.qry_name not in nucmer_hits[hit.ref_name]: nucmer_hits[hit.ref_name][hit.qry_name] = [] nucmer_hits[hit.ref_name][hit.qry_name].append( genome_intervals.Interval(min(hit.ref_start, hit.ref_end), max(hit.ref_start, hit.ref_end))) # merge all the overalpping hits for each list of hits corresponding to one contig for ref_name, d in nucmer_hits.items(): for qry_name, hits in d.items(): genome_intervals.merge_overlapping_in_list(hits) for hit in hits: if hit.end - hit.start + 1 >= options.min_seq_length: if ref_name not in contigs_to_print: contigs_to_print[ref_name] = [] contigs_to_print[ref_name].append(copy.copy(hit)) # remove any contigs that are completely contained in another contig for ref, l in contigs_to_print.items():
# get query sequence lengths and gap positions - add each gap coord to the # list of covered positions for each sequence for seq in seq_reader: assert seq.id not in seq_lengths seq_lengths[seq.id] = len(seq) covered_regions[seq.id] = seq.gaps() nucmer_reader = nucmer.file_reader(options.nucmer_coords) for hit in nucmer_reader: assert hit.qry_name in seq_lengths # gaps are stored with coords starting from zero. Nucmer starts at 1, so need to decrement the coords start, end = sorted([hit.qry_start - 1, hit.qry_end - 1]) covered_regions[hit.qry_name].append(genome_intervals.Interval(start, end)) # merge the covered regions for l in covered_regions.values(): genome_intervals.merge_overlapping_in_list(l) f = utils.open_file_write(options.outfile) # get the regions that are not covered for id, covered in covered_regions.items(): not_covered = [] if len(covered) == 0: not_covered = [[1, seq_lengths[id]]] else: if covered[0].start != 0:
def test_remove_contained_in_list(self):
    '''remove_contained_in_list() should drop the right elements from the list in place'''
    Interval = genome_intervals.Interval
    input_coords = [
        (1, 2), (4, 4), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
        (20, 25), (20, 24), (20, 26),
        (30, 38), (30, 37), (30, 36), (30, 35), (30, 35), (32, 33),
        (38, 50), (65, 70), (67, 70),
    ]
    kept_coords = [
        (1, 2), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
        (20, 26), (30, 38), (38, 50), (65, 70),
    ]
    intervals = [Interval(start, end) for start, end in input_coords]
    expected = [Interval(start, end) for start, end in kept_coords]
    genome_intervals.remove_contained_in_list(intervals)
    self.assertSequenceEqual(intervals, expected)
def test_intersection(self):
    '''intersection() should intersect two lists of intervals, in either argument order'''
    Interval = genome_intervals.Interval
    first = [Interval(1, 2), Interval(10, 20), Interval(51, 52), Interval(54, 55), Interval(57, 58)]
    second = [
        Interval(5, 6),
        Interval(9, 11),
        Interval(13, 14),
        Interval(17, 18),
        Interval(20, 25),
        Interval(50, 60),
    ]
    expected = [
        Interval(10, 11),
        Interval(13, 14),
        Interval(17, 18),
        Interval(20, 20),
        Interval(51, 52),
        Interval(54, 55),
        Interval(57, 58),
    ]
    for left, right in ((first, second), (second, first)):
        self.assertSequenceEqual(genome_intervals.intersection(left, right), expected)
'Makes a random genome with sequence lengths and names determined by an fai file. IMPORTANT: not really random, at the moment every base will be an A (or an N if --gaps_file used)', usage='%(prog)s [options] <fai file> <outfile>') parser.add_argument( '--gaps_file', help='File of gaps, each line in the form: "chr start end" (tab separated)' ) parser.add_argument('fai_file', help='Name of fai file') parser.add_argument('outfile', help='Name of output fasta file') options = parser.parse_args() gaps = {} if options.gaps_file: f = utils.open_file_read(options.gaps_file) for line in f: (id, start, end) = line.rstrip().split('\t') gap = genome_intervals.Interval(int(start) - 1, int(end) - 1) if id not in gaps: gaps[id] = [] gaps[id].append(gap) utils.close(f) f_in = utils.open_file_read(options.fai_file) f_out = utils.open_file_write(options.outfile) for line in f_in: a = line.rstrip().split() fa = fastn.Fasta(a[0], 'A' * int(a[1])) if fa.id in gaps: fa.seq = list(fa.seq) for gap in gaps[fa.id]: