def _region_to_seqs(self, track, extend_up=0, extend_down=0):
    """Yield a ``Sequence`` for every ``chrom:start-end`` region in ``track``.

    ``track`` is either a list of region strings or the path of a text file
    with one region per line. For each region the start is incremented by
    one, then moved ``extend_up`` positions down; the end is moved
    ``extend_down`` positions up. The result of ``self.get_seq`` for these
    coordinates is wrapped in a ``Sequence`` named after the region string.
    """
    chunk_hint = 10000  # size hint handed to ``readlines`` for each batch

    def fetch(region):
        # Split "chrom:start-end" into its parts and apply the extensions.
        chrom, span = region.split(":")
        first, last = [int(pos) for pos in span.split("-")]
        first = first + 1 - extend_up
        last = last + extend_down
        record = self.get_seq(chrom, first, last)
        return Sequence(region, record.seq)

    if isinstance(track, list):
        # Region strings from a list are used as given (not stripped).
        for region in track:
            yield fetch(region)
        return

    with open(track) as handle:
        # Read the file in batches until ``readlines`` returns an empty list.
        for batch in iter(lambda: handle.readlines(chunk_hint), []):
            for raw in batch:
                yield fetch(raw.strip())
def _variant_to_sequence(variants):
    """
    Convert variant objects into pairs of `pyfaidx.Sequence` objects.

    For every variant a ``(ref, alt)`` tuple is yielded. Both sequences are
    named after ``v.chrom`` and start at ``v.start``; each one ends at the
    start plus the length of its own allele string (``v.ref`` / ``v.alt``).
    """
    for variant in variants:
        chrom = variant.chrom
        begin = variant.start
        ref_allele = variant.ref
        alt_allele = variant.alt

        reference = Sequence(
            name=chrom,
            seq=ref_allele,
            start=begin,
            end=begin + len(ref_allele),
        )
        alternative = Sequence(
            name=chrom,
            seq=alt_allele,
            start=begin,
            end=begin + len(alt_allele),
        )
        yield reference, alternative
def test_interval_seq_builder_concat(interval_seq_builder):
    """Check ``concat`` before and after ``restore`` on the builder fixture."""
    # Calling concat() on the un-restored builder is expected to raise.
    with pytest.raises(TypeError):
        interval_seq_builder.concat()

    reference = Sequence(seq='CCCCATCGNN', start=10, end=20)
    interval_seq_builder.restore(reference)

    # The expected string keeps 'TAGC' in positions 14-18 where the
    # restored sequence has 'ATCG'.
    joined = interval_seq_builder.concat()
    assert joined == 'CCCCTAGCNN'
def interval_seq_builder():
    """Return an ``IntervalSeqBuilder`` covering chr1 positions 10 to 20.

    The builder holds three ``Interval`` entries and one ready-made
    ``Sequence`` ('TAGC' at 14-18) between them.
    """
    parts = [
        Interval('chr1', 10, 13),
        Interval('chr1', 13, 14),
        Sequence(seq='TAGC', start=14, end=18),
        Interval('chr1', 18, 20),
    ]
    return IntervalSeqBuilder(parts)
def _bed_to_seqs(self, track, stranded=False, extend_up=0, extend_down=0):
    """Yield one ``Sequence`` per feature of the BED file ``track``.

    Parameters
    ----------
    track : str
        Path of a tab-separated BED file. Lines starting with ``#`` or
        ``track`` and blank lines are skipped.
    stranded : bool, optional
        When true and column 6 equals ``"-"``, the feature is fetched with
        ``rc=True`` and the up/down extensions swap sides.
    extend_up, extend_down : int, optional
        Number of positions added before the first block / after the last
        block (sides swapped when ``rc`` is true).

    Yields
    ------
    Sequence
        Named ``chrom:start-end`` (plus the column-4 name when present),
        holding the ``.seq`` of ``self.get_spliced_seq``.

    Raises
    ------
    ValueError
        If column 2 or 3 (or a BED12 block value) is not an integer.
    IndexError
        If a feature line has fewer than three columns.
    """
    with open(track) as fin:
        # Iterating the handle streams the file line by line.
        for line in fin:
            if line.startswith(("#", "track")):
                continue
            if not line.strip():
                # A blank line holds no feature; skip it instead of failing
                # on the missing columns.
                continue

            vals = line.strip().split("\t")
            chrom = vals[0]
            start, end = int(vals[1]), int(vals[2])

            # The strand column is optional; without it nothing is reversed.
            rc = bool(stranded) and len(vals) >= 6 and vals[5] == "-"

            starts = [start]
            ends = [end]

            # BED12
            if len(vals) == 12:
                # Ignore empty items so the lists parse the same with or
                # without a trailing comma.
                offsets = [int(x) for x in vals[11].split(",") if x]
                sizes = [int(x) for x in vals[10].split(",") if x]
                starts = [start + offset for offset in offsets]
                ends = [
                    block_start + size
                    for block_start, size in zip(starts, sizes)
                ]

            name = "{}:{}-{}".format(chrom, start, end)
            if len(vals) >= 4:
                name = " ".join((name, vals[3]))

            # Every block start is shifted by one before fetching.
            starts = [block_start + 1 for block_start in starts]

            # extend
            if extend_up:
                if rc:
                    ends[-1] += extend_up
                else:
                    starts[0] -= extend_up
            if extend_down:
                if rc:
                    starts[0] -= extend_down
                else:
                    ends[-1] += extend_down

            intervals = zip(starts, ends)
            seq = self.get_spliced_seq(chrom, intervals, rc)
            yield Sequence(name, seq.seq)
def locate(args):
    """Write every occurrence of the given k-mers in a FASTA file to a table.

    Reads from ``args``:

    * ``kmer`` - comma-separated k-mer strings,
    * ``db``   - path handed to ``Fasta``,
    * ``fg``   - optional path of a list file; ``''`` means "use every
      record in ``db``". The first whitespace-separated token of each line
      is taken as a record id; blank lines, a ``gid`` header token and ids
      missing from ``db`` are skipped,
    * ``out``  - path of the tab-separated output file.

    The output has the columns ``kmer, sid, start, end, srd``. ``start`` is
    the match start plus one and ``end`` is the match end. ``srd`` is ``+``
    when the matched text is one of the given k-mers and ``-`` otherwise
    (i.e. it matched one of their reverse complements).
    """
    kmers, fd, fo = args.kmer, args.db, args.out
    fg = args.fg
    db = Fasta(fd)

    kseqs = kmers.split(',')
    kseqs2 = [
        Sequence(name='kmer', seq=kseq).reverse.complement.seq
        for kseq in kseqs
    ]
    # One alternation over the k-mers and their reverse complements. The
    # k-mers are inserted as-is (not escaped), so they are read as regex.
    ptn = re.compile("|".join("(" + k + ")" for k in kseqs + kseqs2))
    forward = set(kseqs)

    if fg != '':
        seqs = []
        # The context manager closes the list file even if a line fails.
        with open(fg, 'r') as fhg:
            for line in fhg:
                line = line.rstrip("\n")
                if not line:
                    continue
                gid = line.split()[0]
                if gid == 'gid':
                    continue
                if gid not in db:
                    continue
                seqs.append(gid)
    else:
        seqs = db.keys()

    with open(fo, 'w') as fho:
        fho.write('kmer\tsid\tstart\tend\tsrd\n')
        for seqid in seqs:
            seq = db[seqid][0:].seq
            # finditer reports non-overlapping matches only.
            for m in ptn.finditer(seq):
                start, end = m.start() + 1, m.end()
                srd = "+" if m.group(0) in forward else "-"
                fho.write(f"{m.group(0)}\t{seqid}\t{start}\t{end}\t{srd}\n")
def _variant_to_sequence(self, variants):
    """
    Convert `cyvcf2.Variant` objects into pairs of `pyfaidx.Sequence` objects.

    For every variant a ``(ref, alt)`` tuple is yielded. Both sequences are
    named after ``v.CHROM`` and start at ``v.start``; each one ends at the
    start plus the length of its own allele. Only the first entry of
    ``v.ALT`` is used.
    """
    for variant in variants:
        chrom = variant.CHROM
        begin = variant.start
        ref_allele = variant.REF
        # TODO: consider alternative alleles beyond the first one.
        alt_allele = variant.ALT[0]

        reference = Sequence(
            name=chrom,
            seq=ref_allele,
            start=begin,
            end=begin + len(ref_allele),
        )
        alternative = Sequence(
            name=chrom,
            seq=alt_allele,
            start=begin,
            end=begin + len(alt_allele),
        )
        yield reference, alternative
def _regions_to_seqs(self, track, extend_up=0, extend_down=0):
    """Yield a ``Sequence`` for every region in ``track``.

    ``track`` is either a list of region strings or the path of a text file
    with one region per line. Each entry is stripped of surrounding
    whitespace, passed to ``self._region_to_seq`` together with
    ``extend_up`` and ``extend_down``, and the result is wrapped in a
    ``Sequence`` named after the stripped region string.

    A file is streamed line by line, so memory use does not grow with the
    number of regions.
    """

    def convert(regions):
        # Shared by both input kinds: list items and file lines.
        for region in regions:
            name = region.strip()
            seq = self._region_to_seq(name, extend_up, extend_down)
            yield Sequence(name, seq)

    if isinstance(track, list):
        yield from convert(track)
    else:
        with open(track) as fin:
            # Iterate the handle directly instead of growing a list of
            # lines while looping over it.
            yield from convert(fin)
def test__split_overlapping(variant_seq_extractor):
    """Check ``_split_overlapping`` for a longer-ref and a longer-alt pair."""
    # First element 'AAA' (3-6), second element 'T' (3-4), split at 5.
    longer_first = (
        Sequence(seq='AAA', start=3, end=6),
        Sequence(seq='T', start=3, end=4),
    )
    pieces = list(variant_seq_extractor._split_overlapping([longer_first], 5))
    assert (pieces[0][0].seq, pieces[0][1].seq) == ('AA', 'T')
    assert (pieces[1][0].seq, pieces[1][1].seq) == ('A', '')

    # First element 'TT' (3-5), second element 'AAA' (3-6), split at 4.
    longer_second = (
        Sequence(seq='TT', start=3, end=5),
        Sequence(seq='AAA', start=3, end=6),
    )
    pieces = list(variant_seq_extractor._split_overlapping([longer_second], 4))
    assert (pieces[0][0].seq, pieces[0][1].seq) == ('T', 'A')
    assert (pieces[1][0].seq, pieces[1][1].seq) == ('T', 'AA')
def _bed_to_seqs(self, track, stranded=False, extend_up=0, extend_down=0):
    """Yield one ``Sequence`` per feature of the BED file ``track``.

    Parameters
    ----------
    track : str
        Path of a tab-separated BED file. Lines starting with ``#`` or
        ``track`` and blank lines are skipped.
    stranded : bool, optional
        When true and column 6 equals ``"-"``, the feature is fetched with
        ``rc=True`` and the up/down extensions swap sides.
    extend_up, extend_down : int, optional
        Number of positions added before the first block / after the last
        block (sides swapped when ``rc`` is true).

    Yields
    ------
    Sequence
        Named ``chrom:start-end`` (plus the column-4 name when present),
        holding the ``.seq`` of ``self.get_spliced_seq``.

    Raises
    ------
    ValueError
        If column 2 or 3 (or a BED12 block value) is not an integer.
    IndexError
        If a feature line has fewer than three columns.
    """
    with open(track) as fin:
        # Stream the file; nothing is accumulated between features.
        for line in fin:
            if line.startswith(("#", "track")):
                continue
            if not line.strip():
                # A blank line holds no feature; skip it instead of failing
                # on the missing columns.
                continue

            vals = line.strip().split("\t")
            chrom, start, end = str(vals[0]), int(vals[1]), int(vals[2])
            name = f"{chrom}:{start}-{end}"

            # there might be more...
            starts = [start]
            ends = [end]

            # BED4: add name column to name
            if len(vals) >= 4:
                name = " ".join((name, vals[3]))

            # BED6: check strandedness (strand is the sixth column)
            rc = False
            if stranded and len(vals) >= 6:
                rc = vals[5] == "-"

            # BED12: get all blocks
            if len(vals) >= 12:
                # Ignore empty items so the lists parse the same with or
                # without a trailing comma.
                offsets = [int(x) for x in vals[11].split(",") if x]
                sizes = [int(x) for x in vals[10].split(",") if x]
                starts = [start + offset for offset in offsets]
                ends = [
                    block_start + size
                    for block_start, size in zip(starts, sizes)
                ]

            # convert to 1-based counting
            starts = [block_start + 1 for block_start in starts]

            # extend
            if extend_up:
                if rc:
                    ends[-1] += extend_up
                else:
                    starts[0] -= extend_up
            if extend_down:
                if rc:
                    starts[0] -= extend_down
                else:
                    ends[-1] += extend_down

            intervals = zip(starts, ends)
            seq = self.get_spliced_seq(chrom, intervals, rc)
            yield Sequence(name, seq.seq)
def get_spliced_seq(self, name, intervals, rc=False):
    """Return one sequence joined from several intervals of record ``name``.

    ``intervals`` is an iterable of ``(start, end)`` pairs; each pair is
    passed unchanged to ``self.faidx.fetch``. The returned ``Sequence``
    takes its ``start`` from the first fetched chunk and its ``end`` from
    the last one. With ``rc`` set, the chunks are joined in reverse order
    and each chunk is negated (``-chunk``) first.

    NOTE(review): the earlier docstring described the coordinates as
    0-based, end-exclusive; confirm against what ``faidx.fetch`` expects.
    """
    # Get sequence for all intervals
    pieces = []
    for begin, stop in intervals:
        pieces.append(self.faidx.fetch(name, begin, stop))

    first_piece = pieces[0]
    last_piece = pieces[-1]

    if rc:
        # reverse complement: last chunk first, each chunk negated
        parts = [(-piece).seq for piece in reversed(pieces)]
    else:
        parts = [piece.seq for piece in pieces]

    return Sequence(
        name=name,
        seq="".join(parts),
        start=first_piece.start,
        end=last_piece.end,
    )
def regions_to_seqs(self, track, extend_up=0, extend_down=0):
    """Yield a ``Sequence`` for every parsable region in ``track``.

    ``track`` is either an iterable of region lines or a string, in which
    case it is handed to ``parse_file`` to obtain the lines. The first
    whitespace-separated token of each line is the region name. Regions
    rejected by ``bad_coords`` are skipped silently; regions for which
    ``bad_coords`` or ``region_to_seq`` raise ``ValueError`` or
    ``IndexError`` are skipped with a logged warning.
    """
    from_file = isinstance(track, str)
    # if track is a file, loop over its lines
    records = parse_file(track) if from_file else track

    for record in records:
        name = record.split()[0]
        try:
            if bad_coords(name, track):
                continue
            seq = region_to_seq(self, name, extend_up, extend_down)
        except (ValueError, IndexError):
            if from_file:
                msg = f"Skipping region from '{os.path.basename(track)}' that cannot be parsed: '{name}'"
            else:
                msg = f"Skipping region that cannot be parsed: '{name}'"
            logger.warning(msg)
            continue

        yield Sequence(name, seq)
def bed_to_seq(self, vals, stranded=False, extend_up=0, extend_down=0):
    """Turn the columns of one BED feature into a ``Sequence``.

    Parameters
    ----------
    vals : list of str
        Columns of a single BED line (at least chrom, start, end).
    stranded : bool, optional
        When true and column 6 equals ``"-"``, the feature is fetched with
        ``rc=True`` and the up/down extensions swap sides.
    extend_up, extend_down : int, optional
        Number of positions added before the first block / after the last
        block (sides swapped when ``rc`` is true).

    Returns
    -------
    Sequence
        Named ``chrom:start-end`` (plus the column-4 name when present),
        holding the ``.seq`` of ``self.get_spliced_seq``.

    Raises
    ------
    ValueError
        If column 2 or 3 (or a BED12 block value) is not an integer.
    IndexError
        If ``vals`` has fewer than three items.
    """
    chrom, start, end = str(vals[0]), int(vals[1]), int(vals[2])
    name = f"{chrom}:{start}-{end}"

    # there might be more...
    starts = [start]
    ends = [end]

    # BED4: add name column to name
    if len(vals) >= 4:
        name = " ".join((name, vals[3]))

    # BED6: check strandedness (strand is the sixth column)
    rc = False
    if stranded and len(vals) >= 6:
        rc = vals[5] == "-"

    # BED12: get all blocks
    if len(vals) >= 12:
        # Ignore empty items so the lists parse the same with or without a
        # trailing comma.
        offsets = [int(x) for x in vals[11].split(",") if x]
        sizes = [int(x) for x in vals[10].split(",") if x]
        starts = [start + offset for offset in offsets]
        ends = [block_start + size for block_start, size in zip(starts, sizes)]

    # convert to 1-based counting
    starts = [block_start + 1 for block_start in starts]

    # extend
    if extend_up:
        if rc:
            ends[-1] += extend_up
        else:
            starts[0] -= extend_up
    if extend_down:
        if rc:
            starts[0] -= extend_down
        else:
            ends[-1] += extend_down

    intervals = zip(starts, ends)
    seq = self.get_spliced_seq(chrom, intervals, rc).seq
    return Sequence(name, seq)
def test_interval_seq_builder_restore(interval_seq_builder):
    """Check what ``restore`` puts into each entry of the builder fixture."""
    sequence = Sequence(seq='CCCCATCGTT', start=10, end=20)
    interval_seq_builder.restore(sequence)

    # The four fixture entries; the third keeps its own 'TAGC'.
    for index, expected in enumerate(['CCC', 'C', 'TAGC', 'TT']):
        assert interval_seq_builder[index].seq == expected

    # Intervals lying before (5-10) or after (20-25) the sequence, and
    # intervals whose start exceeds their end, are expected to restore to
    # an empty string.
    extra_bounds = [(5, 10), (20, 25), (10, 5), (25, 20)]
    for index, (begin, stop) in enumerate(extra_bounds, start=4):
        interval_seq_builder.append(Interval('chr1', begin, stop))
        interval_seq_builder.restore(sequence)
        assert interval_seq_builder[index].seq == ''
def read_kmer(fk, nfea):
    """Read k-mer features from a whitespace-separated table.

    Parameters
    ----------
    fk : str
        Path of the table. Each data line needs at least eight fields; the
        first is an integer rank, the sixth a feature id and the eighth a
        comma-separated list of k-mers. Blank lines and the header line
        (first field ``i``) are skipped.
    nfea : str
        ``"top<N>"`` (for example ``"top5"`` or ``"top100"``) stops reading
        at the first line whose rank exceeds ``N``. Any other value reads
        the whole file.

    Returns
    -------
    list
        One ``[fid, pattern, kmer_set]`` entry per line read. ``pattern``
        is a regex alternation over the k-mers and their reverse
        complements; ``kmer_set`` is the set of the k-mers as given.

    Raises
    ------
    ValueError
        If a data line has fewer than eight fields or a non-integer rank.
    """
    # Accept any "top<N>" instead of a fixed list of N values.
    limit = None
    if isinstance(nfea, str) and nfea.startswith('top') and nfea[3:].isdecimal():
        limit = int(nfea[3:])

    kms = []
    # The context manager closes the file even if a line fails to parse.
    with open(fk, 'r') as fhk:
        for line in fhk:
            line = line.rstrip("\n")
            if not line:
                continue
            # Only rank, feature id and k-mers are used; the remaining
            # fields are unpacked to keep the eight-field requirement.
            rank, _, _, _, _, fid, _, kmers = line.split()[:8]
            if rank == 'i':
                continue
            rank = int(rank)
            if limit is not None and rank > limit:
                break
            kseqs = kmers.split(',')
            kseqs2 = [
                Sequence(name='kmer', seq=kseq).reverse.complement.seq
                for kseq in kseqs
            ]
            ptn = "|".join(["(" + k + ")" for k in kseqs + kseqs2])
            kms.append([fid, ptn, set(kseqs)])
    return kms
from pyfaidx import Sequence, complement
from nose.tools import assert_raises, raises

# Shared fixture: a 51-base sequence labelled with coordinates 100-150.
seq = Sequence(
    name='gi|557361099|gb|KF435150.1|',
    seq='TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA',
    start=100,
    end=150,
)

# Same coordinates, but the string contains a 'P'; taking its complement is
# expected to raise ValueError (see test_seq_invalid).
seq_invalid = Sequence(
    name='gi|557361099|gb|KF435150.1|',
    seq='TTGAAGATTTPGCATGCAGCAGGTGCGCAAGGTGAAATNTTCACTGTTAAA',
    start=100,
    end=150,
)

# Plain strings; not referenced by the tests visible in this section.
comp_valid = 'TTGAAGATTTnGCATGCAGCAGGtgccaAGGTGAAATGTTNACTGTTAAA'
comp_invalid = 'TTGAAGATTTnGCATGCAGCPQGtgccaAGGTGAAATGTTNACTGTTAAA'


def test_negate():
    """Negation gives the same string as the reversed complement."""
    negated = -seq
    reversed_complement = seq.complement[::-1]
    assert str(negated) == str(reversed_complement)


def test_negate_metadata():
    """Negation gives the same repr as the reversed complement."""
    # Negate should affect __repr__ the same way as reverse and complement
    seq_neg = -seq
    assert repr(seq_neg) == repr(seq.complement[::-1])


def test_seq_invalid():
    """Complementing the invalid sequence raises ValueError."""
    assert_raises(ValueError, getattr, seq_invalid, 'complement')


def test_integer_index():
    """Integer indexing returns a one-base sequence."""
    base = seq[1]
    assert base.seq == 'T'


def test_slice_index():
    """Slicing returns the matching part of the string."""
    head = seq[0:10]
    assert head.seq == 'TTGAAGATTT'


@raises(ValueError)
def test_check_coordinates():
    """Slicing a sequence labelled 100-110 but 51 bases long raises."""
    mismatched = Sequence(
        name='gi|557361099|gb|KF435150.1|',
        seq='TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA',
        start=100,
        end=110,
    )
    mismatched[:]
def _fetch(self, interval, istart, iend):
    """Extract ``istart``-``iend`` on ``interval.chrom`` as a ``Sequence``.

    Only the chromosome of ``interval`` is used; the region is extracted
    through ``self.ref_seq_extractor`` and wrapped with its coordinates.
    """
    # fetch interval, ignore strand: the new Interval carries only the
    # chromosome and the requested bounds
    region = Interval(interval.chrom, istart, iend)
    bases = self.ref_seq_extractor.extract(region)
    return Sequence(name=interval.chrom, seq=bases, start=istart, end=iend)
def _fetch(self, interval, istart, iend):
    """Extract ``istart``-``iend`` on ``interval.chrom`` as a ``Sequence``.

    Only the chromosome of ``interval`` is used; the region is extracted
    through ``self.fasta`` and wrapped with its coordinates.
    """
    region = Interval(interval.chrom, istart, iend)
    bases = self.fasta.extract(region)
    return Sequence(name=interval.chrom, seq=bases, start=istart, end=iend)