def countMotifs(infile, motifs):
    '''Locate regular-expression motifs on both strands of FASTA records.

    Arguments
    ---------
    infile :
        Input handed to ``FastaIterator.FastaIterator``.
    motifs :
        Iterable of ``(motif, pattern)`` pairs. Each ``pattern`` must
        provide ``finditer`` (e.g. a compiled regular expression).

    Returns
    -------
    list
        One ``(title, hits)`` tuple per record read. ``hits`` is a list of
        ``(motif, strand, start, end)`` tuples; strand is ``"+"`` for
        matches on the sequence as given and ``"-"`` for matches on its
        reverse complement. Reverse-strand coordinates are converted back
        to positions on the forward sequence.
    '''
    records = FastaIterator.FastaIterator(infile)
    positions = []

    while True:
        record = next(records, None)
        # stop when the iterator is exhausted or yields an empty record
        if not record:
            break

        forward = record.sequence
        reverse = Genomics.reverse_complement(forward)
        length = len(forward)

        hits = []
        for motif, pattern in motifs:
            hits.extend(
                (motif, "+", match.start(), match.end())
                for match in pattern.finditer(forward))
            # a match at [s, e) on the reverse complement covers
            # [length - e, length - s) on the forward sequence
            hits.extend(
                (motif, "-", length - match.end(), length - match.start())
                for match in pattern.finditer(reverse))

        positions.append((record.title, hits))

    return positions
def updateVariants(variants, lcontig, strand, phased=True):
    '''Convert variants to the coordinate system and strand of a transcript.

    Each input variant is turned into an ``ExtendedVariant`` whose fields
    are, in order: ``start``, ``end``, upper-cased reference, ``action``,
    ``has_wildtype`` and the list of variant sequences.

    ``action`` is one of:

    * ``"="`` - single-letter genotype (substitution)
    * ``"-"`` - deletion
    * ``"+"`` - insertion
    * ``">"`` - insertion and deletion at the same position, insertion
      allele listed first
    * ``"<"`` - insertion and deletion at the same position, deletion
      allele listed first

    Positions are used as given; the historical ``pos -= 1`` correction
    for 1-based coordinates is disabled.

    Arguments
    ---------
    variants :
        Iterable of objects with ``pos``, ``genotype`` and ``reference``
        attributes. ``genotype`` and ``reference`` may be ``str`` or
        ``bytes``.
    lcontig : int
        Contig length, used to mirror coordinates on the negative strand.
    strand :
        Strand of the transcript, interpreted by
        ``Genomics.IsPositiveStrand``.
    phased : bool
        If False, empty allele strings are removed from the variant
        sequences of multi-character genotypes.

    Returns
    -------
    list
        List of ``ExtendedVariant`` tuples, one per input variant.
    '''

    def _as_text(value):
        # All genotype/reference handling below uses ``str`` operations
        # (split("/"), "*" in ..., startswith("+")). Wrapping the values in
        # ``bytes()`` - as was done previously - raises TypeError under
        # Python 3 for str input and breaks the str operations for bytes
        # input, so normalise to text once instead.
        if isinstance(value, bytes):
            return value.decode("ascii")
        return str(value)

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = _as_text(variant.genotype)
        reference = _as_text(variant.reference)

        if len(genotype) == 1:
            # single-letter genotype code: substitution of one base
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:
            # alleles are separated by "/"; the first character of each
            # allele is its marker and is stripped from the sequence
            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max(len(x) for x in variantseqs)
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            # NOTE(review): a multi-character genotype containing neither
            # "+" nor "-" matches no branch below, so action/start/end keep
            # their values from the previous variant (or are unbound for
            # the first one) - confirm such genotypes cannot occur.
            if "+" in genotype and "-" in genotype:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [
                    x + "-" * (lvariant - len(x)) for x in variantseqs
                ]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.reverse_complement(reference)
            variantseqs = [
                Genomics.reverse_complement(x.upper()) for x in variantseqs
            ]
            start, end = lcontig - end, lcontig - start

        new_variants.append(
            ExtendedVariant._make((start, end, reference.upper(), action,
                                   has_wildtype, variantseqs)))

    return new_variants
def _best_diagonal(seq1, seq2, tuple_size):
    """Return ``(count, offset)`` of the best supported dotplot diagonal.

    Every k-mer of length *tuple_size* shared between *seq1* and *seq2*
    votes for the diagonal ``position_in_seq2 - position_in_seq1``. The
    diagonal with the most votes wins; ties are resolved in favour of the
    larger offset. Returns ``None`` if the sequences share no k-mer.
    """
    # index all k-mers of seq1, including the last one (hence the + 1)
    index = collections.defaultdict(list)
    for pos1 in range(len(seq1) - tuple_size + 1):
        index[seq1[pos1:pos1 + tuple_size]].append(pos1)

    offsets = collections.defaultdict(int)
    for pos2 in range(len(seq2) - tuple_size + 1):
        # .get() avoids inserting empty lists for k-mers absent from seq1
        for pos1 in index.get(seq2[pos2:pos2 + tuple_size], ()):
            offsets[pos2 - pos1] += 1

    if not offsets:
        return None

    # comparing (count, offset) tuples picks the highest count and, among
    # equal counts, the largest offset
    return max((count, offset) for offset, count in offsets.items())


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two fastq files as positional arguments. With method
    ``join`` (the only choice), the reads of both files are paired up in
    order. For each pair the second read is reverse-complemented, the best
    dotplot diagonal between the two reads is determined, and a merged
    copy of the first read is written to ``options.stdout``.

    Pairs that share no k-mer have no diagonal; they are skipped and
    counted as ``skipped``.

    Raises ValueError if the number of positional arguments is not two.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    # named ``counter`` so that no loop variable can overwrite it
    counter = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        tuple_size = 2
        # the context managers make sure both inputs are closed again
        with iotools.open_file(fn1) as inf1, \
                iotools.open_file(fn2) as inf2:
            for left, right in zip(Fastq.iterate(inf1),
                                   Fastq.iterate(inf2)):
                counter.input += 1

                s1, q1 = left.seq, left.quals
                # bring the second read onto the strand of the first;
                # qualities are reversed to stay aligned with the bases
                s2 = Genomics.reverse_complement(right.seq)
                q2 = right.quals[::-1]

                best = _best_diagonal(s1, s2, tuple_size)
                if best is None:
                    counter.skipped += 1
                    E.debug('%s: no shared %i-mers, pair skipped' %
                            (left.identifier, tuple_size))
                    continue

                _count, max_offset = best
                E.debug('%s: maximum offset at %i' %
                        (left.identifier, max_offset))

                # NOTE(review): the merge arithmetic below is unchanged;
                # verify that ``take`` matches the intended overlap
                # geometry for negative offsets.
                # simple merge sequence
                take = len(s2) - max_offset
                merged_seq = s1 + s2[take:]

                # simple merge quality scores
                merged_quals = q1 + q2[take:]

                new_entry = copy.copy(left)
                new_entry.seq = merged_seq
                new_entry.quals = merged_quals
                # a stream's write() needs text, not a record object;
                # assumes str(record) is its fastq representation
                # - TODO confirm
                outfile.write("%s\n" % new_entry)
                counter.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(counter))
    E.stop()
"utron_size")) for utron in bedfile: ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2) ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end) if utron.strand == "+": splice_site_dict[utron.name] = (ss5_sequence, ss3_sequence) if ":" in utron.name: transcript_id = utron.name.split(":")[0] match_transcript_id = utron.name.split(":")[1] outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript_id, utron.strand, ss5_sequence, ss3_sequence, utron.contig, utron.start, utron.end, utron.end-utron.start)) else: outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (utron.name, utron.strand, ss5_sequence, ss3_sequence, utron.contig, utron.start, utron.end, utron.end-utron.start)) elif utron.strand == "-": ss5_sequence = Genomics.reverse_complement(ss5_sequence) ss3_sequence = Genomics.reverse_complement(ss3_sequence) splice_site_dict[utron.name] = (ss3_sequence, ss5_sequence) if ":" in utron.name: transcript_id = utron.name.split(":")[0] match_transcript_id = utron.name.split(":")[1] outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript_id, utron.strand, ss3_sequence, ss5_sequence, utron.contig, utron.end, utron.start, utron.end-utron.start)) else: outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (utron.name, utron.strand, ss3_sequence, ss5_sequence, utron.contig, utron.end, utron.start, utron.end-utron.start)) outfile.close() from collections import defaultdict counts = defaultdict(int) for name, ss in splice_site_dict.items():
def getSequence(self, contig, strand="+", start=0, end=0, converter=None,
                as_array=False):
    """Return the sequence of a genomic interval.

    The interval is given by *contig*, *strand*, *start* and *end*. An
    *end* of 0 means "up to the end of the contig".

    Coordinates are translated by *converter* if given, otherwise by
    ``self.mConverter`` if set. The converter is called as
    ``converter(start, end, is_forward, contig_length)`` and returns the
    ``(first, last)`` positions to read. Without a converter, *start* and
    *end* are used as they are and mirrored on the contig length for
    reverse strands (``"-"``, ``"0"``, ``"-1"``).

    Sequence read for a reverse strand is reverse-complemented.

    Return value:

    * ``""`` if the resulting interval is empty,
    * the result of ``self.mTranslator.translate`` if a translator is set,
    * the ``AString`` object if *as_array* is true,
    * otherwise a python string (ascii-decoded).

    Raises ValueError if *end* exceeds the contig length or *start* is
    negative.
    """
    contig = self.getToken(contig)
    entry = self.mIndex[contig]

    # The second index field is
    # -> the sequence offset for seekable streams
    # -> the block size for unseekable streams
    try:
        _pos_id, offset_or_block, lsequence = struct.unpack("QQi", entry)
    except (struct.error, TypeError):
        # index entry is not packed, but a 4-element sequence
        _pos_id, offset_or_block, lsequence, _points = entry

    if end == 0:
        end = lsequence

    if end > lsequence:
        raise ValueError("3' coordinate on %s out of bounds: %i > %i" %
                         (contig, end, lsequence))
    if start < 0:
        raise ValueError("5' coordinate on %s out of bounds: %i < 0" %
                         (contig, start))

    strand_code = str(strand)
    is_forward = strand_code in ("+", "1")
    is_reverse = strand_code in ("-", "0", "-1")

    # an explicit converter takes precedence over the instance default
    coordinate_converter = converter or self.mConverter
    if coordinate_converter:
        first_pos, last_pos = coordinate_converter(
            start, end, is_forward, lsequence)
    elif is_reverse:
        first_pos, last_pos = lsequence - end, lsequence - start
    else:
        first_pos, last_pos = start, end

    if first_pos == last_pos:
        return ""

    assert first_pos < last_pos, \
        "first position %i is larger than last position %i " % \
        (first_pos, last_pos)

    p = AString()
    if self.mNoSeek:
        # read directly from position
        raw = self.mDatabaseFile.read(
            offset_or_block, entry[3], first_pos, last_pos)
    else:
        # shift contig-relative positions by the sequence offset
        file_start = first_pos + offset_or_block
        file_end = last_pos + offset_or_block
        self.mDatabaseFile.seek(file_start)
        raw = self.mDatabaseFile.read(file_end - file_start)
    p.fromstring(raw)

    if is_reverse:
        p = AString(Genomics.reverse_complement(str(p)))

    if self.mTranslator:
        return self.mTranslator.translate(p)
    if as_array:
        return p
    return p.tostring().decode("ascii")