def main():
    """Scan one test sequence, or every record of a FASTA file, against all PWMs.

    Options and positional arguments come from ``getoptions()``.  When
    ``opts.testseq`` is given, that single sequence is scanned; otherwise
    ``args[0]`` is read as a FASTA file and each record is scanned in turn.
    Sequences are back-transcribed to DNA first when ``opts.seqtype`` is
    ``'RNA'``.  Each scan result is written to stdout as tab-separated text;
    progress and timing messages go to stderr.
    """
    (opts, args) = getoptions()

    # Load PWMs
    pssms = load_motifs(opts.pwm_dir, opts.pseudocount)

    if opts.testseq is not None:
        if opts.seqtype == 'RNA':
            seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousRNA()).back_transcribe()
            # back_transcribe() result is relabelled explicitly as unambiguous DNA
            seq.alphabet = IUPAC.IUPACUnambiguousDNA()
        else:
            seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA())
        final = scan_all(pssms, seq, opts)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(final.to_csv(sep="\t", index=False))
    else:
        # Scan in sequence
        sys.stderr.write("Scanning sequences ")
        tic = time.time()
        # The context manager guarantees the FASTA handle is closed, even if
        # parsing or scanning raises part-way through the file.
        with open(args[0]) as handle:
            for seqrecord in SeqIO.parse(handle, "fasta"):
                seq = seqrecord.seq
                if opts.seqtype == "RNA":
                    seq = seq.back_transcribe()
                    seq.alphabet = IUPAC.IUPACUnambiguousDNA()
                final = scan_all(pssms, seq, opts)
                print(final.to_csv(sep="\t", index=False))
        toc = time.time()
        sys.stderr.write("done in %0.2f seconds!\n" % (toc - tic))
def _guess_consensus_alphabet(self, ambiguous):
    """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

    Starts from the un-gapped alignment alphabet, checks that every record's
    un-gapped alphabet is an instance of the same class, and then makes sure
    the ``ambiguous`` symbol is representable, widening the alphabet when it
    is not.

    Arguments:
     - ambiguous - the character used for ambiguous consensus positions.

    Returns an alphabet object: the alignment's base alphabet if it has no
    letter list or already contains ``ambiguous``; otherwise the wider IUPAC
    alphabet of the same molecule type if that contains ``ambiguous``, else
    the generic alphabet of that type, else the single letter alphabet.

    Raises ValueError if a record's alphabet is incompatible with the
    alignment's alphabet.
    """
    # Start with the (un-gapped version of) the alignment alphabet
    a = Alphabet._get_base_alphabet(self.alignment._alphabet)

    # Now check it's compatible with all the rest of the sequences
    for record in self.alignment:
        # Get the (un-gapped version of) the sequence's alphabet
        alt = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(alt, a.__class__):
            raise ValueError(
                "Alignment contains a sequence with an incompatible alphabet."
            )

    # Nothing to do when the alphabet defines no letter list, or when the
    # ambiguous character is already one of its valid letters.
    letters = getattr(a, "letters", None)
    if letters is None or ambiguous in letters:
        return a

    # We'll need to pick a more generic alphabet.  Each row is
    # (current alphabet class, wider IUPAC class to try, generic fallback).
    # The wider class must differ from the current one: re-testing the same
    # unambiguous alphabet could never succeed at this point.
    widening = (
        (IUPAC.IUPACUnambiguousDNA, IUPAC.IUPACAmbiguousDNA,
         Alphabet.generic_dna),
        (IUPAC.IUPACUnambiguousRNA, IUPAC.IUPACAmbiguousRNA,
         Alphabet.generic_rna),
        (IUPAC.IUPACProtein, IUPAC.ExtendedIUPACProtein,
         Alphabet.generic_protein),
    )
    for current_cls, wider_cls, generic in widening:
        if isinstance(a, current_cls):
            wider = wider_cls()
            if ambiguous in wider.letters:
                return wider
            return generic
    return Alphabet.single_letter_alphabet
def test_compute_background_1(self):
    """compute_background() on the fixture FASTAs yields the expected RNA letter frequencies."""
    rna_alphabet = IUPAC.IUPACUnambiguousRNA()
    observed = ms.compute_background(self.fastas, rna_alphabet, verbose=False)
    expected = {'A': 0.1944, 'C': 0.1388, 'U': 0.5277, 'G': 0.1388}
    # Compare each letter's frequency to three decimal places.
    for letter in expected:
        self.assertAlmostEqual(observed[letter], expected[letter], 3)
def main():
    """Run the motif scan for sequence and/or structure input and print the hits.

    Arguments come from ``getoptions()`` and the input type ('RNA', 'SS' or
    'RNASS') from ``_guess_seq_type()``.  For each applicable stage the input
    is either a test sequence popped from the comma-separated ``args.testseq``
    or a FASTA file from ``args.fastafiles``; a background is loaded only for
    file input.  With ``args.bgonly`` the background is printed and the
    program exits.  Otherwise the results (combined for 'RNASS') are written
    to stdout as tab-separated text, and the run time is reported via
    ``eprint``.
    """
    tic = time.time()
    args = getoptions()
    seq_type = _guess_seq_type(args)
    bg = None

    if args.testseq:
        # Reversed so that pop() hands out the comma-separated entries in order.
        testseq_stack = args.testseq.split(',')[::-1]

    ## Sequence
    if seq_type in ['RNA', 'RNASS']:
        if args.testseq:
            seq_file = SeqRecord(Seq(testseq_stack.pop()))
        else:
            seq_file = args.fastafiles[0]
            bg = load_background(args.bg_seq,
                                 args.uniform_background,
                                 seq_file,
                                 IUPAC.IUPACUnambiguousRNA(),
                                 not args.bgonly)
        if not args.bgonly:
            pssm = load_motif(args.pfm_seq,
                              args.pseudocount,
                              IUPAC.IUPACUnambiguousRNA(),
                              bg)
            seq_results = scan_main(seq_file, pssm,
                                    IUPAC.IUPACUnambiguousRNA(), bg, args)
        else:
            print(dict(bg))
            sys.exit()

    ## Structure
    if seq_type in ['SS', 'RNASS']:
        if args.testseq:
            struct_file = SeqRecord(Seq(testseq_stack.pop()))
        elif seq_type == 'SS':
            struct_file = args.fastafiles[0]
        else:
            # 'RNASS': the structure file follows the sequence file.
            struct_file = args.fastafiles[1]
        if not args.testseq:
            bg = load_background(args.bg_struct,
                                 args.uniform_background,
                                 struct_file,
                                 ContextualSecondaryStructure(),
                                 not args.bgonly)
        if not args.bgonly:
            pssm = load_motif(args.pfm_struct,
                              args.pseudocount,
                              ContextualSecondaryStructure(),
                              bg)
            struct_results = scan_main(struct_file, pssm,
                                       ContextualSecondaryStructure(), bg, args)
        else:
            print(dict(bg))
            sys.exit()

    # Pick the table to report.
    if seq_type == 'RNASS':
        results = combine(seq_results, struct_results)
    elif seq_type == 'RNA':
        results = seq_results
    else:
        results = struct_results

    # reset_index() returns a new frame rather than modifying in place, so the
    # result has to be kept; otherwise the call has no effect.
    results = results.reset_index(drop=True)
    _add_match_id(results)
    results.to_csv(sys.stdout, sep="\t", index=False)

    toc = time.time()
    runtime = float(toc - tic)
    if runtime > 60:
        eprint("Done in %0.4f minutes!" % (runtime / 60))
    else:
        eprint("Done in %0.4f seconds!" % (runtime))
def test_preprocessSeq_6(self):
    """preprocess_seq() with the RNA alphabet leaves a structure-alphabet record 'KHIL' unchanged."""
    record = SeqRecord(Seq('KHIL', ContextualSecondaryStructure()))
    processed = ms.preprocess_seq(record, IUPAC.IUPACUnambiguousRNA())
    self.assertEqual(str(processed), 'KHIL')
def test_preprocessSeq_5(self):
    """preprocess_seq() with the RNA alphabet leaves a single-letter-alphabet RNA string unchanged."""
    record = SeqRecord(Seq('GAUUACA', SingleLetterAlphabet()))
    processed = ms.preprocess_seq(record, IUPAC.IUPACUnambiguousRNA())
    self.assertEqual(str(processed), 'GAUUACA')
# Tutorial script: prints several IUPAC alphabets' letters, then walks through
# basic Seq operations (transcribe, translate, back-transcribe, slicing,
# conversion to str and to a mutable sequence).
print IUPAC.unambiguous_dna.letters  # DNA base letters
print IUPAC.unambiguous_rna.letters  # RNA base letters
print IUPAC.ambiguous_dna.letters  # IUPAC letters for DNA bases (ambiguous alphabet)
print IUPAC.ExtendedIUPACProtein.letters  # letters of the extended IUPAC protein alphabet
print IUPAC.ExtendedIUPACDNA.letters  # letters of the extended IUPAC DNA alphabet

from Bio.Seq import Seq
seq = Seq('CCGGTT',IUPAC.unambiguous_dna)
print seq
seq=seq.transcribe()  # DNA -> RNA; the sequence must be DNA to be transcribed
print seq
seq=seq.translate()  # translate to protein (seq is the transcribed RNA at this point)
print seq

# Sequence data type
seq=Seq('CCGGUU',IUPAC.IUPACUnambiguousRNA())  # alphabet passed as an instance of the IUPACUnambiguousRNA class
print seq
print seq.back_transcribe()  # RNA -> DNA; the sequence must be RNA to be back-transcribed
seq=Seq('ATGGTCTTTCCAGACGCG',IUPAC.unambiguous_dna)
print Seq.transcribe(seq)  # called through the class; above it was called as a method
print seq[:5]  # slicing works as it does on strings
print len(seq)
#seq[0]='C'  # not allowed: Seq objects are not mutable
st=str(seq)  # convert to a plain string
print st

# Mutable sequence data type
from Bio.Seq import MutableSeq
mut_seq=seq.tomutable()  # convert it to a mutable sequence