def test_remove_gaps(self):
    """Check that alignment only inserts gaps and returns one row per input.

    Builds a 100-symbol sequence from ``stats.sample_wr(translate.AAs(), 100)``
    plus nine variants produced by applying ``mut(x, 0.2, 0.1)`` to each
    symbol, aligns all ten with ``muscle.alignSequences``, and asserts that:

    * the number of aligned rows equals the number of input sequences, and
    * each aligned row equals its input once ``"-"`` characters are removed
      from both.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    # The row-count comparison used to be stored in an unused local and never
    # asserted, so a short result could pass silently.
    self.assertEqual(len(als), len(seqs))
    for (i, original) in enumerate(seqs):
        # Strip gaps from both sides: mutated inputs may contain "-" too.
        self.assertEqual(als[i].replace("-", ''), original.replace("-", ''))
def test002():
    """Check that aligning with the hard-coded ``exepath`` raises MuscleError.

    Builds the same ten-sequence input as the other tests (one sampled
    sequence plus nine ``mut``-derived variants) and calls
    ``muscle.alignSequences`` with ``exepath`` set to the expanded path
    ``~/develop/muscle3.8.13/muscle``.
    NOTE(review): presumably that path is expected not to be a usable
    executable -- confirm against the muscle module.

    Returns:
        True if ``muscle.MuscleError`` was raised, False otherwise.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    exepath = os.path.expanduser("~/develop/muscle3.8.13/muscle")
    try:
        muscle.alignSequences(seqs, exepath=exepath)
    except muscle.MuscleError:
        # "except X, name" is Python 2-only syntax; the plain form works on
        # both Python 2.6+ and Python 3.
        return True
    # The outcome used to be computed and then dropped; return it so callers
    # can see whether the expected error occurred.
    return False
def test001():
    """Check that alignment only inserts gaps and returns one row per input.

    Builds a 100-symbol sequence from ``stats.sample_wr(translate.AAs(), 100)``
    plus nine variants produced by applying ``mut(x, 0.2, 0.1)`` to each
    symbol, then aligns all ten with ``muscle.alignSequences``.

    Returns:
        True if the aligner returned as many rows as inputs and every aligned
        row equals its input once ``"-"`` characters are removed from both;
        False otherwise.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    if len(als) != len(seqs):
        return False
    # Previously the function ended with "return True", discarding the
    # accumulated result, so the check could never report a failure.
    return all(
        als[i].replace("-", '') == original.replace("-", '')
        for (i, original) in enumerate(seqs)
    )
def test001():
    """Check that alignment only inserts gaps and returns one row per input.

    Builds a 100-symbol sequence from ``stats.sample_wr(translate.AAs(), 100)``
    plus nine variants produced by applying ``mut(x, 0.2, 0.1)`` to each
    symbol, then aligns all ten with ``muscle.alignSequences``.

    Returns:
        True if the aligner returned as many rows as inputs and every aligned
        row equals its input once ``"-"`` characters are removed from both;
        False otherwise.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    if len(als) != len(seqs):
        return False
    # Previously the function ended with "return True", discarding the
    # accumulated result, so the check could never report a failure.
    return all(
        als[i].replace("-", '') == original.replace("-", '')
        for (i, original) in enumerate(seqs)
    )
def test_remove_gaps(self):
    """Check that alignment only inserts gaps and returns one row per input.

    Builds a 100-symbol sequence from ``stats.sample_wr(translate.AAs(), 100)``
    plus nine variants produced by applying ``mut(x, 0.2, 0.1)`` to each
    symbol, aligns all ten with ``muscle.alignSequences``, and asserts that:

    * the number of aligned rows equals the number of input sequences, and
    * each aligned row equals its input once ``"-"`` characters are removed
      from both.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    als = muscle.alignSequences(seqs)
    # The row-count comparison used to be stored in an unused local and never
    # asserted, so a short result could pass silently.
    self.assertEqual(len(als), len(seqs))
    for (i, original) in enumerate(seqs):
        # Strip gaps from both sides: mutated inputs may contain "-" too.
        self.assertEqual(als[i].replace("-", ''), original.replace("-", ''))
def test002():
    """Check that aligning with the hard-coded ``exepath`` raises MuscleError.

    Builds the same ten-sequence input as the other tests (one sampled
    sequence plus nine ``mut``-derived variants) and calls
    ``muscle.alignSequences`` with ``exepath`` set to the expanded path
    ``~/develop/muscle3.8.13/muscle``.
    NOTE(review): presumably that path is expected not to be a usable
    executable -- confirm against the muscle module.

    Returns:
        True if ``muscle.MuscleError`` was raised, False otherwise.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 100))
    others = [''.join([mut(x, 0.2, 0.1) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    exepath = os.path.expanduser("~/develop/muscle3.8.13/muscle")
    try:
        muscle.alignSequences(seqs, exepath=exepath)
    except muscle.MuscleError:
        # "except X, name" is Python 2-only syntax; the plain form works on
        # both Python 2.6+ and Python 3.
        return True
    # The outcome used to be computed and then dropped; return it so callers
    # can see whether the expected error occurred.
    return False
def test_gapped_index(self):
    """Check that aligned rows reduce to their inputs when gaps are stripped.

    Builds a 50-symbol sequence from ``stats.sample_wr(translate.AAs(), 50)``
    plus nine variants produced by ``mut(x, 0.2, 0.0)``, aligns them with
    ``muscle.alignSequences``, and asserts that one row per input is returned
    and that removing ``"-"`` from each row gives back the input exactly.
    A ``muscle.MuscleError`` during alignment fails the test.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 50))
    # No gaps: pgap = 0.0
    others = [''.join([mut(x, 0.2, 0.0) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    # Keep the try body to the single call that can raise MuscleError, and
    # report the error text instead of a bare assertTrue(False).
    try:
        als = muscle.alignSequences(seqs)
    except muscle.MuscleError as me:
        self.fail("muscle.alignSequences raised MuscleError: {}".format(me))
    self.assertEqual(len(als), len(seqs))
    for (i, s) in enumerate(seqs):
        self.assertEqual(s, als[i].replace("-", ''))
def test_gapped_index(self):
    """Check that aligned rows reduce to their inputs when gaps are stripped.

    Builds a 50-symbol sequence from ``stats.sample_wr(translate.AAs(), 50)``
    plus nine variants produced by ``mut(x, 0.2, 0.0)``, aligns them with
    ``muscle.alignSequences``, and asserts that one row per input is returned
    and that removing ``"-"`` from each row gives back the input exactly.
    A ``muscle.MuscleError`` during alignment fails the test.
    """
    s1 = ''.join(stats.sample_wr(translate.AAs(), 50))
    # No gaps: pgap = 0.0
    others = [''.join([mut(x, 0.2, 0.0) for x in s1]) for i in range(9)]
    seqs = [s1] + others
    # Keep the try body to the single call that can raise MuscleError, and
    # report the error text instead of a bare assertTrue(False).
    try:
        als = muscle.alignSequences(seqs)
    except muscle.MuscleError as me:
        self.fail("muscle.alignSequences raised MuscleError: {}".format(me))
    self.assertEqual(len(als), len(seqs))
    for (i, s) in enumerate(seqs):
        self.assertEqual(s, als[i].replace("-", ''))
def makeAlignments(ortho_dict, cdna_dicts, filter_fxn=default_filter_fxn, filter_data=None, alignment_print_fxn=default_alignment_print_fxn):
    """Translate each ortholog group and align the groups accepted by the filter.

    Args:
        ortho_dict: mapping from a key (``orf``) to an iterable of
            ``(spec, sorf)`` pairs.
        cdna_dicts: mapping indexed first by ``spec`` and then by ``sorf`` to
            obtain the sequence handed to ``translate.translate``.
        filter_fxn: called as ``filter_fxn(orf, seqs, filter_data)`` where
            ``seqs`` maps ``spec`` to ``(sorf, translated_sequence)``; the
            group is aligned only if the result is truthy.
        filter_data: opaque value forwarded to ``filter_fxn``.
        alignment_print_fxn: called after each successful alignment as
            ``alignment_print_fxn(count, prots, aligned, hdrs, orf)``.

    Returns:
        dict mapping ``orf`` to ``(len(aligned), hdrs, aligned)`` for every
        group that was aligned without a ``muscle.MuscleError``.
    """
    results = {}
    aligned_count = 0
    for orf, ortholog_pairs in ortho_dict.items():
        translated = {}
        for (spec, sorf) in ortholog_pairs:
            # A KeyError from either lookup (or from the translation call) is
            # reported and the pair is skipped.
            try:
                prot = translate.translate(cdna_dicts[spec][sorf])
                if not prot:
                    print("# protein", sorf, "did not translate")
                else:
                    translated[spec] = (sorf, prot)
            except KeyError as ke:
                print("#", ke, spec, sorf, orf)
        # Take the key collection before the filter runs, as the original did.
        species = translated.keys()
        if not filter_fxn(orf, translated, filter_data):
            continue
        prots = [translated[name][1] for name in species]
        try:
            protal = muscle.alignSequences(prots, 16)
            hdrs = [(name, translated[name][0]) for name in species]
            results[orf] = (len(protal), hdrs, protal)
            aligned_count += 1
            alignment_print_fxn(aligned_count, prots, protal, hdrs, orf)
        except muscle.MuscleError as me:
            # Alignment failures are reported and the group is left out.
            print("#", me)
    return results
def secondField(h):
    """Return ``biofile.secondField(h)``, or ``biofile.firstField(h)`` on failure.

    Args:
        h: header value passed unchanged to the ``biofile`` helpers.
    """
    try:
        return biofile.secondField(h)
    except Exception:
        # Best-effort fallback when the second field cannot be extracted.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return biofile.firstField(h)


# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
# open() replaces the Python 2-only file() builtin, and the context manager
# closes the handle once reading is done.
# NOTE(review): assumes readFASTA consumes the handle before returning -- confirm.
with open(options.in_fname, 'r') as fasta_in:
    (headers, seqs) = biofile.readFASTA(fasta_in)
if options.translate_sequences:
    seqs = [translate.translate(s) for s in seqs]
if not options.dont_align_sequences:
    aligned_seqs = muscle.alignSequences(seqs)
    seqs = aligned_seqs
# Drop entries whose sequence is None, keeping headers and sequences paired.
zhs = [(h, s) for (h, s) in zip(headers, seqs) if s is not None]
all_keys = [biofile.firstField(h) for (h, s) in zhs]
(headers, seqs) = zip(*zhs)
prot_dict = {biofile.firstField(h): s for (h, s) in zhs}
gene_orf_dict = {secondField(h): biofile.firstField(h) for h in headers}
orf_gene_dict = {v: k for (k, v) in gene_orf_dict.items()}

# Write output
n_written = 0
data_outs.write("header\n")
for orf in query_keys:
    n_written += 1

# Write out stopping time
if '_' in seq: store_seq = False if store_seq: new_header = '{sname} {hdr}'.format(sname=species_name, hdr=hdr) #named_headers[species_name] = new_header #named_seqs[species_name] = seq new_headers.append(new_header) new_seqs.append(seq) #(headers, seqs) = zip(*[(named_headers[x],named_seqs[x]) for x in sorted(named_seqs.keys())]) headers = new_headers seqs = new_seqs if options.align: #print os.path.expanduser('/cygdrive/f/develop/muscle3.8.31/muscle') aligned_seqs = muscle.alignSequences(seqs) #, exepath='~\\develop\\muscle3.8.31\\muscle') seqs = aligned_seqs # Write output n_written = 0 for (hdr, seq) in zip(headers,seqs): line = ">{hdr}\n{seq}\n".format(hdr=hdr, seq=seq) data_outs.write(line) n_written += 1 # Write out stopping time data_outs.write("# Run finished {}\n".format(util.timestamp()))
f = None try: f = biofile.secondField(h) except: f = biofile.firstField(h) return f # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format(options.in_fname)) (headers, seqs) = biofile.readFASTA(file(options.in_fname, "r")) # , key_fxn=biofile.secondField) if options.translate_sequences: seqs = [translate.translate(s) for s in seqs] if not options.dont_align_sequences: aligned_seqs = muscle.alignSequences(seqs) seqs = aligned_seqs zhs = [(h, s) for (h, s) in zip(headers, seqs) if not s is None] all_keys = [biofile.firstField(h) for (h, s) in zhs] (headers, seqs) = zip(*zhs) prot_dict = dict([(biofile.firstField(h), s) for (h, s) in zhs]) gene_orf_dict = dict([(secondField(h), biofile.firstField(h)) for h in headers]) orf_gene_dict = dict([(v, k) for (k, v) in gene_orf_dict.items()]) # Write output n_written = 0 data_outs.write("header\n") for orf in query_keys: n_written += 1 # Write out stopping time