def test_sanitize_all_hyphen(self):
    """Sanitizing a string that consists only of hyphens yields ""."""
    only_gaps = "---"
    # Every character is a hyphen, so nothing should be left over.
    self.assertEqual(strings.sanitize(only_gaps), "")
# Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() # We directly compare aligned sequences, but class implementation uses # unaligned sequences (i.e. no gap characters '-') san_rna_seq = strings.sanitize(rna_seq) san_gen_seq = strings.sanitize(gen_seq) seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name) # Find beginning and end of aligned region i = 0 j = 0 try: # Compare genomic and RNA sequences to find local regions of good # similarity, this is taken as the start and end of aligned region while not sequence.compare_nuc_seqs(gen_seq[i], rna_seq[i]): # If we find residues in either sequence, we need to increment # certain class values accordingly if gen_seq[i] != '-': seq_pair.incr_all() if rna_seq[i] != '-':
def test_sanitize(self):
    """Sanitizing removes the hyphens and keeps every other character."""
    hyphenated = "This-strings-is-really-dirty"
    expected = "Thisstringsisreallydirty"
    cleaned = strings.sanitize(hyphenated)
    self.assertEqual(cleaned, expected)
# Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() # We directly compare aligned sequences, but class implementation uses # unaligned sequences (i.e. no gap characters '-') san_rna_seq = strings.sanitize(rna_seq) san_gen_seq = strings.sanitize(gen_seq) seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name) # Find beginning and end of aligned region i = 0 j = 0 try: # Compare genomic and RNA sequences to find local regions of good # similarity, this is taken as the start and end of aligned region while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)), (strings.gulp(gen_seq, i, size)), num_equal): # If we find residues in either sequence, we need to increment # certain class values accordingly if gen_seq[i] != '-': seq_pair.incr_all()
gen_string = str(args.genomic) # Sequences must be in upper case for k in seqdict.keys(): if re.search(rna_string,k): # Since we are writing these data back out again # we want to keep track of sequence headers rna_header = k rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_header = k gen_seq = seqdict.get(k).upper() else: ref_header = k ref_seq = seqdict.get(k).upper() san_gen_seq = strings.sanitize(gen_seq) san_ref_seq = strings.sanitize(ref_seq) # Steal the RefPair class, but we do not care about the name for # writing to output, use "name" as a placeholder ref_pair = classes.RefPair(san_ref_seq,san_gen_seq,"name") gen_start = 'NA' # Can't use False, as zero index also evaluates ref_start = 'NA' gen_list = [] ref_list = [] for i, (rg,rf) in enumerate(zip(gen_seq,ref_seq)): # A gap in both genomic and reference sequences is unlikely, # but we should account for it just in case if rf == '-' and rg == '-': #print "gap in both. Passing" pass
gen_string = str(args.genomic) # Sequences must be in upper case for k in seqdict.keys(): if re.search(rna_string,k): # Since we are writing these data back out again # we want to keep track of sequence headers rna_header = k rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_header = k gen_seq = seqdict.get(k).upper() else: ref_header = k ref_seq = seqdict.get(k).upper() san_gen_seq = strings.sanitize(gen_seq) san_ref_seq = strings.sanitize(ref_seq) # Steal the RefPair class, but we do not care about the name for # writing to output, use "name" as a placeholder ref_pair = classes.RefPair(san_ref_seq,san_gen_seq,"name") gen_start = 'NA' # Can't use False, as zero index also evaluates ref_start = 'NA' gen_list = [] ref_list = [] for i, (rg,rf) in enumerate(zip(gen_seq,ref_seq)): #print "position: %d" % (i+1) #print "reference nucleotide: %s" % (rf) #print "reference codon position: %d" % (ref_pair.index_rposition()) #print "genomic nucleotide is: %s" % (rg) #print "genomic codon position: %d" % (ref_pair.index_gposition())