Ejemplo n.º 1
0
 def test_sanitize_all_hyphen(self):
     """Sanitizing a string made up only of hyphens gives an empty string."""
     self.assertEqual(strings.sanitize("---"), "")
Ejemplo n.º 2
0
# Load sequence data into a data structure for internal use.
# The empty dict is handed to build_seqdict and the call's return value is
# ignored, so the helper is presumably expected to fill it in place --
# TODO confirm against files.build_seqdict.
seqdict = {}
files.build_seqdict(args.infile,seqdict)

# args.RNA and args.genomic are used below as regular-expression patterns
# (re.search) that are matched against each key of seqdict.
# NOTE(review): regex metacharacters in these values are interpreted, not
# matched literally.
rna_string = str(args.RNA)
gen_string = str(args.genomic)
# Sequences must be in upper-case
# The RNA pattern is tested first, so a key matching both patterns is treated
# as RNA. If several keys match a pattern, the last one iterated wins.
# NOTE(review): if no key matches a pattern, rna_seq / gen_seq is never
# assigned in this excerpt and the sanitize calls below would raise NameError
# (unless it is set somewhere outside this view).
for k in seqdict.keys():
    if re.search(rna_string,k):
        rna_seq = seqdict.get(k).upper()
    elif re.search(gen_string,k):
        gen_seq = seqdict.get(k).upper()

# We directly compare aligned sequences, but class implementation uses
# unaligned sequences (i.e. no gap characters '-')
san_rna_seq = strings.sanitize(rna_seq)
san_gen_seq = strings.sanitize(gen_seq)
# NOTE(review): `name` is not assigned anywhere in this excerpt; presumably it
# is defined earlier in the script -- verify.
seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name)

# Find beginning and end of aligned region
i = 0
j = 0
try:
    # Compare genomic and RNA sequences to find local regions of good
    # similarity, this is taken as the start and end of aligned region
    while not sequence.compare_nuc_seqs(gen_seq[i], rna_seq[i]):
        # If we find residues in either sequence, we need to increment
        # certain class values accordingly
        if gen_seq[i] != '-':
            seq_pair.incr_all()
        if rna_seq[i] != '-':
Ejemplo n.º 3
0
 def test_sanitize(self):
     """Sanitizing a hyphen-separated string removes every hyphen."""
     self.assertEqual(
         strings.sanitize("This-strings-is-really-dirty"),
         "Thisstringsisreallydirty",
     )
# Load sequence data into a data structure for internal use.
# The empty dict is handed to build_seqdict and the call's return value is
# ignored, so the helper is presumably expected to fill it in place --
# TODO confirm against files.build_seqdict.
seqdict = {}
files.build_seqdict(args.infile,seqdict)

# args.RNA and args.genomic are used below as regular-expression patterns
# (re.search) that are matched against each key of seqdict.
# NOTE(review): regex metacharacters in these values are interpreted, not
# matched literally.
rna_string = str(args.RNA)
gen_string = str(args.genomic)
# Sequences must be in upper-case
# The RNA pattern is tested first, so a key matching both patterns is treated
# as RNA. If several keys match a pattern, the last one iterated wins.
# NOTE(review): if no key matches a pattern, rna_seq / gen_seq is never
# assigned in this excerpt and the sanitize calls below would raise NameError
# (unless it is set somewhere outside this view).
for k in seqdict.keys():
    if re.search(rna_string,k):
        rna_seq = seqdict.get(k).upper()
    elif re.search(gen_string,k):
        gen_seq = seqdict.get(k).upper()

# We directly compare aligned sequences, but class implementation uses
# unaligned sequences (i.e. no gap characters '-')
san_rna_seq = strings.sanitize(rna_seq)
san_gen_seq = strings.sanitize(gen_seq)
# NOTE(review): `name` is not assigned anywhere in this excerpt; presumably it
# is defined earlier in the script -- verify.
seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name)

# Find beginning and end of aligned region
i = 0
j = 0
try:
    # Compare genomic and RNA sequences to find local regions of good
    # similarity, this is taken as the start and end of aligned region
    while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)),
            (strings.gulp(gen_seq, i, size)), num_equal):
        # If we find residues in either sequence, we need to increment
        # certain class values accordingly
        if gen_seq[i] != '-':
            seq_pair.incr_all()
Ejemplo n.º 5
0
 def test_sanitize_all_hyphen(self):
     """Check that an input of nothing but hyphens sanitizes to ''."""
     expected = ""
     actual = strings.sanitize("---")
     self.assertEqual(actual, expected)
Ejemplo n.º 6
0
 def test_sanitize(self):
     """Check that hyphens are removed while all other characters are kept."""
     expected = "Thisstringsisreallydirty"
     actual = strings.sanitize("This-strings-is-really-dirty")
     self.assertEqual(actual, expected)
Ejemplo n.º 7
0
    gen_string = str(args.genomic)
    # Sequences must be in upper case
    for k in seqdict.keys():
        if re.search(rna_string,k):
            # Since we are writing these data back out again
            # we want to keep track of sequence headers
            rna_header = k
            rna_seq = seqdict.get(k).upper()
        elif re.search(gen_string,k):
            gen_header = k
            gen_seq = seqdict.get(k).upper()
        else:
            ref_header = k
            ref_seq = seqdict.get(k).upper()

    san_gen_seq = strings.sanitize(gen_seq)
    san_ref_seq = strings.sanitize(ref_seq)
    # Steal the RefPair class, but we do not care about the name for
    # writing to output, use "name" as a placeholder
    ref_pair = classes.RefPair(san_ref_seq,san_gen_seq,"name")

    gen_start = 'NA' # Can't use False, as zero index also evaluates
    ref_start = 'NA'
    gen_list = []
    ref_list = []
    for i, (rg,rf) in enumerate(zip(gen_seq,ref_seq)):
        # A gap in both genomic and reference sequences is unlikely,
        # but we should account for it just in case
        if rf == '-' and rg == '-':
            #print "gap in both. Passing"
            pass
Ejemplo n.º 8
0
    gen_string = str(args.genomic)
    # Sequences must be in upper case
    for k in seqdict.keys():
        if re.search(rna_string,k):
            # Since we are writing these data back out again
            # we want to keep track of sequence headers
            rna_header = k
            rna_seq = seqdict.get(k).upper()
        elif re.search(gen_string,k):
            gen_header = k
            gen_seq = seqdict.get(k).upper()
        else:
            ref_header = k
            ref_seq = seqdict.get(k).upper()

    san_gen_seq = strings.sanitize(gen_seq)
    san_ref_seq = strings.sanitize(ref_seq)
    # Steal the RefPair class, but we do not care about the name for
    # writing to output, use "name" as a placeholder
    ref_pair = classes.RefPair(san_ref_seq,san_gen_seq,"name")

    gen_start = 'NA' # Can't use False, as zero index also evaluates
    ref_start = 'NA'
    gen_list = []
    ref_list = []
    for i, (rg,rf) in enumerate(zip(gen_seq,ref_seq)):
        #print "position: %d" % (i+1)
        #print "reference nucleotide: %s" % (rf)
        #print "reference codon position: %d" % (ref_pair.index_rposition())
        #print "genomic nucleotide is: %s" % (rg)
        #print "genomic codon position: %d" % (ref_pair.index_gposition())