if re.search(rna_string,k):
        rna_seq = seqdict.get(k).upper()
    elif re.search(gen_string,k):
        gen_seq = seqdict.get(k).upper()
    else:
        ref_seq = seqdict.get(k).upper()

# Locate the start and end of the aligned region: i counts alignment
# columns skipped at the front, j counts columns skipped at the back
i = 0
j = 0
# Number of non-gap genomic positions passed before the aligned region
gen_index = 0
try:
    # Slide a window of `size` columns forward until sequence.compare_seqs
    # accepts the RNA/genomic windows (given num_equal); that offset is
    # taken as the start of the aligned region
    while not sequence.compare_seqs(strings.gulp(rna_seq, i, size),
                                    strings.gulp(gen_seq, i, size),
                                    num_equal):
        if gen_seq[i] != '-':
            # A real residue (not a gap) in the genomic sequence, so the
            # genomic index has to move ahead as well
            gen_index += 1
        i += 1
    # Repeat the scan from the other end. The reversed copies do not change
    # while scanning, so build them once instead of on every iteration
    rev_rna_seq = rna_seq[::-1]
    rev_gen_seq = gen_seq[::-1]
    while not sequence.compare_seqs(strings.gulp(rev_rna_seq, j, size),
                                    strings.gulp(rev_gen_seq, j, size),
                                    num_equal):
        j += 1
# If we get an index error then we cannot find start and end of both sequences
except IndexError:
    # Parenthesised single-argument form: prints the same text under
    # Python 2 and is also valid Python 3 syntax
    print("Could not discern aligned part of sequences")
    # Exit cleanly
    sys.exit(0)
Example #2
0
 def test_gulp_long_string(self):
     # gulp(text, 0, 7) on a longer string should yield just the
     # leading "This is"
     source_text = "This is a very long string"
     self.assertEqual(strings.gulp(source_text, 0, 7), "This is")
# Pattern used to recognise the genomic entry among the sequence names
gen_string = str(args.genomic)
# Walk the sequence names; a name matching the RNA pattern supplies the RNA
# sequence, otherwise one matching the genomic pattern supplies the genomic
# sequence. Both are stored upper-cased.
for k in seqdict:
    if re.search(rna_string, k):
        rna_seq = seqdict[k].upper()
    elif re.search(gen_string, k):
        gen_seq = seqdict[k].upper()

# Locate the start and end of the aligned region: i counts alignment
# columns skipped at the front, j counts columns skipped at the back
i = 0
j = 0
try:
    # Slide a window of `size` columns forward until sequence.compare_seqs
    # accepts the RNA/genomic windows (given num_equal); that offset is
    # taken as the start of the aligned region
    while not sequence.compare_seqs(
            strings.gulp(rna_seq, i, size),
            strings.gulp(gen_seq, i, size), num_equal):
        i += 1
    # Repeat the scan from the other end. The reversed copies do not change
    # while scanning, so build them once instead of on every iteration
    rev_rna_seq = rna_seq[::-1]
    rev_gen_seq = gen_seq[::-1]
    while not sequence.compare_seqs(
            strings.gulp(rev_rna_seq, j, size),
            strings.gulp(rev_gen_seq, j, size), num_equal):
        j += 1
# If we get an index error then we cannot find start and end of both sequences
except IndexError:
    # Parenthesised single-argument form: prints the same text under
    # Python 2 and is also valid Python 3 syntax
    print("Could not discern aligned part of sequences")
    # Exit cleanly
    sys.exit(0)

# Once we know the start and end, simply chop off everything else
Example #4
0
 def test_gulp_index_largeer_than_string(self):
     # A third argument (20) larger than the string's length should hand
     # back the whole string unchanged
     short_text = "Some string"
     self.assertEqual(strings.gulp(short_text, 0, 20), "Some string")
Example #5
0
 def test_gulp_nothingggg(self):
     # An empty string with both numeric arguments at zero should come
     # back as an empty string
     self.assertEqual(strings.gulp("", 0, 0), "")
Example #6
0
 def test_gulp_long_string_0_length(self):
     # gulp(text, 0, 0) on a non-empty string should produce an empty string
     source_text = "This is a very long string"
     self.assertEqual(strings.gulp(source_text, 0, 0), "")
Example #7
0
 def test_gulp_long_string_reverse_indices(self):
     # With the second argument (7) greater than the third (0), gulp is
     # expected to return an empty string
     source_text = "This is a very long string"
     self.assertEqual(strings.gulp(source_text, 7, 0), "")
    elif re.search(gen_string,k):
        gen_seq = seqdict.get(k).upper()

# We directly compare aligned sequences, but class implementation uses
# unaligned sequences (i.e. no gap characters '-')
# The sanitized copies feed the SeqPair; the scan below still works on the
# original aligned rna_seq / gen_seq
san_rna_seq = strings.sanitize(rna_seq)
san_gen_seq = strings.sanitize(gen_seq)
seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name)

# Find beginning and end of aligned region
# i counts alignment columns skipped at the front, j counts columns skipped
# at the back
i = 0
j = 0
try:
    # Compare genomic and RNA sequences to find local regions of good
    # similarity, this is taken as the start and end of aligned region
    # (windows of `size` columns are taken with strings.gulp and judged by
    # sequence.compare_seqs against num_equal)
    while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)),
            (strings.gulp(gen_seq, i, size)), num_equal):
        # If we find residues in either sequence, we need to increment
        # certain class values accordingly
        # A non-gap genomic column triggers incr_all(), a non-gap RNA column
        # triggers incr_mrna(); both may fire for the same column
        if gen_seq[i] != '-':
            seq_pair.incr_all()
        if rna_seq[i] != '-':
            seq_pair.incr_mrna()
        i += 1
    # Same scan over the reversed sequences to find the end of the region;
    # no SeqPair counters are touched here
    while not sequence.compare_seqs((strings.gulp(rna_seq[::-1], j, size)),
            (strings.gulp(gen_seq[::-1], j, size)), num_equal):
        j += 1
# If we get an index error then we cannot find start and end of both sequences
except(IndexError):
    print "Could not discern aligned part of sequences for gene " + str(name)
    # Exit cleanly
    # NOTE(review): no exit call is visible after this comment in this
    # excerpt -- confirm against the full script
Example #9
0
        mnuc = seq_pair.lookup_mnuc()
        gcod = seq_pair.lookup_gcodon()
        mcod = seq_pair.lookup_mcodon()
        gaa = seq_pair.lookup_gaa()
        maa = seq_pair.lookup_maa()
        scr = (matrices.Blosum62(gaa, maa).sub_score())
        non_syn = sequence.check_nonsynonymous_edit(cpos, gcod, mnuc)

        # We can identify whether the residue is present in a region of local
        # 'T' concentration, i.e. polyT
        if args.polyt:
            # Test whether the base is in a region of 4 or more sequential 'T's
            is_polyt = "N"
            # Only look at first seven bases at the start
            if i <= 4:
                polyt_test_seq = strings.gulp(new_gen_seq, 0, 7)
            # Only look at last seven bases at the end
            elif i >= len(new_gen_seq) - 4:
                polyt_test_seq = strings.gulp(new_gen_seq,
                        len(new_gen_seq)-7, 7)
            # In the middle take 3 bases on either side (seven total)
            else:
                polyt_test_seq = strings.gulp(new_gen_seq, i-3, 7)
            # Determine whether the region fits the definition of "polyT"
            if sequence.polyT(polyt_test_seq):
                is_polyt = "Y"

            # Test whether the base is present in region of X % 'T'
            is_polyt_percent = "N"
            percent_polyt_seqs = []
            for y in range(10): # i.e. for 10 base window
Example #10
0
 def test_gulp_long_string(self):
     # Arguments (0, 7) on a longer string should give back only the
     # leading "This is"
     sample = "This is a very long string"
     self.assertEqual(strings.gulp(sample, 0, 7), "This is")
Example #11
0
# If we want to calculate protein sequence similarity over the same
# stretch, we have to accept some inherent complexity
if args.protein:
    # Note we are calculating similarity, not identity
    # This is largely because of the difference between comparing
    # four nucleotides versus 20 amino acids
    gen_similarity_list = []
    for i, (rg,rr) in enumerate(zip(new_gen_seq, new_ref_seq)):
        similarity_sum = 0.0

        # It is vital to know the current codon position
        rpos = ref_pair.index_rposition()
        gpos = ref_pair.index_gposition()
        # Get the reference sequence for the window
        rnuc_seq = strings.gulp(new_ref_seq, i, int(window_size))
        # Using the right RF, translate the sequence
        raa_seq = sequence.translate(rnuc_seq,rpos)
        # Repeat this for the genomic sequence
        gnuc_seq = strings.gulp(new_gen_seq, i, int(window_size))
        gaa_seq = sequence.translate(gnuc_seq,gpos)

        if (len(rnuc_seq) == int(window_size) and
                len(gnuc_seq) == int(window_size)): # Sanity check!
            # If the lengths aren't equal, align them
            if len(raa_seq) != len(gaa_seq):
                raa_seq,gaa_seq = sequence_alignment.affine_align(raa_seq,gaa_seq)
            # Whether we align or not, continue on...
            # Determine how similar raa_seq and gaa_seq are
            for raa,gaa in zip(raa_seq,gaa_seq):
                # Gaps are neutral
Example #12
0
 def test_gulp_nothingggg(self):
     # Empty input and zero for both numeric arguments: the result should
     # be an empty string
     self.assertEqual(strings.gulp("", 0, 0), "")
Example #13
0
 def test_gulp_index_largeer_than_string(self):
     # Passing 20 as the third argument for an 11-character string should
     # return the full string
     sample = "Some string"
     self.assertEqual(strings.gulp(sample, 0, 20), "Some string")
Example #14
0
 def test_gulp_long_string_reverse_indices(self):
     # Second argument 7 with third argument 0 is expected to give an
     # empty string
     sample = "This is a very long string"
     self.assertEqual(strings.gulp(sample, 7, 0), "")
Example #15
0
 def test_gulp_long_string_0_length(self):
     # Arguments (0, 0) on a non-empty string should give an empty string
     sample = "This is a very long string"
     self.assertEqual(strings.gulp(sample, 0, 0), "")
Example #16
0
        ref_pair2 = classes.RefPair(san_ref_seq,san_gen_seq,name)
# Restricting to synonymous edits requires the amino acids that correspond
# to both the genomic and the RNA sequence (the same holds when comparing
# RNA to reference), so build a SeqPair from the sanitized sequences
if args.synonymous:
    san_gen_seq, san_rna_seq = (strings.sanitize(gen_seq),
                                strings.sanitize(rna_seq))
    seq_pair = classes.SeqPair(san_rna_seq, san_gen_seq, name)

# Find beginning and end of aligned region
i = 0
j = 0
try:
    # Compare genomic and RNA sequences to find local regions of good
    # similarity, this is taken as the start and end of aligned region
    while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)),
            (strings.gulp(gen_seq, i, size)), num_equal):
        if gen_seq[i] != '-':
            # If we are using classes, need to update them as we go
            if args.protein:
                ref_pair.incr_all_gen()
                if args.both:
                    ref_pair2.incr_all_gen()
            if args.synonymous:
                seq_pair.incr_all()
        if ref_seq[i] != '-':
            if args.protein:
                ref_pair.incr_all_ref()
                if args.both:
                    ref_pair2.incr_all_ref()
        if rna_seq[i] != '-':