def extract_breakpoint_sequence(tx_name_5p, tx_end_5p,
                                tx_name_3p, tx_start_3p,
                                ref_fa, max_read_length,
                                homology_mismatches):
    """Fetch the sequence on both sides of a chimeric breakpoint.

    Up to ``max_read_length - 1`` bases are fetched from ``ref_fa`` on
    each side of the junction: the bases ending at ``tx_end_5p`` on the
    5' partner and the bases starting at ``tx_start_3p`` on the 3'
    partner.  A flank that comes back shorter than that is padded with
    "N" on its outer edge (left for 5', right for 3') and a warning is
    logged.

    Args:
        tx_name_5p: reference name of the 5' partner passed to
            ``ref_fa.fetch``.
        tx_end_5p: end coordinate of the 5' partner at the breakpoint.
        tx_name_3p: reference name of the 3' partner passed to
            ``ref_fa.fetch``.
        tx_start_3p: start coordinate of the 3' partner at the breakpoint.
        ref_fa: object exposing ``fetch(name, start, end)`` that returns a
            sequence string.  NOTE(review): coordinates are presumably
            0-based half-open (pysam style) -- confirm against the caller.
        max_read_length: longest read length; each flank is one base
            shorter than this.
        homology_mismatches: forwarded unchanged to ``calc_homology``.

    Returns:
        Tuple ``(seq5p, seq3p, homology_left, homology_right)`` where the
        two homology values are whatever ``calc_homology`` returns for
        the left and right comparisons.
    """
    flank_len = max_read_length - 1
    tx_start_5p = max(0, tx_end_5p - flank_len)
    tx_end_3p = tx_start_3p + flank_len
    # fetch the two halves of the chimeric junction
    seq5p = ref_fa.fetch(tx_name_5p, tx_start_5p, tx_end_5p).upper()
    seq3p = ref_fa.fetch(tx_name_3p, tx_start_3p, tx_end_3p).upper()
    # pad on the outer edge when the reference ran out of sequence;
    # arguments are passed lazily so formatting only happens if the
    # warning is actually emitted
    if len(seq5p) < flank_len:
        logging.warning("Could not extract sequence of length >%d from "
                        "5' partner at %s:%d-%d, only retrieved "
                        "sequence of length %d",
                        flank_len, tx_name_5p, tx_start_5p,
                        tx_end_5p, len(seq5p))
        seq5p = ("N" * (flank_len - len(seq5p))) + seq5p
    if len(seq3p) < flank_len:
        logging.warning("Could not extract sequence of length >%d from "
                        "3' partner at %s:%d-%d, only retrieved "
                        "sequence of length %d",
                        flank_len, tx_name_3p, tx_start_3p,
                        tx_end_3p, len(seq3p))
        seq3p = seq3p + ("N" * (flank_len - len(seq3p)))
    # if 5' partner continues along its normal transcript
    # without fusing, get the sequence that would result
    homolog_end_5p = tx_end_5p + flank_len
    homolog_seq_5p = ref_fa.fetch(tx_name_5p, tx_end_5p,
                                  homolog_end_5p).upper()
    # if 3' partner were to continue in the 5' direction,
    # grab the sequence that would be produced
    homolog_start_3p = max(0, tx_start_3p - flank_len)
    homolog_seq_3p = ref_fa.fetch(tx_name_3p, homolog_start_3p,
                                  tx_start_3p).upper()
    # compare downstream 5' sequence with the 3' partner sequence
    # used in the chimera
    homology_right = calc_homology(homolog_seq_5p, seq3p,
                                   homology_mismatches)
    # compare upstream 3' sequence with the 5' partner sequence used
    # in the chimera; both are reversed so the comparison starts at
    # the breakpoint and moves away from it
    homology_left = calc_homology(homolog_seq_3p[::-1], seq5p[::-1],
                                  homology_mismatches)
    return seq5p, seq3p, homology_left, homology_right
Example #2
0
def extract_breakpoint_sequence(tx_id_5p, tx_end_5p, tx_id_3p, tx_start_3p,
                                ref_fa, max_read_length, homology_mismatches):
    """Return the breakpoint-flanking sequences and their homology values.

    The 5' flank is the run of up to ``max_read_length - 1`` bases that
    ends at ``tx_end_5p``; the 3' flank is the run of the same length
    that begins at ``tx_start_3p``.  Flanks shorter than that are
    padded with "N" (5' on the left, 3' on the right) and a warning is
    logged.

    Args:
        tx_id_5p: reference id of the 5' partner, passed to
            ``ref_fa.fetch``.
        tx_end_5p: breakpoint coordinate on the 5' partner.
        tx_id_3p: reference id of the 3' partner, passed to
            ``ref_fa.fetch``.
        tx_start_3p: breakpoint coordinate on the 3' partner.
        ref_fa: sequence source with a ``fetch(id, start, end)`` method
            returning a string.  NOTE(review): interval convention is
            presumably 0-based half-open -- confirm.
        max_read_length: maximum read length; flanks are one base shorter.
        homology_mismatches: passed through to ``calc_homology``.

    Returns:
        ``(seq5p, seq3p, homology_left, homology_right)``.
    """
    span = max_read_length - 1
    tx_start_5p = max(0, tx_end_5p - span)
    tx_end_3p = tx_start_3p + span
    # fetch sequence on each side of the junction
    seq5p = ref_fa.fetch(tx_id_5p, tx_start_5p, tx_end_5p).upper()
    seq3p = ref_fa.fetch(tx_id_3p, tx_start_3p, tx_end_3p).upper()
    # pad sequence if too short; logging arguments are supplied lazily
    # rather than pre-formatted with "%"
    missing_5p = span - len(seq5p)
    if missing_5p > 0:
        logging.warning("Could not extract sequence of length >%d from "
                        "5' partner at %s:%d-%d, only retrieved "
                        "sequence of length %d",
                        span, tx_id_5p, tx_start_5p, tx_end_5p, len(seq5p))
        seq5p = ("N" * missing_5p) + seq5p
    missing_3p = span - len(seq3p)
    if missing_3p > 0:
        logging.warning("Could not extract sequence of length >%d from "
                        "3' partner at %s:%d-%d, only retrieved "
                        "sequence of length %d",
                        span, tx_id_3p, tx_start_3p, tx_end_3p, len(seq3p))
        seq3p = seq3p + ("N" * missing_3p)
    # if 5' partner continues along its normal transcript
    # without fusing, get the sequence that would result
    homolog_seq_5p = ref_fa.fetch(tx_id_5p, tx_end_5p,
                                  tx_end_5p + span).upper()
    # if 3' partner were to continue in the 5' direction,
    # grab the sequence that would be produced
    homolog_start_3p = max(0, tx_start_3p - span)
    homolog_seq_3p = ref_fa.fetch(tx_id_3p, homolog_start_3p,
                                  tx_start_3p).upper()
    # downstream 5' sequence versus the 3' partner sequence in the chimera
    homology_right = calc_homology(homolog_seq_5p, seq3p, homology_mismatches)
    # upstream 3' sequence versus the 5' partner sequence in the chimera;
    # reversed so both strings are read outward from the breakpoint
    homology_left = calc_homology(homolog_seq_3p[::-1], seq5p[::-1],
                                  homology_mismatches)
    return seq5p, seq3p, homology_left, homology_right
Example #3
0
 def testHomology(self):
     """Check calc_homology on identical and partially matching strings."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12
     a = "AAAAGGGGTTTTCCCC"
     # identical sequences: the full length of 16 is expected
     b = "AAAAGGGGTTTTCCCC"
     self.assertEqual(calc_homology(a, b, 0), 16)
     # only the final base differs
     b = "AAAAGGGGTTTTCCCG"
     self.assertEqual(calc_homology(a, b, 0), 15)
     # positions 3, 4 and 5 differ; the expected value grows by one for
     # each extra unit of the third argument until all three are covered
     b = "AAATTTGGTTTTCCCC"
     self.assertEqual(calc_homology(a, b, 0), 3)
     self.assertEqual(calc_homology(a, b, 1), 4)
     self.assertEqual(calc_homology(a, b, 2), 5)
     self.assertEqual(calc_homology(a, b, 3), 16)
Example #4
0
 def testHomology(self):
     """Exercise calc_homology with 0 to 3 as its third argument."""
     # uses assertEqual: the assertEquals spelling is a deprecated alias
     # that no longer exists in Python 3.12+
     a = "AAAAGGGGTTTTCCCC"
     # same string on both sides -> 16
     b = "AAAAGGGGTTTTCCCC"
     self.assertEqual(calc_homology(a, b, 0), 16)
     # last character changed -> 15
     b = "AAAAGGGGTTTTCCCG"
     self.assertEqual(calc_homology(a, b, 0), 15)
     # three consecutive differences starting at index 3
     b = "AAATTTGGTTTTCCCC"
     self.assertEqual(calc_homology(a, b, 0), 3)
     self.assertEqual(calc_homology(a, b, 1), 4)
     self.assertEqual(calc_homology(a, b, 2), 5)
     # with 3 as the third argument the whole 16-base string is expected
     self.assertEqual(calc_homology(a, b, 3), 16)