def extract_breakpoint_sequence(tx_name_5p, tx_end_5p,
                                tx_name_3p, tx_start_3p,
                                ref_fa, max_read_length,
                                homology_mismatches):
    """Build the sequences flanking a chimeric breakpoint and score homology.

    Up to ``max_read_length - 1`` bases are fetched from ``ref_fa`` on each
    side of the breakpoint: the bases of ``tx_name_5p`` ending at
    ``tx_end_5p`` and the bases of ``tx_name_3p`` starting at
    ``tx_start_3p``.  A side that comes back shorter than that is reported
    with a warning and filled out with ``N`` (on the left for the 5' side,
    on the right for the 3' side).

    Args:
        tx_name_5p: reference name passed to ``ref_fa.fetch`` for the 5' partner.
        tx_end_5p: end coordinate of the 5' partner at the breakpoint.
        tx_name_3p: reference name passed to ``ref_fa.fetch`` for the 3' partner.
        tx_start_3p: start coordinate of the 3' partner at the breakpoint.
        ref_fa: object exposing ``fetch(name, start, end)`` that returns a
            string (presumably a half-open interval -- confirm against the
            FASTA reader in use).
        max_read_length: read length used to size the flanking windows.
        homology_mismatches: forwarded unchanged to ``calc_homology``.

    Returns:
        Tuple ``(seq5p, seq3p, homology_left, homology_right)``.
    """
    flank = max_read_length - 1

    # Windows on the chimera itself; the 5' start is clamped at zero.
    window_start_5p = max(0, tx_end_5p - flank)
    window_end_3p = tx_start_3p + flank
    seq5p = ref_fa.fetch(tx_name_5p, window_start_5p, tx_end_5p).upper()
    seq3p = ref_fa.fetch(tx_name_3p, tx_start_3p, window_end_3p).upper()

    # A short 5' window is padded on the left so the breakpoint stays at
    # the right-hand edge of the sequence.
    if len(seq5p) < flank:
        message = ("Could not extract sequence of length >%d from "
                   "5' partner at %s:%d-%d, only retrieved "
                   "sequence of length %d")
        logging.warning(message % (flank, tx_name_5p, window_start_5p,
                                   tx_end_5p, len(seq5p)))
        seq5p = seq5p.rjust(flank, "N")

    # A short 3' window is padded on the right so the breakpoint stays at
    # the left-hand edge of the sequence.
    if len(seq3p) < flank:
        message = ("Could not extract sequence of length >%d from "
                   "3' partner at %s:%d-%d, only retrieved "
                   "sequence of length %d")
        logging.warning(message % (flank, tx_name_3p, tx_start_3p,
                                   window_end_3p, len(seq3p)))
        seq3p = seq3p.ljust(flank, "N")

    # What the 5' partner would read like if it ran on past the
    # breakpoint along its own transcript instead of fusing.
    unfused_5p = ref_fa.fetch(tx_name_5p, tx_end_5p,
                              tx_end_5p + flank).upper()

    # What lies immediately upstream of the breakpoint on the 3' partner's
    # own transcript.
    upstream_start_3p = max(0, tx_start_3p - flank)
    unfused_3p = ref_fa.fetch(tx_name_3p, upstream_start_3p,
                              tx_start_3p).upper()

    # Right homology: unfused 5' continuation vs. the chimera's 3' side.
    homology_right = calc_homology(unfused_5p, seq3p, homology_mismatches)

    # Left homology: both strings are reversed so the comparison starts at
    # the breakpoint and walks upstream.
    reversed_unfused_3p = unfused_3p[::-1]
    reversed_seq5p = seq5p[::-1]
    homology_left = calc_homology(reversed_unfused_3p, reversed_seq5p,
                                  homology_mismatches)

    return seq5p, seq3p, homology_left, homology_right
def extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                tx_id_3p, tx_start_3p,
                                ref_fa, max_read_length,
                                homology_mismatches):
    """Return breakpoint-flanking sequences and their homology counts.

    Fetches up to ``max_read_length - 1`` bases of ``tx_id_5p`` ending at
    ``tx_end_5p`` and up to ``max_read_length - 1`` bases of ``tx_id_3p``
    starting at ``tx_start_3p``.  Either sequence that is retrieved shorter
    than that triggers a warning and is padded with ``N`` away from the
    breakpoint (left for 5', right for 3').

    Args:
        tx_id_5p: reference identifier given to ``ref_fa.fetch`` for the 5' partner.
        tx_end_5p: end coordinate of the 5' partner at the breakpoint.
        tx_id_3p: reference identifier given to ``ref_fa.fetch`` for the 3' partner.
        tx_start_3p: start coordinate of the 3' partner at the breakpoint.
        ref_fa: object exposing ``fetch(name, start, end)`` that returns a
            string (presumably a half-open interval -- confirm against the
            FASTA reader in use).
        max_read_length: read length used to size the flanking windows.
        homology_mismatches: forwarded unchanged to ``calc_homology``.

    Returns:
        Tuple ``(seq5p, seq3p, homology_left, homology_right)``.
    """
    span = max_read_length - 1

    # Coordinates of the two windows that make up the chimeric junction.
    begin_5p = max(0, tx_end_5p - span)
    finish_3p = tx_start_3p + span

    seq5p = ref_fa.fetch(tx_id_5p, begin_5p, tx_end_5p).upper()
    seq3p = ref_fa.fetch(tx_id_3p, tx_start_3p, finish_3p).upper()

    if len(seq5p) < span:
        # Too little sequence upstream of the breakpoint: warn, then
        # left-pad so the breakpoint remains the last base.
        warning_5p = ("Could not extract sequence of length >%d from "
                      "5' partner at %s:%d-%d, only retrieved "
                      "sequence of length %d"
                      % (span, tx_id_5p, begin_5p, tx_end_5p, len(seq5p)))
        logging.warning(warning_5p)
        seq5p = seq5p.rjust(span, "N")

    if len(seq3p) < span:
        # Too little sequence downstream of the breakpoint: warn, then
        # right-pad so the breakpoint remains the first base.
        warning_3p = ("Could not extract sequence of length >%d from "
                      "3' partner at %s:%d-%d, only retrieved "
                      "sequence of length %d"
                      % (span, tx_id_3p, tx_start_3p, finish_3p, len(seq3p)))
        logging.warning(warning_3p)
        seq3p = seq3p.ljust(span, "N")

    # Sequence the 5' partner would produce if it continued along its own
    # transcript past the breakpoint.
    continuation_5p = ref_fa.fetch(tx_id_5p, tx_end_5p,
                                   tx_end_5p + span).upper()

    # Sequence directly upstream of the breakpoint on the 3' partner's own
    # transcript (start clamped at zero).
    continuation_3p = ref_fa.fetch(tx_id_3p,
                                   max(0, tx_start_3p - span),
                                   tx_start_3p).upper()

    # Downstream 5' continuation compared with the chimera's 3' sequence.
    homology_right = calc_homology(continuation_5p, seq3p,
                                   homology_mismatches)

    # Upstream 3' continuation compared with the chimera's 5' sequence;
    # both are reversed so position 0 is the base nearest the breakpoint.
    homology_left = calc_homology(continuation_3p[::-1], seq5p[::-1],
                                  homology_mismatches)

    return seq5p, seq3p, homology_left, homology_right
def testHomology(self):
    """Exercise calc_homology with identical, end-mismatched and
    internally-mismatched sequence pairs.

    Uses ``assertEqual`` rather than the ``assertEquals`` alias, which was
    deprecated and later removed from ``unittest.TestCase``.
    """
    a = "AAAAGGGGTTTTCCCC"

    # Identical sequences: all 16 positions are counted.
    b = "AAAAGGGGTTTTCCCC"
    self.assertEqual(calc_homology(a, b, 0), 16)

    # Only the final base differs: counting stops one short of the end.
    b = "AAAAGGGGTTTTCCCG"
    self.assertEqual(calc_homology(a, b, 0), 15)

    # b differs from a at three consecutive positions (indices 3-5); each
    # extra allowed mismatch extends the count by one until all three are
    # tolerated, at which point the whole sequence is counted.
    b = "AAATTTGGTTTTCCCC"
    self.assertEqual(calc_homology(a, b, 0), 3)
    self.assertEqual(calc_homology(a, b, 1), 4)
    self.assertEqual(calc_homology(a, b, 2), 5)
    self.assertEqual(calc_homology(a, b, 3), 16)