def unaligned_error_list(length, error_p): e_dict = {} error_rate = { (0, 0.4): "match", (0.4, 0.7): "mis", (0.7, 0.85): "ins", (0.85, 1): "del" } pos = 0 last_is_ins = False while pos < length: p = random.random() for k_error in error_rate.keys(): if k_error[0] <= p < k_error[1]: error_type = error_rate[k_error] break if error_type == "match": step = 1 elif error_type == "mis": step = mm.pois_geom(error_p["mis"][0], error_p["mis"][2], error_p["mis"][3]) e_dict[pos] = ["mis", step] elif error_type == "ins": step = mm.wei_geom(error_p["ins"][0], error_p["ins"][1], error_p["ins"][2], error_p["ins"][3]) if last_is_ins: e_dict[pos + 0.1][1] += step else: e_dict[pos + 0.1] = ["ins", step] last_is_ins = True else: step = mm.wei_geom(error_p["del"][0], error_p["del"][1], error_p["del"][2], error_p["del"][3]) e_dict[pos] = ["del", step] if error_type != "ins": pos += step last_is_ins = False if pos > length: length = pos return length, e_dict
def unaligned_error_list(length, error_p): e_dict = {} error_rate = {(0, 0.4): "match", (0.4, 0.7): "mis", (0.7, 0.85): "ins", (0.85, 1): "del"} pos = 0 last_is_ins = False while pos < length: p = random.random() for k_error in error_rate.keys(): if k_error[0] <= p < k_error[1]: error_type = error_rate[k_error] break if error_type == "match": step = 1 elif error_type == "mis": step = mm.pois_geom(error_p["mis"][0], error_p["mis"][2], error_p["mis"][3]) e_dict[pos] = ["mis", step] elif error_type == "ins": step = mm.wei_geom(error_p["ins"][0], error_p["ins"][1], error_p["ins"][2], error_p["ins"][3]) if last_is_ins: e_dict[pos + 0.1][1] += step else: e_dict[pos + 0.1] = ["ins", step] last_is_ins = True else: step = mm.wei_geom(error_p["del"][0], error_p["del"][1], error_p["del"][2], error_p["del"][3]) e_dict[pos] = ["del", step] if error_type != "ins": pos += step last_is_ins = False if pos > length: length = pos return length, e_dict
def error_list(m_ref, m_model, m_ht_list, error_p, trans_p):
    """Alternate match and error segments until the reference span is covered.

    The walk starts with one match drawn from ``m_ht_list`` and then repeats
    "pick an error, pick its size, pick the following match length" while
    the position is still inside the reference span.

    Args:
        m_ref: Initial reference length; also the initial read length.
        m_model: Mapping ``(low, high) -> {(p_low, p_high): (v_low, v_high)}``.
            The outer key is chosen so that ``low <= previous match < high``;
            the inner table turns a uniform draw into a match length by
            linear interpolation between ``v_low`` and ``v_high``.
        m_ht_list: Mapping whose first value is an inner table of the same
            shape, used only for the opening match.
        error_p: Mapping from error name to a parameter sequence. "mis" uses
            indices 0, 2, 3 with ``mm.pois_geom``; any other error uses
            indices 0-3 with ``mm.wei_geom``.
        trans_p: Mapping ``state -> {(low, high): error name}``. States are
            "start", an error name, or an error name with "0" appended after
            a zero-length match.

    Returns:
        ``(new_len, ref_end, events)``: the read length after applying the
        errors, the (possibly extended) reference length, and a dict mapping
        positions to ``[error, size]``. Insertions are keyed by
        ``position - 0.5``, every other error by the position itself.
    """
    new_len = m_ref
    ref_end = m_ref
    cursor = 0
    events = {}
    state = "start"

    # Opening match: interpolate inside whichever probability band holds the
    # draw, and never start with fewer than two matching bases.
    draw = random.random()
    bucket = list(m_ht_list.keys())[0]
    for bounds, values in m_ht_list[bucket].items():
        if bounds[0] < draw <= bounds[1]:
            frac = (draw - bounds[0]) / (bounds[1] - bounds[0])
            last_match = max(int(np.floor(frac * (values[1] - values[0]) + values[0])), 2)
    cursor += last_match

    while cursor < ref_end:
        # Next error type, conditioned on the current state.
        draw = random.random()
        for bounds in trans_p[state].keys():
            if bounds[0] <= draw < bounds[1]:
                error = trans_p[state][bounds]
                break

        if error == "ins":
            ins_p = error_p[error]
            span = mm.wei_geom(ins_p[0], ins_p[1], ins_p[2], ins_p[3])
            new_len += span
            # Insertions consume no reference bases; the position stays put.
            events[cursor - 0.5] = [error, span]
        else:
            if error == "mis":
                mis_p = error_p["mis"]
                span = mm.pois_geom(mis_p[0], mis_p[2], mis_p[3])
            else:
                del_p = error_p[error]
                span = mm.wei_geom(del_p[0], del_p[1], del_p[2], del_p[3])
                new_len -= span
            events[cursor] = [error, span]
            cursor += span
            if cursor >= ref_end:
                # The error ran past the end: stretch both lengths to fit.
                new_len += cursor - ref_end
                ref_end = cursor

        state = error

        # Length of the match that follows, conditioned on the previous one.
        for bucket in m_model.keys():
            if bucket[0] <= last_match < bucket[1]:
                break
        draw = random.random()
        for bounds, values in m_model[bucket].items():
            if bounds[0] < draw <= bounds[1]:
                frac = (draw - bounds[0]) / (bounds[1] - bounds[0])
                span = int(np.floor(frac * (values[1] - values[0]) + values[0]))
                break
        # Two zero-length matches in a row are not allowed.
        if last_match == 0 and span == 0:
            span = 1

        last_match = span
        cursor += last_match
        if cursor > ref_end:
            new_len += cursor - ref_end
            ref_end = cursor
        if last_match == 0:
            state += "0"

    return new_len, ref_end, events
def error_list(m_ref, m_model, m_ht_list, error_p, trans_p):
    """Simulate the sequence of errors and matches along a reference span.

    Starting from an initial match drawn from ``m_ht_list``, the function
    repeatedly picks an error type from ``trans_p``, draws its size, then
    draws the length of the following match from ``m_model``, until the
    current position reaches the end of the reference span.

    Args:
        m_ref: Initial reference length; the read length starts at the same
            value.
        m_model: Mapping ``(low, high) -> {(p_low, p_high): (v_low, v_high)}``.
            The outer key is selected so that ``low <= prev_match < high``;
            the inner table converts a uniform random draw into a match
            length by linear interpolation between ``v_low`` and ``v_high``.
        m_ht_list: Mapping whose first value is an inner table of the same
            shape; it is used only for the very first match.
        error_p: Mapping from error name to a parameter sequence. "mis" uses
            indices 0, 2 and 3 with ``mm.pois_geom``; any other error uses
            indices 0-3 with ``mm.wei_geom``.
        trans_p: Mapping ``state -> {(low, high): error name}``. The state is
            "start", the previous error name, or that name with "0" appended
            after a zero-length match.

    Returns:
        ``(l_new, middle_ref, e_dict)``: the read length after the errors,
        the (possibly extended) reference length, and a dict mapping a
        position to ``[error, step]``. Insertions are stored under
        ``pos - 0.5``; all other errors under ``pos``.

    Raises:
        IndexError: If ``m_ht_list`` is empty.
    """
    # l_new tracks the read length after introducing errors
    l_new = m_ref
    pos = 0
    e_dict = {}
    middle_ref = m_ref
    prev_error = "start"

    # The first match comes from m_ht_list.
    p = random.random()
    # dict.keys() is a non-subscriptable view on Python 3, so materialise the
    # keys before indexing (this also works on Python 2).
    k1 = list(m_ht_list)[0]
    for k2, v2 in m_ht_list[k1].items():
        if k2[0] < p <= k2[1]:
            prev_match = int(np.floor((p - k2[0]) / (k2[1] - k2[0]) * (v2[1] - v2[0]) + v2[0]))
            # The opening match is at least two bases long.
            if prev_match < 2:
                prev_match = 2
    pos += prev_match

    # Select an error, then the step size, and then a match, and so on.
    while pos < middle_ref:
        # Pick the error based on the Markov chain.
        p = random.random()
        for k in trans_p[prev_error].keys():
            if k[0] <= p < k[1]:
                error = trans_p[prev_error][k]
                break

        if error == "mis":
            step = mm.pois_geom(error_p["mis"][0], error_p["mis"][2], error_p["mis"][3])
        elif error == "ins":
            step = mm.wei_geom(error_p[error][0], error_p[error][1], error_p[error][2], error_p[error][3])
            l_new += step
        else:
            step = mm.wei_geom(error_p[error][0], error_p[error][1], error_p[error][2], error_p[error][3])
            l_new -= step

        if error != "ins":
            e_dict[pos] = [error, step]
            pos += step
            if pos >= middle_ref:
                # The error overshot the span: extend both lengths to fit.
                l_new += pos - middle_ref
                middle_ref = pos
        else:
            # Insertions do not consume reference positions.
            e_dict[pos - 0.5] = [error, step]

        prev_error = error

        # Randomly select a match length, conditioned on the previous match.
        for k1 in m_model.keys():
            if k1[0] <= prev_match < k1[1]:
                break
        p = random.random()
        for k2, v2 in m_model[k1].items():
            if k2[0] < p <= k2[1]:
                step = int(np.floor((p - k2[0]) / (k2[1] - k2[0]) * (v2[1] - v2[0]) + v2[0]))
                break
        # There are never two zero-base matches in a row.
        if prev_match == 0 and step == 0:
            step = 1

        prev_match = step
        if pos + prev_match > middle_ref:
            l_new += pos + prev_match - middle_ref
            middle_ref = pos + prev_match

        pos += prev_match
        # A zero-length match switches to the "<error>0" transition state.
        if prev_match == 0:
            prev_error += "0"

    return l_new, middle_ref, e_dict
def error_list(m_ref, m_list, m_ht_list, error_p):
    """Lay out errors and matches between a head match and a tail match.

    A head match and a tail match length are drawn from ``m_ht_list``. The
    region between them is then filled by alternating an error (type and
    size) with a match drawn from ``m_list`` until the position reaches the
    start of the tail match.

    Args:
        m_ref: Initial reference length; the read length starts equal to it.
        m_list: Mapping ``(low, high) -> match length`` sampled with a
            uniform draw for the matches between errors.
        m_ht_list: Mapping of the same shape, sampled for the head and tail
            matches.
        error_p: Mapping from error name to a parameter sequence. "mis" uses
            indices 0, 2, 3 with ``mm.pois_geom``; "ins" and "del" use
            indices 0-3 with ``mm.wei_geom``.

    Returns:
        ``(new_len, ref_end, events)``: the read length after the errors,
        the (possibly extended) reference length, and a dict mapping a
        position to ``[error, size]``. Insertions are keyed by
        ``position - 0.5``, other errors by the position itself.
    """
    # Probability bands for the first error of a read.
    first_error_bands = {(0, 0.51027): "mis", (0.51027, 0.72467): "ins", (0.72467, 1): "del"}
    # Probability bands for an error given the previous error.
    next_error_bands = {
        "mis": {(0, 0.50105): "mis", (0.50105, 0.72018): "ins", (0.72018, 1): "del"},
        "ins": {(0, 0.52186): "mis", (0.52186, 0.82170): "ins", (0.82170, 1): "del"},
        "del": {(0, 0.51752): "mis", (0.51752, 0.65589): "ins", (0.65589, 1): "del"},
    }

    new_len = m_ref
    ref_end = m_ref
    cursor = 0
    events = {}
    prev_error = ""

    # Head match.
    draw = random.random()
    for band in m_ht_list.keys():
        if band[0] <= draw < band[1]:
            span = m_ht_list[band]
            break
    cursor += span

    # Tail match; it is reserved at the end of the reference.
    draw = random.random()
    for band in m_ht_list.keys():
        if band[0] <= draw < band[1]:
            tail_match = m_ht_list[band]
            break

    while cursor < ref_end - tail_match:
        if prev_error == "":
            # First error of the read: use the overall error rates.
            draw = random.random()
            for band in first_error_bands.keys():
                if band[0] <= draw < band[1]:
                    error = first_error_bands[band]
                    break
        elif span != 0:
            # A non-empty match separated the errors: follow the chain.
            draw = random.random()
            for band in next_error_bands[prev_error].keys():
                if band[0] <= draw < band[1]:
                    error = next_error_bands[prev_error][band]
                    break
        elif prev_error == "mis":
            # Adjacent errors: a mismatch is followed by an indel.
            draw = random.random()
            error = "ins" if draw <= 0.44386 else "del"
        elif prev_error in ["ins", "del"]:
            # Adjacent errors: an indel is followed by a mismatch.
            error = "mis"

        if error == "ins":
            ins_p = error_p[error]
            span = mm.wei_geom(ins_p[0], ins_p[1], ins_p[2], ins_p[3])
            new_len += span
            # Insertions do not move the position.
            events[cursor - 0.5] = [error, span]
            if cursor == ref_end - tail_match:
                break
        else:
            if error == "mis":
                mis_p = error_p["mis"]
                span = mm.pois_geom(mis_p[0], mis_p[2], mis_p[3])
            else:
                del_p = error_p[error]
                span = mm.wei_geom(del_p[0], del_p[1], del_p[2], del_p[3])
                new_len -= span
            events[cursor] = [error, span]
            cursor += span
            if cursor >= ref_end - tail_match:
                # The error reached the tail: stretch both lengths and stop.
                new_len += cursor + tail_match - ref_end
                ref_end = cursor + tail_match
                break

        prev_error = error

        # Match that follows the error.
        draw = random.random()
        for band in m_list.keys():
            if band[0] <= draw < band[1]:
                span = m_list[band]
                break

        cursor += span
        if cursor > ref_end - tail_match:
            new_len += cursor + tail_match - ref_end
            ref_end = cursor + tail_match

    return new_len, ref_end, events