예제 #1
0
def map_pair(seq1, seq1_len, seq2, seq2_len, k):
    """Return the distance between the first indexed k-mers of two reads.

    Each read is scanned from left to right and only its first k-mer whose
    representative (``kmer_rep``) is a key of ``kmer_str_dict`` is used.

    Args:
        seq1: First read.
        seq1_len: Length of ``seq1`` used to bound the scan.
        seq2: Second read.
        seq2_len: Length of ``seq2`` used to bound the scan.
        k: k-mer length.

    Returns:
        The absolute difference of the two computed positions, or -1 when
        either read has no indexed k-mer, when the two hits lie on different
        contigs, or when both hits have the same direction flag.
    """

    def first_hit(seq, seq_len):
        # Yields (contig id, computed position, direction flag) for the
        # first indexed k-mer of ``seq``; None when no k-mer is indexed.
        for offset in range(seq_len - k + 1):
            window = seq[offset:offset + k]
            window_rep = kmer_rep(window)
            if window_rep not in kmer_str_dict:
                continue
            info = kmer_str_dict[window_rep]
            contig = info.contig_id
            if (window_rep == window) == info.sense_in_contig:
                # Orientation agrees with the stored one: shift the stored
                # position back by the read offset.
                return contig, info.pos_in_contig - offset, 1
            # Orientation disagrees: position is stored position + k + offset.
            return contig, info.pos_in_contig + k + offset, 0
        return None

    hit1 = first_hit(seq1, seq1_len)
    if hit1 is None:
        return -1
    hit2 = first_hit(seq2, seq2_len)
    if hit2 is None:
        return -1

    c1, p1, d1 = hit1
    c2, p2, d2 = hit2
    # Reject hits on different contigs and hits with equal direction flags.
    if c1 != c2 or d1 == d2:
        return -1
    return abs(p1 - p2)
예제 #2
0
파일: ecs.py 프로젝트: felixlyd/py-kallisto
def fix_contigs(k):
    """Split contigs at the interval boundaries recorded for transcripts.

    A contig is left untouched when every entry of
    ``contig_find_trans_list[contig_id]`` spans ``[0, n_of_kmer)``.
    Otherwise the contig is cut at the sorted, distinct ``start_in_tran`` /
    ``stop_in_tran`` values. The first segment replaces the original slot
    in ``contig_list`` and ``contig_find_trans_list``; every further segment
    is appended to both lists, and ``ecs_list`` gets a ``-1`` placeholder
    for it. The ``kmer_str_dict`` entries of each segment's k-mers are
    re-pointed to the segment (id, position, k-mer count).

    Only contigs that exist when the function is entered are examined;
    segments appended during the run are not revisited.

    Args:
        k: k-mer length.

    Returns:
        None. All results are written to the module-level containers.
    """
    for contig_id in range(len(contig_find_trans_list)):
        old_trans_info = contig_find_trans_list[contig_id]
        old_contig = contig_list[contig_id]
        full_length = old_contig.n_of_kmer
        if all(info.start_in_tran == 0 and info.stop_in_tran == full_length
               for info in old_trans_info):
            # Every transcript covers the whole contig: nothing to split.
            continue

        break_points = sorted({
            point
            for info in old_trans_info
            for point in (info.start_in_tran, info.stop_in_tran)
        })
        old_contig_seq = old_contig.seq
        for j in range(len(break_points) - 1):
            seg_start = break_points[j]
            seg_stop = break_points[j + 1]

            # A fresh Contig per segment: reusing one object would make all
            # segments stored in contig_list alias the last one written.
            new_contig = Contig()
            # Python slices take an END index (not a length): a segment of
            # (seg_stop - seg_start) k-mers ends at seg_stop + k - 1.
            new_contig.seq = old_contig_seq[seg_start:seg_stop + k - 1]
            new_contig.n_of_kmer = seg_stop - seg_start
            if j == 0:
                # The first segment takes over the original contig's slot.
                new_contig.id = contig_id
                contig_list[contig_id] = new_contig
            else:
                new_contig.id = len(contig_list)
                contig_list.append(new_contig)
                ecs_list.append(-1)

            # Re-point every k-mer of the segment to its new contig.
            for m in range(len(new_contig.seq) - k + 1):
                curr_rep = kmer_rep(new_contig.seq[m:m + k])
                if curr_rep in kmer_str_dict:
                    kmer_info = kmer_str_dict[curr_rep]
                    kmer_info.contig_id = new_contig.id
                    kmer_info.pos_in_contig = m
                    kmer_info.n_of_kmer_in_contig = new_contig.n_of_kmer
                else:
                    program_stop("ecs.py-4")

            # Keep the transcripts whose interval overlaps this segment;
            # by construction they now span the whole segment.
            new_trans_info_list = []
            for each_trans_info in old_trans_info:
                if (each_trans_info.start_in_tran < seg_stop
                        and each_trans_info.stop_in_tran > seg_start):
                    new_trans_info = ContigFindTrans()
                    new_trans_info.sense_in_tran = each_trans_info.sense_in_tran
                    new_trans_info.tran_id = each_trans_info.tran_id
                    new_trans_info.start_in_tran = 0
                    new_trans_info.stop_in_tran = new_contig.n_of_kmer
                    new_trans_info_list.append(new_trans_info)
            if j == 0:
                contig_find_trans_list[contig_id] = new_trans_info_list
            else:
                contig_find_trans_list.append(new_trans_info_list)
예제 #3
0
def fwstep(kmer):
    """Try to take one unambiguous forward step from ``kmer``.

    The step succeeds when exactly one candidate produced by
    ``contig_fw(kmer)`` is indexed in ``kmer_str_dict``, exactly one
    candidate produced by ``contig_bw`` of that successor is indexed, and
    the successor differs from ``kmer``.

    Args:
        kmer: The k-mer to step from.

    Returns:
        1 on success, with the successor stored in the global
        ``fw_step_kmer``; 0 otherwise, leaving ``fw_step_kmer`` untouched.
    """
    global fw_step_kmer

    successor = None
    forward_hits = 0
    for candidate in contig_fw(kmer):
        if kmer_rep(candidate) not in kmer_str_dict:
            continue
        forward_hits += 1
        if forward_hits > 1:
            # More than one indexed successor: the path branches here.
            return 0
        successor = candidate
    if forward_hits != 1:
        return 0

    # Count the indexed predecessors of the successor (all are inspected).
    backward_hits = 0
    for candidate in contig_bw(successor):
        if kmer_rep(candidate) in kmer_str_dict:
            backward_hits += 1
    if backward_hits != 1:
        return 0

    if successor != kmer:
        fw_step_kmer = successor
        return 1
    return 0
예제 #4
0
파일: ecs.py 프로젝트: felixlyd/py-kallisto
def build_ecs(seqs, k):
    """Assign equivalence classes (ECs) to contigs from the given sequences.

    Works in three passes over module-level state:

    1. For every sequence ``i`` a singleton EC ``[i]`` is registered in
       ``ec_map`` / ``ec_inv_dict``; the sequence is then walked k-mer by
       k-mer and, for every contig it runs through, a ``ContigFindTrans``
       record (transcript id, orientation, [start, stop) interval within the
       contig) is collected. The records are copied into
       ``contig_find_trans_list`` and ``fix_contigs(k)`` is called.
    2. For every contig the sorted, distinct transcript ids are looked up in
       ``ec_inv_dict`` (a new EC is created when absent); the EC id is
       stored in ``ecs_list`` and in the contig's ``ecs_id``.
    3. Every sequence is walked again, contig by contig, and a
       ``ContigIncludeTrans`` record (transcript id, position in the
       transcript, orientation) is appended to each contig's
       ``include_trans``.

    Args:
        seqs: Indexable collection of sequences; each item is sliced and the
            slice converted with ``str``.
        k: k-mer length.

    Returns:
        None. All results are written to module-level containers.
    """
    contig_find_trans_dict = {}
    for i in range(len(contig_list)):
        contig_find_trans_dict[i] = []
    # Iterate over the gene sequences
    for i in range(len(seqs)):
        # Each sequence starts with its own singleton equivalence class.
        each_ec = [i]
        ec_map.append(each_ec)
        ec_inv_dict[i] = each_ec
        seq = seqs[i]
        n_of_kmers_of_seq = len(seq) - k + 1
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = seq[j:j + k]
            curr_kmer = str(curr_kmer)
            curr_rep = kmer_rep(curr_kmer)
            curr_twin = kmer_twin(curr_kmer)
            if curr_rep in kmer_str_dict:
                curr_contig_id = kmer_str_dict[curr_rep].contig_id
                curr_contig = contig_list[curr_contig_id]

                kmers_in_contig = []
                # Enumerate every k-mer in curr_contig
                for m in range(len(curr_contig.seq) - k + 1):
                    curr_kmer_in_contig = curr_contig.seq[m:m + k]
                    kmers_in_contig.append(curr_kmer_in_contig)
                kmers_in_contig_len = len(kmers_in_contig)

                contig_find_trans = ContigFindTrans()
                contig_find_trans.tran_id = i
                if (curr_kmer == curr_rep
                    ) == kmer_str_dict[curr_rep].sense_in_contig:
                    # The sequence runs through the contig in the contig's
                    # own orientation.
                    contig_find_trans.sense_in_tran = 1
                    contig_find_trans.start_in_tran = kmers_in_contig.index(
                        curr_kmer)
                    if kmers_in_contig_len - contig_find_trans.start_in_tran > n_of_kmers_of_seq - j:
                        # The rest of the contig is longer than the rest of
                        # the sequence: the contig does not come (entirely)
                        # from this seq
                        contig_find_trans.stop_in_tran = contig_find_trans.start_in_tran + n_of_kmers_of_seq - j
                        # [start, stop): this length deliberately does not
                        # match the contig length; that mismatch is what is
                        # used later
                        j = n_of_kmers_of_seq
                        # Jump to the end, i.e. this sequence is finished
                    else:
                        contig_find_trans.stop_in_tran = kmers_in_contig_len
                        # The contig belongs to this seq and the interval
                        # runs to the end of the contig
                        j = j + contig_find_trans.stop_in_tran - contig_find_trans.start_in_tran
                        # Continue with the contig that immediately follows
                        # the current one
                else:
                    # The sequence runs through the contig in reverse
                    # orientation, so the twin is what appears in the contig.
                    contig_find_trans.sense_in_tran = 0
                    contig_find_trans.stop_in_tran = kmers_in_contig.index(
                        curr_twin) + 1
                    # The interval is [start, stop): stop is exclusive, so
                    # the last index actually covered is stop - 1
                    if contig_find_trans.stop_in_tran > n_of_kmers_of_seq - j:
                        contig_find_trans.start_in_tran = contig_find_trans.stop_in_tran - n_of_kmers_of_seq + j
                        j = n_of_kmers_of_seq
                    else:
                        contig_find_trans.start_in_tran = 0
                        j = j + contig_find_trans.stop_in_tran - contig_find_trans.start_in_tran
                contig_find_trans_dict[curr_contig_id].append(
                    contig_find_trans)
            else:
                # NOTE(review): j is not advanced on this path, so the loop
                # only terminates if program_stop does not return - confirm.
                print(curr_rep)
                print(j)
                program_stop("ecs.py-2")
    for i in range(len(contig_list)):
        contig_find_trans_list.append(contig_find_trans_dict[i])
    fix_contigs(k)
    # Second pass: map every contig to the EC made of its transcript ids.
    for curr_contig_id in range(len(contig_find_trans_list)):
        curr_contig_list = []
        for curr_trans_info in contig_find_trans_list[curr_contig_id]:
            curr_contig_list.append(curr_trans_info.tran_id)
        curr_contig_list = sorted(list(set(curr_contig_list)))
        ec = -1
        if curr_contig_list in ec_inv_dict.values():
            # Reuse the existing EC; this is a linear scan over all ECs.
            for (key, value) in ec_inv_dict.items():
                if value == curr_contig_list:
                    ec = key
        else:
            # Unknown transcript set: register it as a new EC.
            ec = len(ec_inv_dict)
            ec_inv_dict[ec] = curr_contig_list
            ec_map.append(curr_contig_list)
        ecs_list[curr_contig_id] = ec
        contig_list[curr_contig_id].ecs_id = ec
    # Third pass: record, per contig, which transcripts contain it and where.
    for i in range(len(seqs)):
        seq = seqs[i]
        n_of_kmers_of_seq = len(seq) - k + 1
        # NOTE(review): tmp_str is only referenced by the disabled code below.
        tmp_str = ""
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = seq[j:j + k]
            curr_kmer = str(curr_kmer)
            curr_rep = kmer_rep(curr_kmer)
            if curr_rep in kmer_str_dict:
                curr_kmer_info = kmer_str_dict[curr_rep]
                contig_include_trans = ContigIncludeTrans()
                contig_include_trans.tran_id = i
                contig_include_trans.pos_in_tran = j
                contig_include_trans.sense_in_tran = int(((
                    curr_kmer == curr_rep) == curr_kmer_info.sense_in_contig))
                contig_list[curr_kmer_info.contig_id].include_trans.append(
                    contig_include_trans)
                # Skip over the whole contig in one step.
                j = j + contig_list[curr_kmer_info.contig_id].n_of_kmer
                # Disabled code, kept from the original:
                # if contig_include_trans.sense_in_tran:
                #     if contig_include_trans.pos_in_tran == 0:
                #         tmp_str = tmp_str + contig_list[curr_kmer_info.contig_id].seq
                #     else:
                #         tmp_str = tmp_str + contig_list[curr_kmer_info.contig_id].seq[k - 1]
                # else:
                #     new_tm_str = str_pair(contig_list[curr_kmer_info.contig_id].seq)
                #     if contig_include_trans.pos_in_tran == 0:
                #         tmp_str = tmp_str + new_tm_str
                #     else:
                #         tmp_str = tmp_str + new_tm_str[k - 1]
            else:
                program_stop("ecs.py-3")
예제 #5
0
def match(curr_read, curr_read_len, k):
    """Collect the indexed k-mers of a read together with their positions.

    The read is scanned from left to right. For every k-mer whose
    representative is a key of ``kmer_str_dict`` a ``(info, position)``
    tuple is appended to the result. When ``get_dist`` reports a distance
    greater than 2, the k-mer at ``pos + dist`` (clamped to the last k-mer
    of the read) is probed, then possibly the middle k-mer, and finally a
    back-off scan is run.

    Args:
        curr_read: The read; it is sliced into k-mers.
        curr_read_len: Length of the read used for all bounds checks.
        k: k-mer length.

    Returns:
        A list of ``(kmer_info, position)`` tuples.
    """
    # Disabled progress counter, kept from the original:
    # global num_read
    # num_read = num_read + 1
    # if num_read %2 == 0:
    #     print(num_read)
    v = []
    i = 0
    again = 0
    while i < curr_read_len - k + 1:
        kit = curr_read[i:i + k]
        kit_rep = kmer_rep(kit)
        if kit_rep in kmer_str_dict:
            curr_kit_info = kmer_str_dict[kit_rep]
            pos = i
            v.append((curr_kit_info, pos))
            # get_dist is defined elsewhere; presumably the number of k-mers
            # left on the contig in read direction - TODO confirm.
            dist = get_dist(int((kit == kit_rep)), curr_kit_info)
            if dist > 2:
                pos2 = pos + dist
                if pos2 + k >= curr_read_len:
                    # Out of bounds: probe the last k-mer of the read instead
                    pos2 = curr_read_len - k
                kit2 = curr_read[pos2:pos2 + k]
                kit2_rep = kmer_rep(kit2)
                if kit2_rep in kmer_str_dict:
                    # Probe is indexed: accept it only if it lies on the
                    # same contig as the current k-mer.
                    found2 = 0
                    found2pos = pos + dist
                    kit2_info = kmer_str_dict[kit2_rep]
                    if curr_kit_info.contig_id == kit2_info.contig_id:
                        found2 = 1
                        found2pos = pos + dist
                else:
                    # Probe is not indexed at all: treated as accepted, but
                    # the recorded position stays at the current k-mer.
                    found2 = 1
                    found2pos = pos
                if found2:
                    if found2pos >= curr_read_len - k:
                        # The jump reaches the last k-mer: record it and stop.
                        v.append((curr_kit_info, curr_read_len - k))
                        return v
                    else:
                        # NOTE(review): i is not moved to the probe position
                        # here; scanning resumes at i + 1 - confirm intended.
                        v.append((curr_kit_info, found2pos))
                else:
                    # The probe is on a different contig: try the k-mer
                    # half-way between the two positions.
                    found_middle = 0
                    if dist > 4:
                        middle_pos = int((pos + pos2) / 2)
                        found3pos = pos + dist
                        kit3 = curr_read[middle_pos:middle_pos + k]
                        kit3_rep = kmer_rep(kit3)
                        if kit3_rep in kmer_str_dict:
                            kit3_info = kmer_str_dict[kit3_rep]
                            middle_contig_id = kit3_info.contig_id
                            if middle_contig_id == curr_kit_info.contig_id:
                                found_middle = 1
                                found3pos = middle_pos
                            elif middle_contig_id == kit2_info.contig_id:
                                found_middle = 1
                                found3pos = pos + dist
                            else:
                                pass
                            if found_middle:
                                v.append((kit3_info, found3pos))
                                if pos2 >= curr_read_len - k:
                                    return v
                                else:
                                    # Continue after the probe; the i + 1 at
                                    # the bottom of the loop still applies.
                                    i = pos2
                            else:
                                pass
                        else:
                            pass
                    else:
                        pass
                    if not found_middle:
                        i = i + 1
                        again = 1
                        # Back-off scan. `again` was just set, so this
                        # branch is always taken.
                        if again:
                            j = 0
                            m = 0
                            # NOTE(review): the scan starts at m = 0 (the
                            # beginning of the read), not at the current
                            # position, and returns from the function once
                            # m reaches pos2 - confirm intended.
                            while j < curr_read_len and m < curr_read_len:
                                # `skip` is not defined in this function; it
                                # must come from module scope. A k-mer is
                                # checked whenever the counter j is 0, and j
                                # is reset when it equals skip.
                                if j == skip:
                                    j = 0
                                if j == 0:
                                    again_kit = curr_read[m:m + k]
                                    again_kit_rep = kmer_rep(again_kit)
                                    if again_kit_rep in kmer_str_dict:
                                        again_kit_info = kmer_str_dict[
                                            again_kit_rep]
                                        v.append((again_kit_info, m))
                                if m == pos2:
                                    again = 0
                                    return v
                                m = m + 1
                                j = j + 1
            else:
                pass
        else:
            pass
        i = i + 1
    return v
예제 #6
0
def build_dbg(seqs, k):
    """Index all k-mers of ``seqs`` and chain them into contigs.

    Every distinct representative k-mer gets a ``KmerInfo`` entry in
    ``kmer_str_dict``. Each k-mer not yet assigned to a contig
    (``contig_id == -1``) then seeds a new contig: the chain is extended
    with ``fwstep`` from the seed and, unless a loop was detected, from the
    seed's twin as well. The resulting ``Contig`` is appended to
    ``contig_list`` and a ``-1`` placeholder to ``ecs_list``.

    Args:
        seqs: Iterable of sequences; each slice is converted with ``str``.
        k: k-mer length.

    Returns:
        None. Results are written to module-level containers.
    """
    global fw_step_kmer

    # Cut the sequences into k-mers and keep the distinct representatives.
    distinct_reps = {
        kmer_rep(str(seq[start:start + k]))
        for seq in seqs
        for start in range(len(seq) - k + 1)
    }

    # Attach an info record to every k-mer.
    for rep in distinct_reps:
        kmer_str_dict[rep] = KmerInfo()

    # Chain the k-mers into contigs.
    for seed in kmer_str_dict:
        if kmer_str_dict[seed].contig_id != -1:
            # Already placed in a contig by an earlier seed.
            continue

        closes_on_itself = 0
        seed_twin = kmer_twin(seed)

        # Walk forward from the seed.
        forward_chain = [seed]
        previous = seed
        fw_step_kmer = seed
        while fwstep(fw_step_kmer):
            if fw_step_kmer == seed:
                # Back at the seed: the chain forms a loop.
                closes_on_itself = 1
                break
            if fw_step_kmer == seed_twin:
                # Reached the seed's twin; counts as a loop only when the
                # chain already holds more than the seed.
                closes_on_itself = (len(forward_chain) > 1)
                break
            if fw_step_kmer == kmer_twin(previous):
                break
            forward_chain.append(fw_step_kmer)
            previous = fw_step_kmer

        # Walk forward from the twin, i.e. backward from the seed.
        fw_step_kmer = seed_twin
        backward_chain = []
        previous = seed_twin
        if not closes_on_itself:
            while fwstep(fw_step_kmer):
                if fw_step_kmer in (seed_twin, seed):
                    break
                if fw_step_kmer == kmer_twin(previous):
                    break
                backward_chain.append(fw_step_kmer)
                previous = fw_step_kmer

        # Backward part (reversed, as twins) followed by the forward part.
        members = [kmer_twin(bw) for bw in reversed(backward_chain)]
        members.extend(forward_chain)
        member_count = len(members)

        new_id = len(contig_list)
        contig = Contig()
        contig.id = new_id
        contig.n_of_kmer = member_count

        # Store per-k-mer information and spell out the contig sequence.
        for position, member in enumerate(members):
            member_rep = kmer_rep(member)
            if member_rep in kmer_str_dict:
                entry = kmer_str_dict[member_rep]
                entry.contig_id = new_id
                entry.pos_in_contig = position
                entry.n_of_kmer_in_contig = member_count
                entry.sense_in_contig = 1 if member_rep == member else 0
            else:
                program_stop("contig.py")
            if position == 0:
                contig.seq = member
            else:
                # Each further k-mer contributes only its last character.
                contig.seq = contig.seq + member[-1]

        contig_list.append(contig)
        ecs_list.append(-1)