Example #1
def duplicate_trim_set_with_2nd_set(dict_target_allele_SEQ, dict_fixed_allele_SEQ, ext_flag=True, ext_thrd=0.70, ori_flag=False):
    # If a target_SEQ is a subsequence of any fixed_SEQ, the target_SEQ is dropped.
    # If ext_flag is True and a fixed_SEQ is a subsequence of a target_SEQ, the
    # target_SEQ is kept and its name is changed to fixed_name + '/extend-' serial.
    # ext_flag only takes effect when len(fixed_SEQ) >= len(target_SEQ) * ext_thrd.
    dict_trimmed_allele_SEQ = {}
    for (t_name, t_SEQ) in dict_target_allele_SEQ.items():
        assign_name = t_name
        for (f_name, f_SEQ) in dict_fixed_allele_SEQ.items():
            if t_SEQ.upper() in f_SEQ.upper() or t_SEQ.upper() in get_reverse_complement(f_SEQ.upper()): # t_SEQ is the subseq
                assign_name = False
                break
            elif f_SEQ.upper() in t_SEQ.upper() or f_SEQ.upper() in get_reverse_complement(t_SEQ.upper()): # an f_SEQ is the subseq
                if ext_flag and len(f_SEQ) >= len(t_SEQ)*ext_thrd:
                    assign_name = assign_new_name(f_name, '/extend-', dict_trimmed_allele_SEQ)
                else:
                    assign_name = False
        if assign_name:
            if ori_flag:
                dict_trimmed_allele_SEQ[t_name] = t_SEQ
            else:
                if 'extend' in assign_name:
                    dict_trimmed_allele_SEQ[assign_name] = t_SEQ
                else:
                    assign_name = assign_new_name(assign_name, '/novel-', dict_trimmed_allele_SEQ)
                    dict_trimmed_allele_SEQ[assign_name] = t_SEQ
    return dict_trimmed_allele_SEQ
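
Every example on this page calls a get_reverse_complement helper that the snippets themselves never define. A minimal sketch of such a helper (an illustrative stand-in, not the original implementation) could look like this:

_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')

def get_reverse_complement(seq):
    # Swap each base for its complement, then reverse the sequence.
    return str(seq).translate(_COMPLEMENT)[::-1]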
Example #2
def correct_allele(dict_occupied_place, dict_SEQ, dict_corrected_alleles,
                   dict_flanking_alleles, dict_contig, len_extend):
    # dict_corrected_alleles {}
    #  - keys: allele_name
    #  - values: corrected_SEQ_set {corrected_SEQ_1, corrected_SEQ_2}

    # dict_flanking_alleles {}
    #  - keys: allele_name
    #  - values: flanking_SEQ_set {flanking_SEQ_1, flanking_SEQ_2}
    for contig_name, list_contig in dict_occupied_place.items():
        contig_SEQ = dict_contig[contig_name]
        contig_len = len(contig_SEQ)
        for pairs in list_contig:
            pos_start = pairs[0]
            pos_end = pairs[1]
            allele_name = pairs[3]
            try:
                allele_name = allele_name.split('|')[1]
            except IndexError:  # no '|' delimiter in the name
                allele_name = allele_name.split()[0]
            flag = pairs[4]
            flanking_SEQ = contig_SEQ[max(0, pos_start - len_extend -
                                          1):min(contig_len, pos_end +
                                                 len_extend - 1)].lower()
            if pairs[2] != 0:  # mismatched alleles but remain in contig
                corrected_SEQ = contig_SEQ[pos_start - 1:pos_end - 1].lower()
                if flag % 32 >= 16:
                    corrected_SEQ = get_reverse_complement(corrected_SEQ)
                    flanking_SEQ = get_reverse_complement(flanking_SEQ)

                if dict_corrected_alleles.get(allele_name):
                    dict_corrected_alleles[allele_name].add(corrected_SEQ)
                else:
                    dict_corrected_alleles[allele_name] = {corrected_SEQ}

                flanking_name = allele_name + "/novel"
                if dict_flanking_alleles.get(flanking_name):
                    dict_flanking_alleles[flanking_name].add(flanking_SEQ)
                else:
                    dict_flanking_alleles[flanking_name] = {flanking_SEQ}
            else:
                if flag % 32 >= 16:
                    flanking_SEQ = get_reverse_complement(flanking_SEQ)
                flanking_name = allele_name
                if dict_flanking_alleles.get(flanking_name):
                    dict_flanking_alleles[flanking_name].add(flanking_SEQ)
                else:
                    dict_flanking_alleles[flanking_name] = {flanking_SEQ}

    return dict_corrected_alleles, dict_flanking_alleles
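
The flag % 32 >= 16 tests above check bit 0x10 of the SAM flag, which marks an alignment on the reverse strand. A small, hypothetical helper makes the intent explicit; both forms are equivalent because flag % 32 keeps only the five lowest flag bits:

def is_reverse_strand(flag):
    # Bit 0x10 of a SAM flag marks a reverse-strand alignment.
    return bool(flag & 0x10)

assert is_reverse_strand(16) and is_reverse_strand(17)
assert not is_reverse_strand(0) and not is_reverse_strand(32)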
Example #3
def get_joint_entropy_profile_per_sequence(seq, w, alias, out=None):
    """
    sliding window entropy profile of all sequences in a family
    :param fasta: a fasta file contatining viral sequences
    :param w: the window size
    :param out: optional. if != None a profile will be saved as a png
    :return: the vector of profile entropy
    """
    entropies = []
    genome = seq

    for j in range(len(genome) - w):
        sub_genome = genome[j:j + w]
        try:
            rc_sub_genome = get_reverse_complement(sub_genome)
            entropy = joint_entropy(sub_genome, rc_sub_genome, 5)
            entropies.append(entropy)
        except Exception:  # stop at the first window that cannot be scored
            break

    df = pd.DataFrame({'{}'.format(alias): entropies})
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)),
                  index=False)

    return df
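
A hypothetical call, assuming joint_entropy() from the same module is available:

profile = get_joint_entropy_profile_per_sequence('ACGT' * 200, w=50, alias='demo')
print(profile.head())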
Example #4
def get_joint_entropy_profile(fasta, w, out=None):
    """
    sliding window entropy profile of all sequences in a family
    :param fasta: a fasta file contatining viral sequences
    :param w: the window size
    :param out: optional. if != None a profile will be saved as a png
    :return: the vector of profile entropy
    """
    all_entropies = {}
    alias = os.path.basename(fasta).split('.')[0]

    for i, rec in enumerate(SeqIO.parse(fasta, "fasta")):
        entropies = []
        # get identifier and genomic sequence

        genome = str(rec.seq)

        for j in range(len(genome) - w):
            sub_genome = genome[j:j + w]
            rc_sub_genome = str(get_reverse_complement(sub_genome))
            entropy = joint_entropy(sub_genome, rc_sub_genome, 5)
            entropies.append(entropy)

        print('Done with seq {}'.format(i))
        all_entropies['seq_{}'.format(i)] = entropies

    df = pd.DataFrame(
        dict([(k, pd.Series(v)) for k, v in all_entropies.items()]))

    if out is not None:
        df.to_csv(os.path.join(out, '{}_Joint_profile.csv'.format(alias)),
                  index=False)

    return df
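
Note that wrapping each list in pd.Series before building the DataFrame lets profiles of different lengths share one table; shorter columns are padded with NaN. A minimal illustration:

import pandas as pd

# Series alignment pads the shorter column with NaN instead of raising an error.
df = pd.DataFrame({'a': pd.Series([1, 2, 3]), 'b': pd.Series([4, 5])})
print(df)
#    a    b
# 0  1  4.0
# 1  2  5.0
# 2  3  NaN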
Example #5
def simulate_genome_by_composition(p, n, size, mode):
    """
    simulate genomes of changing nucleotide compositions
    :param p: the proportions of each character
    :param n: length of simulated sequence
    :param size: number of sequences to simulate
    :param mode: the mode of simulation: 1= no structure, 2= structure
    :return: sequences and corresponding names
    """

    sequences = []
    names = []
    if mode == 1:
        # no structure: i.i.d. nucleotides drawn with composition p
        for i in tqdm(range(size)):
            seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=p, size=n))
            sequences.append(seq)
            names.append('mode_{}_seq_{}'.format(mode, i))

    else:
        # only structure - generate a perfect stem loop
        for i in tqdm(range(size)):
            seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=p, size=n // 2))
            seq = seq + 'aaaaaa' + str(get_reverse_complement(seq))
            sequences.append(seq)
            names.append('mode_{}_seq_{}'.format(mode, i))

    return sequences, names
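
A hypothetical call: ten 100-nt sequences with an AT-rich composition and no structure:

seqs, names = simulate_genome_by_composition(
    p=[0.4, 0.1, 0.1, 0.4], n=100, size=10, mode=1)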
Example #6
def get_SEQ_from_sam_list(list_fields, dict_SEQ):
    for fields in list_fields:
        if fields[9] != '*':
            name = fields[0]
            flag = int(fields[1])
            if name in dict_SEQ:
                continue
            if flag % 32 >= 16:  # SEQ is stored reverse-complemented
                dict_SEQ[name] = get_reverse_complement(fields[9])
            else:  # SEQ is stored in its original orientation
                dict_SEQ[name] = fields[9]
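
A hypothetical usage sketch (aln.sam is a placeholder path): split each non-header SAM line into fields before passing the list in:

dict_SEQ = {}
with open('aln.sam') as f:  # placeholder path
    list_fields = [line.split() for line in f if not line.startswith('@')]
get_SEQ_from_sam_list(list_fields, dict_SEQ)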
Example #7
def simulate_genome_by_drops(size, w, genome_size=5000):
    """
    simulate genomes of changing nucleotide compositions
    :param n: length of simulated sequence
    :param size: number of sequences to simulate
    :param w: the drop size
    :return: sequences and corresponding names
    """

    metagenome = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.25,0.25,0.25,0.25], size=genome_size))

    sequences = []
    names = []

    # simulate size genomes
    for i in tqdm(range(size)):
        # drops 1 - homogeneous sequence
        drop_1 = np.random.choice(['a', 'c', 'g', 't']) * w

        # drop 2 - repetitive and structure
        letter1 = np.random.choice(['a', 'c', 'g', 't'])
        letter2 = np.random.choice([x for x in ['a', 'c', 'g', 't'] if x != letter1])   # we want a different nuc.

        drop_2 = letter1 * (w // 2) + letter2 * (w // 2)

        # drop 3 - pure structure
        stem_arm = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.25,0.25,0.25,0.25], size=w//2 - 5))
        loop = np.random.choice(['a', 'c', 'g', 't']) * 10  # loop of 10 nuc.

        drop_3 = stem_arm + loop + str(get_reverse_complement(stem_arm))

        # drop 4 - bias in nucleotide composition
        nucs = ['a', 'c', 'g', 't']
        np.random.shuffle(nucs)
        drop_4 = ''.join(np.random.choice(nucs, p=[0.6,0.2,0.1,0.1], size=w))


        # insert genomes to metagenome and save the indices.
        simulated_genome = metagenome[:1000] + drop_1 + metagenome[1000:2000] + drop_2 +\
            metagenome[2000:3000] + drop_3 + metagenome[3000:4000] + drop_4 + metagenome[4000:]

        sequences.append(simulated_genome)
        names.append('seq_{}'.format(i))

    return sequences, names
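
A hypothetical call: five simulated metagenomes, each carrying four 100-nt drops inserted into a 5,000-nt random background:

seqs, names = simulate_genome_by_drops(size=5, w=100)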
Example #8
    dict_o_allele_SEQ = parse_fasta(fn_original_alleles)
    dict_c_allele_SEQ = parse_fasta(fn_corrected_alleles)

    dict_ref_trimmed_allele_SEQ = duplicate_trim_set_with_2nd_set(dict_c_allele_SEQ, dict_o_allele_SEQ)
    dict_shrink = {}
    for name, SEQ in dict_ref_trimmed_allele_SEQ.items():
        dict_shrink[name+'_prefix'] = SEQ[:-1]
        dict_shrink[name+'_suffix'] = SEQ[1:]
    dict_self_trimmed_allele_SEQ = duplicate_trim_set_with_2nd_set(dict_ref_trimmed_allele_SEQ, dict_shrink, ext_flag=True, ext_thrd=0, ori_flag=True)
    set_SEQ = set()
    for name, SEQ in sorted(dict_self_trimmed_allele_SEQ.items()):
        if SEQ.upper() in set_SEQ:
            dict_self_trimmed_allele_SEQ.pop(name)
        else:
            set_SEQ.add(SEQ.upper())
            set_SEQ.add(get_reverse_complement(SEQ.upper()))

    f_of = open(fo_filtered_alleles, 'w')
    f_oe = open(fo_extended_alleles, 'w')
    for allele_name in sorted(dict_self_trimmed_allele_SEQ.keys()):
        if 'extend' in allele_name:
            f_oe.write(">" + allele_name + '\n')
            f_oe.write(dict_self_trimmed_allele_SEQ[allele_name] + '\n')
        else:
            f_of.write(">" + allele_name + '\n')
            f_of.write(dict_self_trimmed_allele_SEQ[allele_name] + '\n')
    f_of.close()
    f_oe.close()


Example #9
def get_kmers_distribution(fasta, k, out=None):
    """
    get the kmers distribution plot for each family separately
    :param fasta: fasta file
    :param k: the kmer length
    :return: saves the plot
    """

    alias = os.path.basename(fasta).split('.')[0]
    all_values = []

    for rec in SeqIO.parse(fasta, "fasta"):
        # get identifier and genomic sequence
        genome = rec.seq
        rc_genome = str(get_reverse_complement(genome))

        kmers_1 = {}
        kmers_2 = {}

        if k == 5:
            # sliding window of k
            for i in range(len(genome) - k):
                kmer = genome[i:i + k]
                if kmer in kmers_1:
                    kmers_1[kmer] += 1
                else:
                    kmers_1[kmer] = 1

            # for i in range(len(rc_genome) - k):
            #     kmer = rc_genome[i:i+k]
            #     if kmer in kmers_2:
            #         kmers_2[kmer] += 1
            #     else:
            #         kmers_2[kmer] = 1

        elif k == 3:
            # reading frame
            for i in range(0, len(genome) - 3, 3):
                kmer = genome[i:i + 3]
                if kmer in kmers_1:
                    kmers_1[kmer] += 1
                else:
                    kmers_1[kmer] = 1

            # for i in range(0, len(rc_genome) - 3, 3):
            #     kmer = rc_genome[i:i + 3]
            #     if kmer in kmers_2:
            #         kmers_2[kmer] += 1
            #     else:
            #         kmers_2[kmer] = 1
        else:
            assert (k == 1)
            codon_trimmed = string_by_codon_position(genome, 2)
            # rc_codon_trimmed = get_reverse_complement(seq)
            for i in range(len(codon_trimmed)):
                kmer = codon_trimmed[i]
                if kmer in kmers_1:
                    kmers_1[kmer] += 1
                else:
                    kmers_1[kmer] = 1

            # for i in range(len(rc_codon_trimmed)):
            #     kmer = rc_codon_trimmed[i]
            #     if kmer in kmers_2:
            #         kmers_2[kmer] += 1
            #     else:
            #         kmers_2[kmer] = 1

        # create one dictionary for all kmers
        all_kmers = {
            x: kmers_1.get(x, 0) + kmers_2.get(x, 0)
            for x in set(kmers_1) | set(kmers_2)
        }
        values = [int(x) for x in all_kmers.values()]
        all_values.append(all_kmers)
        if out is not None:
            # sns.distplot(values, hist=False, kde_kws={'shade':True})
            plt.hist(values, alpha=0.8, density=True)  # 'normed' was removed in matplotlib 3.x

    if out is not None:
        plt.title('Distribution of kmers {}'.format(alias), fontsize=18)
        plt.xlabel('# kmer appearances', fontsize=18)
        plt.ylabel('Count', fontsize=18)
        sns.despine(offset=10)
        plt.savefig(os.path.join(
            out, '{}_kmers_distribution_hist_normed.png'.format(alias)),
                    format='png',
                    dpi=400,
                    bbox_inches='tight')
        plt.gcf().clear()

    return all_values
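
A hypothetical call, saving one histogram per FASTA record into an existing directory (both paths are placeholders):

counts = get_kmers_distribution('family.fasta', k=5, out='plots')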
Example #10
def parse_edit_distance(fn_sam,
                        fn_output_file,
                        fn_output_flanking_region,
                        fn_output_flanking_size,
                        dict_contig,
                        cluster_id,
                        thrsd=0,
                        flanking_size=100):
    f_report = open(fn_output_file, 'a')
    f_flank = open(fn_output_flanking_region, 'a')
    f_flank_size = open(fn_output_flanking_size, 'a')
    with open(fn_sam, 'r') as f_o:
        for line in f_o:
            if line[0] != '@':  # real alignment information
                fields = line.split()
                #print(fields[11])
                eDist = int(fields[11].split(':')[2])  # assumes the NM:i: tag is the 12th field
                cigar = fields[5]
                if 'S' in cigar:
                    continue
                contig_name = fields[2]
                if contig_name != '*' and eDist <= thrsd:
                    print_word = fields[0] + ' ' + contig_name.split('_')[
                        2] + ' ' + contig_name + ' ' + str(cluster_id) + '\n'
                    #print_word = fields[0] + '\t' + fields[2] + '\t' + fields[11]
                    f_report.write(print_word)

                    if dict_contig.get(contig_name):
                        contig_SEQ = dict_contig[contig_name]
                        allele_name = fields[0]
                        allele_print = allele_name + '_cluster_' + str(cluster_id)
                        f_flank.write('>' + allele_print + '\n')
                        if (int(fields[1]) % 32) >= 16:
                            f_flank.write(
                                get_reverse_complement(contig_SEQ) + '\n')
                            print(
                                str(
                                    len(contig_SEQ) - int(fields[3]) -
                                    len(fields[9]) + 1) + '-' +
                                str(len(contig_SEQ) - int(fields[3]) + 1) +
                                ',' + allele_print)
                        else:
                            f_flank.write(contig_SEQ + '\n')
                            print(
                                str(int(fields[3]) - 1) + '-' +
                                str(int(fields[3]) - 1 + len(fields[9])) +
                                ',' + allele_print)

                        start_pos = int(fields[3]) - 1 - flanking_size
                        end_pos = int(fields[3]) - 1 + len(
                            fields[9]) + flanking_size
                        #print(str(start_pos) + '-' + str(end_pos))
                        if start_pos < 0:
                            start_pos = 0
                        if end_pos > len(contig_SEQ):
                            end_pos = len(contig_SEQ)
                        f_flank_size.write('>' + allele_print + '\n')
                        f_flank_size.write(contig_SEQ[start_pos:end_pos] +
                                           '\n')
                    else:
                        eprint("Warning! Contig name does not exist! " +
                               contig_name)
    f_report.close()
    f_flank.close()
    f_flank_size.close()
Example #11
def simulate_dataset(n, size):
    """
    simulate sequences from different classes ( up to 4 : repetitive, repetitive with stem loops, random, only stem loop
    :param n: number of sequences to simulate
    :param size: the size of each sequence
    :return: a data frame containing a sequence, entropy and joint entropy, together with a type indicating the class
    """

    sequences = []
    cluster = []

    # repetitive sequences
    for i in tqdm(range(n)):
        cluster_name = 'Repetitive'
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.6, 0.2, 0.1, 0.1], size=size))
        sequences.append(seq)
        cluster.append(cluster_name)

    # create a data frame with all information
    df_rep = pd.DataFrame({'sequence':sequences, 'cluster':cluster})
    df_rep['entropy'] = df_rep['sequence'].apply(lambda x: entropy_by_kmer(x,5))
    df_rep['joint_entropy'] = df_rep['sequence'].apply(lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))

    # normalize both entropy and joint entropy to 0-1
    df_rep['entropy'] = df_rep['entropy'] /  df_rep['entropy'].max()
    df_rep['joint_entropy'] = df_rep['joint_entropy'] / df_rep['joint_entropy'].max()


    sequences = []
    cluster = []
    # repetitive sequences + structure - generate a perfect stem loop
    for i in tqdm(range(n)):
        cluster_name = 'Repetitive + Stem loop'
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.6, 0.2, 0.1, 0.1], size=size//2))
        seq = seq + str(get_reverse_complement(seq))
        sequences.append(seq)
        cluster.append(cluster_name)

    # create a data frame with all information
    df_rep_st = pd.DataFrame({'sequence':sequences, 'cluster':cluster})
    df_rep_st['entropy'] = df_rep_st['sequence'].apply(lambda x: entropy_by_kmer(x,5))
    df_rep_st['joint_entropy'] = df_rep_st['sequence'].apply(lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))

    # normalize both entropy and joint entropy to 0-1
    df_rep_st['entropy'] = df_rep_st['entropy'] /  df_rep_st['entropy'].max()
    df_rep_st['joint_entropy'] = df_rep_st['joint_entropy'] / df_rep_st['joint_entropy'].max()

    sequences = []
    cluster = []
    # only structure - generate a perfect stem loop
    for i in tqdm(range(n)):
        cluster_name = 'Stem loop'
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.25, 0.25, 0.25, 0.25], size=size//2))
        seq = seq + str(get_reverse_complement(seq))
        sequences.append(seq)
        cluster.append(cluster_name)

    # create a data frame with all information
    df_st = pd.DataFrame({'sequence':sequences, 'cluster':cluster})
    df_st['entropy'] = df_st['sequence'].apply(lambda x: entropy_by_kmer(x,5))
    df_st['joint_entropy'] = df_st['sequence'].apply(lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))

    # normalize both entropy and joint entropy to 0-1
    df_st['entropy'] = df_st['entropy'] /  df_st['entropy'].max()
    df_st['joint_entropy'] = df_st['joint_entropy'] / df_st['joint_entropy'].max()

    sequences = []
    cluster = []

    # random
    for i in tqdm(range(n)):
        cluster_name = 'Random'
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=[0.25, 0.25, 0.25, 0.25], size=size))
        sequences.append(seq)
        cluster.append(cluster_name)

    # create a data frame with all information
    df_rand = pd.DataFrame({'sequence': sequences, 'cluster': cluster})
    df_rand['entropy'] = df_rand['sequence'].apply(lambda x: entropy_by_kmer(x, 5))
    df_rand['joint_entropy'] = df_rand['sequence'].apply(lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))

    # normalize both entropy and joint entropy to 0-1
    df_rand['entropy'] = df_rand['entropy'] /  df_rand['entropy'].max()
    df_rand['joint_entropy'] = df_rand['joint_entropy'] / df_rand['joint_entropy'].max()


    # combine all inputs to one df, and return it

    result = pd.concat([df_rep, df_rep_st, df_st, df_rand])
    return result
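
A hypothetical call, assuming entropy_by_kmer() and joint_entropy() are defined in the same module:

df = simulate_dataset(n=100, size=1000)
print(df.groupby('cluster')[['entropy', 'joint_entropy']].mean())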
Example #12
def process_seqs_for_grep(list_seqs):
    list_rc = []
    for seq in list_seqs:
        list_rc.append(get_reverse_complement(seq))
    set_all = set(list_seqs).union(set(list_rc))
    return set_all
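
The returned set contains every input sequence plus its reverse complement, so a single grep pass matches a motif in either orientation. For example:

patterns = process_seqs_for_grep(['ACCT'])
# {'ACCT', 'AGGT'}  (the sequence and its reverse complement)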
Example #13
            if dict_contig_H1.get(contig_name):
                contig_SEQ = dict_contig_H1[contig_name]
            elif dict_contig_H2.get(contig_name):
                contig_SEQ = dict_contig_H2[contig_name]
            else:
                eprint("Fatal Error! contig name " + contig_name +
                       " not found!")
        else:
            contig_SEQ = dict_contig_H1[contig_name]
        left_flank = max(0, start_pos - len_extend - 1)
        right_flank = min(len(contig_SEQ), end_pos + len_extend - 1)
        flanking_SEQ = contig_SEQ[left_flank:right_flank]
        if len(annotation_info) > 4:  # with mismatch
            allele_name = allele_name + "/novel"
        if dict_flank_SEQ.get(allele_name):
            if (flanking_SEQ not in dict_flank_SEQ[allele_name] and
                    get_reverse_complement(flanking_SEQ) not in dict_flank_SEQ[allele_name]):
                dict_flank_SEQ[allele_name].add(flanking_SEQ)
        else:
            dict_flank_SEQ[allele_name] = {flanking_SEQ}

    f_of = open(fo_asm_flanking, 'w')
    for allele_name, set_allele_SEQ in sorted(dict_flank_SEQ.items()):
        for idx, allele_SEQ in enumerate(sorted(set_allele_SEQ)):
            f_of.write('>' + allele_name + '-' + str(idx) + '\n')
            f_of.write(allele_SEQ.lower() + '\n')
    f_of.close()
Example #14
                    #Find the same name in novel allele reference database
                    for ref_SEQ in list_novel_SEQ:
                        if ref_SEQ in SEQ:
                            novel_allele_name = dict_novel_serial[ref_SEQ]
                            novel_allele_name += '/f'
                else:
                    novel_allele_name = allele_name[:allele_name.rfind('-')]
                    novel_allele_name += '/f'
            else:
                print("WARNING! Incorrect naming in file", person_name)
            SEQ = SEQ.lower()
            if dict_database.get(novel_allele_name):
                dict_SEQ = dict_database[novel_allele_name]
                if dict_SEQ.get(SEQ):
                    dict_SEQ[SEQ].append(person_name)
                elif dict_SEQ.get(get_reverse_complement(SEQ)):
                    dict_SEQ[get_reverse_complement(SEQ)].append(person_name)
                else:  # add the SEQ into dict_SEQ
                    dict_SEQ[SEQ] = [person_name]
            else:
                dict_database[novel_allele_name] = {SEQ: [person_name]}

    f_of = open(fo_merged_fasta, 'w')
    f_or = open(fo_merged_report, 'w')
    f_or.write(
        'allele_name\tnumber_of_found_in_database\tsamples_possessing_the_allele\n'
    )
    for allele_name, dict_SEQ in sorted(dict_database.items()):
        for idx, (SEQ, list_person) in enumerate(
                sorted(dict_SEQ.items(),
                       key=lambda pair: len(pair[1]),
Example #15
def mark_edit_region(fn_sam, fn_output_file, contig_file):
    edit_histogram = None
    cov_histogram = None
    #list_read_info: [ (start_pos, end_pos, read_name, even_odd_flag, mis_region) ]
    list_read_info = []
    contig_len = 0
    contig_name = ""
    # dict_reads{}
    #  - key: (read_name, pair_number)
    #  - values: read_SEQ
    dict_reads = {}
    even_odd_flag = 1
    with open(fn_sam, 'r') as f_s:
        for line in f_s:
            if line[0] == '@':  # header: information about the contig
                if line.find('LN:') != -1:
                    # SPAdes sometimes produces more than one contig, but the short
                    # ones are not very useful, so we discard them and the reads
                    # aligned to them
                    if contig_len == 0:
                        contig_len = int(
                            line[line.find('LN:') +
                                 3:-1]) + 1  # positions are 1-based
                        contig_name = line.split(':')[1][:-3]
                        edit_histogram = np.zeros(contig_len)
                        cov_histogram = np.zeros(contig_len)
            else:  # real alignment information
                fields = line.split()
                read_name = fields[0]
                read_SEQ = fields[9]
                # if the read aligns to one of the shorter contigs, skip it
                if contig_name != fields[2]:
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                    list_read_info.append(
                        (0, 0, read_name, even_odd_flag, [], "", read_SEQ))
                    if even_odd_flag == 1:
                        even_odd_flag = 2
                    else:
                        even_odd_flag = 1
                    continue
                cigar = fields[5]
                sam_flag = int(fields[1])
                # skip supplementary alignments (bit 0x800 of the flag); see the
                # "Supplementary Alignment" section of the BWA manual
                if sam_flag & 2048:
                    continue
                # cigar == '*' means the read is unmapped; record it and skip
                if cigar == '*':
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                    #list_read_info.append((start_pos, end_pos, read_name, even_odd_flag, mis_region))
                    list_read_info.append(
                        (0, 0, read_name, even_odd_flag, [], "", read_SEQ))
                    if even_odd_flag == 1:
                        even_odd_flag = 2
                    else:
                        even_odd_flag = 1
                    continue

                edit_dist = int(fields[11].split(':')[2])
                MD_tag = fields[12].split(':')[2]
                start_pos = int(fields[3])

                number, operate = parse_CIGAR(cigar)
                mis_region_MD = parse_MD(MD_tag)
                #if operate[0] == 'S':
                #    mis_region_MD = [ele + number[0] + start_pos - 1 for ele in mis_region_MD]
                #else:
                mis_region_MD = [ele + start_pos - 1 for ele in mis_region_MD]

                mis_region_I = []  # insertion boundary region
                diff_len = 0  # len contribution of D and I
                if 'I' in operate or 'D' in operate:
                    idx_I = start_pos - 1  # index in reference
                    for idx, op in enumerate(operate):
                        if op == 'I':
                            diff_len -= number[idx]
                            mis_region_I.append(idx_I)
                            mis_region_I.append(idx_I + 1)
                        else:
                            if op == 'S':
                                diff_len -= number[idx]
                            else:
                                idx_I += number[idx]
                                if op == 'D':
                                    diff_len += number[idx]

                #print(fields[0])
                #print(mis_region_MD)
                #print(mis_region_I)
                #print(mis_region)
                mis_region = mis_region_MD + mis_region_I
                mis_region.sort()

                edit_histogram[mis_region] += 1

                end_pos = start_pos + len(fields[9]) + diff_len
                cov_histogram[start_pos:end_pos] += 1

                # record the reads information
                if sam_flag & 16:  # read is stored reverse-complemented
                    dict_reads[(read_name,
                                even_odd_flag)] = get_reverse_complement(
                                    read_SEQ.upper())
                else:
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                list_read_info.append(
                    (start_pos, end_pos, read_name, even_odd_flag, mis_region,
                     cigar, read_SEQ))
                if even_odd_flag == 1:
                    even_odd_flag = 2
                else:
                    even_odd_flag = 1

    contig_SEQ = ""
    with open(contig_file, 'r') as f_c:
        contig_flag = False
        for line in f_c:
            if line[0] == '>':
                tmp_name = line[1:].strip()
                if tmp_name == contig_name:
                    contig_flag = True
                else:
                    contig_flag = False
            elif contig_flag:
                contig_SEQ += line.strip()

    return edit_histogram, cov_histogram, list_read_info, dict_reads, contig_SEQ
Example #16
def coverage_analysis(
    dict_read_allele_clusters,
    fn_annotation,
    required_min_depth=0,
    required_single_coverage=50,
    required_single_identity=1,
):

    dict_hc_calls = {}
    dict_sup_reads = {}

    # tmp: for dev
    list_annotated = []
    #f_tmp = open('./NA12878_annotated_all.txt', 'r')
    f_tmp = open(fn_annotation, 'r')
    for line in f_tmp:
        list_annotated.append(line.rstrip())

    # tmp
    list_answer = []

    # for each cluster
    #for cluster_id in dict_read_allele_clusters.keys():
    # dev shortcut: sample every 50th cluster starting from 55
    for cluster_id in range(55, len(dict_read_allele_clusters.keys()), 50):
        print("Cluster: " + str(cluster_id))
        eprint("============= Cluster: " + str(cluster_id) + " ==============")
        cluster = dict_read_allele_clusters[str(cluster_id)]
        dict_allele = cluster[0]
        dict_read = cluster[1]

        # for each allele in a cluster
        for allele in dict_allele.keys():
            print(allele)
            seq_allele = dict_allele[allele]
            seq_coverage = np.zeros(len(seq_allele))
            dict_sup_reads[allele] = set()

            # tmp
            if allele in list_annotated:
                list_answer.append(allele)

            # TODO: simplify this loop
            for read in dict_read:
                seq_read = dict_read[read]
                # ignore the reads that are too short
                if len(seq_read) < required_single_coverage:
                    continue

                traverse_result = hamming_traverse(seq_allele, seq_read,
                                                   required_single_coverage,
                                                   required_single_identity)
                if traverse_result[0]:
                    seq_coverage[traverse_result[1]:traverse_result[2]] += 1
                    dict_sup_reads[allele].add(read)
                else:
                    r_seq_read = get_reverse_complement(seq_read)
                    traverse_result = hamming_traverse(
                        seq_allele, r_seq_read, required_single_coverage,
                        required_single_identity)
                    if traverse_result[0]:
                        seq_coverage[
                            traverse_result[1]:traverse_result[2]] += 1
                        dict_sup_reads[allele].add(read)

            if min(seq_coverage) > required_min_depth:
                if dict_hc_calls.get(allele):
                    print("Warning! evaluate two times")
                    dict_hc_calls[allele] += 1
                else:
                    dict_hc_calls[allele] = min(seq_coverage)
                print("OOO: " + str(min(seq_coverage)) + ' ' +
                      str(sum(seq_coverage) / len(seq_coverage)) + ' ' +
                      str(max(seq_coverage)))
            else:
                print("XXX: " + str(min(seq_coverage)) + ' ' +
                      str(sum(seq_coverage) / len(seq_coverage)) + ' ' +
                      str(max(seq_coverage)))
            print(seq_coverage)

    print(dict_hc_calls)
    print(list_answer)

    # tmp
    print('Num. high-confidence calls')
    print(len(set(dict_hc_calls)))
    print('Num. answer')
    print(len(set(list_answer)))
    print('Num. intersection')
    print(len(set(list_answer).intersection(set(dict_hc_calls))))

    #print ("Support reads of alleles:")
    #print (dict_sup_reads)

    return dict_sup_reads