Esempio n. 1
0
def mutate_member(exons, config, params, mut=True):
    """Build a (possibly mutated) copy of ``exons`` and log mutation statistics.

    Args:
        exons: Mapping from exon number to sequence; it is iterated with
            ``misc_functions.iteritems``.
        config: Mapping providing the ``"mut"``, ``"ins"`` and ``"del"``
            values that are forwarded to ``misc_functions.mutate_sequence``.
        params: Object exposing a writable ``logfile``.
        mut: When true, every exon is passed through
            ``misc_functions.mutate_sequence``; when false the exons are
            copied unchanged and logged with zero mutations.

    Returns:
        A dict mapping exon number to the resulting sequence, or ``False``
        when ``mut`` is true but no exon received a mutation (in that case
        nothing is written to the log file).
    """
    leaf_node = {}
    total_mutations = 0
    total_length = 0
    log_lines = []
    for ex_nr, seq in misc_functions.iteritems(exons):
        if mut:
            # The indel count and deleted length are returned by the helper
            # but are not needed for the per-exon log.
            (new_seq, mutation_log, exon_mutations, _exon_indels,
             _total_del_length) = misc_functions.mutate_sequence(
                 seq, config["mut"], config["ins"], config["del"])
        else:
            new_seq, mutation_log, exon_mutations = seq, "-", 0
        leaf_node[ex_nr] = new_seq
        log_lines.append(
            "mutations in exon: {0}, mutation places: {1}\n".format(
                exon_mutations, mutation_log))
        total_mutations += exon_mutations
        total_length += len(new_seq)

    if mut and total_mutations <= 0:
        print("NO mutations!")
        return False

    # The rate is undefined when there are no bases at all (empty input, or
    # every base deleted); report NaN instead of raising ZeroDivisionError.
    if total_length:
        mutation_rate = total_mutations / float(total_length)
    else:
        mutation_rate = float("nan")

    params.logfile.write("".join(log_lines))
    params.logfile.write("Total mutations: {0}\n".format(total_mutations))
    params.logfile.write(
        "mutation rate all exons: {0}\n".format(mutation_rate))
    return leaf_node
Esempio n. 2
0
def main(params):
    """Simulate PacBio reads from a set of transcripts and write them as FASTA.

    Transcripts are read from the FASTA file ``params.sequence_material``.
    Reads are generated until ``params.read_count`` of them exist; each read
    is a copy of a randomly chosen transcript passed through
    ``misc_functions.mutate_sequence`` with error rates derived from the
    number of passes. Per-transcript read counts and summary statistics of
    the per-read error totals are written to ``params.logfile``, and the
    reads themselves to ``params.outfile``.

    Args:
        params: Object providing ``sequence_material`` (input FASTA path),
            ``read_count`` (number of reads to generate), ``logfile``
            (writable file-like object) and ``outfile`` (output FASTA path).
    """
    with open(params.sequence_material, "r") as fasta_file:
        sequence_transcripts = dict(misc_functions.read_fasta(fasta_file))

    # Read lengths ~ according to the P6-C4 chemistry histogram from
    # http://www.slideshare.net/GenomeInABottle/jan2016-pac-bio-giab  slide 13
    # This can be well approximated by a triangular distribution with
    # parameters 0 (base start), 10000 (peak), ~45000 (base end):
    # http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.triangular.html
    # PacBio's own distribution is here:
    # http://www.pacb.com/blog/new-chemistry-boosts-average-read/
    #
    # Get average quality based on subread length and the length of the
    # transcript. While the read count is less than the read count we want:
    #   1. Draw a read length from the simulated read lengths.
    #   2. Randomly select a transcript from the pool.
    #   3. Get average quality based on the number of
    #      passes = floor(read_length / transcript_length).
    #      Avg quality is derived from this plot:
    #      https://speakerdeck.com/pacbio/specifics-of-smrt-sequencing-data
    #      slide 21, the P4 C2 chemistry. In case of one pass we chose a
    #      13 percent error rate from here:
    #      http://www.sciencedirect.com/science/article/pii/S1672022915001345
    #      For the error types we follow the data here:
    #      http://bib.oxfordjournals.org/content/17/1/154.full.pdf
    #      and here http://www.homolog.us/Tutorials/index.php?p=2.8&s=2
    #      which indicate that a PacBio genomic read has roughly a 13-15
    #      percent error rate (older chemistry); we choose 13. Out of the
    #      total errors, we let 11/16 = 68.75% be insertions, 4/16 = 25% be
    #      deletions and 1/16 = 6.25% be substitutions.
    #   4. Generate the read.
    quality_function = {
        1: 0.87,
        2: 0.95,
        3: 0.957,
        4: 0.969,
        5: 0.981,
        6: 0.985,
        7: 0.99,
        8: 0.992,
        9: 0.994,
        10: 0.995,
        11: 0.995,
        12: 0.995,
        13: 0.996,
        14: 0.996,
        15: 0.996,
        16: 0.999,
        17: 0.999,
        18: 0.999,
    }  # passes : quality

    # Draw lengths in batches; 5x the wanted read count should usually be
    # enough, and the batch is refilled below if it runs out.
    lengths = np.random.triangular(0, 10000, 45000, 5 * params.read_count)
    it = 0
    # The transcript pool does not change while sampling, so build the list
    # of accessions once instead of on every iteration.
    accessions = list(sequence_transcripts.keys())
    pacbio_reads = {}
    reads_generated_log = defaultdict(int)
    errors = []
    read_count = 1
    while read_count <= params.read_count:
        if it >= len(lengths):
            lengths = np.random.triangular(0, 10000, 45000,
                                           5 * params.read_count)
            it = 0

        read_len = lengths[it]
        it += 1
        acc = random.choice(accessions)
        transcript = sequence_transcripts[acc]
        passes = int(read_len / len(transcript))
        if passes <= 0:
            # Drawn length is shorter than the transcript: no full pass, so
            # no read is produced for this draw.
            continue

        # Anything beyond the table is treated like the best tabulated value.
        quality = quality_function.get(passes, 0.999)

        subs_rate = (1.0 - quality) * 0.0625
        ins_rate = (1.0 - quality) * 0.6875
        del_rate = (1.0 - quality) * 0.25
        (read, _error_log, total_error_length, _total_indel_length,
         total_del_length) = misc_functions.mutate_sequence(
             transcript, subs_rate, ins_rate, del_rate)
        read_acc = "{0}_read_{1}_error_rate_{2}_total_errors_{3}".format(
            acc, str(read_count),
            total_error_length / float(len(read) + total_del_length),
            total_error_length)
        # Count under the part of the accession before any ":copy" suffix.
        reads_generated_log[acc.split(":copy")[0]] += 1
        errors.append(total_error_length)

        pacbio_reads[read_acc] = read
        read_count += 1

    for acc, abundance in misc_functions.iteritems(reads_generated_log):
        params.logfile.write("{0}\t{1}\n".format(acc, abundance))

    # Summary statistics are only defined when at least one read exists.
    if errors:
        n = float(len(errors))
        mu = sum(errors) / n
        if len(errors) > 1:
            # Sample standard deviation (n - 1 in the denominator).
            sigma = (sum((x - mu) ** 2 for x in errors) / (n - 1)) ** 0.5
        else:
            # Undefined for a single observation.
            sigma = float("nan")
        ordered = sorted(errors)
        mid = len(ordered) // 2
        if len(ordered) % 2 == 0:
            median_error = (ordered[mid - 1] + ordered[mid]) / 2.0
        else:
            median_error = ordered[mid]

        params.logfile.write(
            "mean error: {0}, sd error:{1}, min_error:{2}, max_error:{3}, median_error:{4}\n"
            .format(mu, sigma, ordered[0], ordered[-1], median_error))

    # Context manager guarantees the reads are flushed and the file closed.
    with open(params.outfile, "w") as out_file:
        for acc, seq in misc_functions.iteritems(pacbio_reads):
            out_file.write(">{0}\n{1}\n".format(acc, seq))