Example no. 1
def likelihood_subsample(taxon, treatment, ntot_subsample=50, fmax_cutoff=0.8, fmin_cutoff=0.0, subsamples=10000):
    # ntot_subsample: minimum number of mutations drawn in each subsample

    # Load convergence matrix
    convergence_matrix = parse_file.parse_convergence_matrix(pt.get_path() + '/data/timecourse_final/' +("%s_convergence_matrix.txt" % (treatment+taxon)))

    populations = [treatment+taxon + replicate for replicate in pt.replicates ]

    gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(convergence_matrix,populations, fmax_min=fmax_cutoff)

    G_subsample_list = []
    for i in range(subsamples):

        G_subsample = mutation_spectrum_utils.calculate_subsampled_total_parallelism(gene_parallelism_statistics, ntot_subsample=ntot_subsample)

        G_subsample_list.append(G_subsample)

    G_subsample_list.sort()

    G_CIs_dict = {}

    G_subsample_mean = np.mean(G_subsample_list)
    # 2.5th/97.5th percentiles of the sorted replicates give the 95% CI bounds
    G_subsample_025 = G_subsample_list[int(0.025 * subsamples)]
    G_subsample_975 = G_subsample_list[int(0.975 * subsamples)]

    G_CIs_dict['G_mean'] = G_subsample_mean
    G_CIs_dict['G_025'] = G_subsample_025
    G_CIs_dict['G_975'] = G_subsample_975

    return G_CIs_dict
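# A minimal, self-contained sketch of the percentile-CI pattern used above,
# with np.mean standing in for the project's G statistic (the resampling
# scheme and toy usage below are illustrative assumptions):
import numpy as np

def percentile_bootstrap_ci(sample, subsamples=10000):
    rng = np.random.default_rng(123456789)
    # resample with replacement, recompute the statistic, sort the replicates
    stat_list = sorted(
        float(np.mean(rng.choice(sample, size=len(sample), replace=True)))
        for _ in range(subsamples))
    # the 2.5th/97.5th percentiles of the sorted replicates give a 95% CI
    return {'mean': np.mean(stat_list),
            '025': stat_list[int(0.025 * subsamples)],
            '975': stat_list[int(0.975 * subsamples)]}

# e.g. percentile_bootstrap_ci(np.random.exponential(size=50))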
def parse_reference_genome(taxon):
    filename= pt.get_path() + '/' + pt.get_ref_gbff_dict(taxon)

    reference_sequences = []

    # GenBank flat file: sequence lines follow the ORIGIN record
    if filename.endswith('gbk'):
        with open(filename, "r") as file:
            origin_reached = False
            for line in file:
                if line.startswith("ORIGIN"):
                    origin_reached = True
                if origin_reached:
                    items = line.split()
                    # guard against blank lines and the terminal '//'
                    if len(items) > 0 and items[0].isdigit():
                        reference_sequences.extend(items[1:])

    # FASTA file: skip the header, concatenate the sequence lines
    else:
        with open(filename, "r") as file:
            file.readline() # header
            for line in file:
                reference_sequences.append(line.strip())

    reference_sequence = "".join(reference_sequences).upper()
    return reference_sequence
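# For comparison only: a hedged sketch of the same parse using Biopython's
# SeqIO (the library is already used by parse_gene_list below); pass "fasta"
# for the FASTA branch. Not presented as a drop-in replacement:
from Bio import SeqIO

def parse_reference_genome_seqio(filename, fmt="genbank"):
    # concatenate the sequences of all records in the file, uppercased
    return "".join(str(rec.seq) for rec in SeqIO.parse(filename, fmt)).upper()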
def parse_simulation_output():

    saved_data_file = '%s/data/simulations/test.dat' % (pt.get_path())
    sampled_timepoints = pickle.load(open(saved_data_file, "rb"))

    allele_freq_trajectory_dict = {}

    for key, value in sampled_timepoints.items():

        # total active (N) and dormant (M) clone counts at this timepoint
        N = sum([value[x]['n_clone_active'] for x in value.keys()])
        M = sum([value[x]['n_clone_dormant'] for x in value.keys()])

        print(N, M)
Example no. 4
def calculate_likelihood_ratio_fmax(taxon,
                                    treatment,
                                    ntot_subsample=50,
                                    fmax_partition=0.8,
                                    subsamples=10000):

    convergence_matrix = parse_file.parse_convergence_matrix(
        pt.get_path() + '/data/timecourse_final/' +
        ("%s_convergence_matrix.txt" % (treatment + taxon)))

    populations = [
        treatment + taxon + replicate for replicate in pt.replicates
    ]

    gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
        convergence_matrix, populations, fmax_min=fmax_partition)

    G_subsample_list = []
def parse_well_mixed_state_timecourse(population):

    haplotype_filename = pt.get_path() + '/data/timecourse_final/' +('%s_well_mixed_state_timecourse.txt' % population)

    file = open(haplotype_filename,"r")

    # the first five rows are population-level summary trajectories; all must
    # be read in order, though only `times` is used below
    times = numpy.array([float(item) for item in file.readline().split(",")])
    num_unborn = numpy.array([float(item) for item in file.readline().split(",")])
    num_extinct = numpy.array([float(item) for item in file.readline().split(",")])
    num_fixed = numpy.array([float(item) for item in file.readline().split(",")])
    num_polymorphic = numpy.array([float(item) for item in file.readline().split(",")])

    states = []
    for line in file:
        Ls = numpy.array([float(item) for item in line.split(",")])
        states.append(Ls)
    file.close()
    return times, states
def parse_annotated_timecourse(population, only_passed=True, min_coverage=5):

    mutations = []

    timecourse_filename =  pt.get_path() + '/data/timecourse_final/' +("%s_annotated_timecourse.txt" % population)

    file = open(timecourse_filename, "r")

    header_line = file.readline()
    items = header_line.strip().split(",")

    times = []
    # timepoint columns start at index 16, alternating alt/depth pairs;
    # each header entry stores the time as "<label>:<time>"
    for i in range(16,len(items),2):
        times.append(int(items[i].split(":")[1]))
    times = numpy.array(times)

    # depth line
    depth_line = file.readline()
    items = depth_line.strip().split(",")
    avg_depths = []
    for i in range(16,len(items),2):
        avg_depths.append(float(items[i+1]))
    avg_depths = numpy.array(avg_depths)
    # clone samples are flagged with a +1000000 offset on the timepoint
    population_avg_depth_times = times[times<1000000]
    population_avg_depths = avg_depths[times<1000000]
    clone_avg_depth_times = times[times>1000000]-1000000
    clone_avg_depths = avg_depths[times>1000000]

    for line in file:
        items = line.strip().split(",")
        location = int(items[0])
        gene_name = items[1].strip()
        allele = items[2].strip()
        var_type = items[3].strip()

        codon = items[4].strip()
        position_in_codon = items[5].strip()
        if (position_in_codon != 'None') and (position_in_codon != 'unknown'):
            position_in_codon = int(position_in_codon)
        fold_count = items[6].strip()
        if (fold_count != 'None') and (fold_count != 'unknown'):
            fold_count = int(fold_count)

        test_statistic = float(items[7])
        pvalue = float(items[8])
        cutoff_idx = int(items[9])
        depth_fold_change = float(items[10])
        depth_change_pvalue = float(items[11])

        duplication_idx = int(items[12])
        fold_increase = float(items[13])
        duplication_pvalue = float(items[14])

        passed_str = items[15]
        if passed_str.strip()=='PASS':
            passed = True
        else:
            passed = False

        alts = []
        depths = []

        for i in range(16,len(items),2):
            alts.append(int(float(items[i])))
            depths.append(int(float(items[i+1])))

        alts = numpy.array(alts)
        depths = numpy.array(depths)

        # zero out timepoints with individual coverage lower than some threshold
        alts *= (depths>=min_coverage)*(avg_depths>=min_coverage)
        depths *= (depths>=min_coverage)*(avg_depths>=min_coverage)

        pop_times = times[(times<1000000)]
        pop_alts = alts[(times<1000000)]
        pop_depths = depths[(times<1000000)]


        clone_times = times[(times>1000000)]-1000000
        clone_alts = alts[(times>1000000)]
        clone_depths = depths[(times>1000000)]

        if passed or (not only_passed):
            mutations.append((location, gene_name, allele, var_type, codon, position_in_codon, fold_count, test_statistic, pvalue, cutoff_idx, depth_fold_change, depth_change_pvalue, pop_times, pop_alts, pop_depths, clone_times, clone_alts, clone_depths))

    file.close()
    # sort mutations by genomic position
    keys = [mutation[0] for mutation in mutations]
    keys, mutations = (list(t) for t in zip(*sorted(zip(keys, mutations), key=lambda pair: pair[0])))
    return mutations, (population_avg_depth_times, population_avg_depths, clone_avg_depth_times, clone_avg_depths)
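# The paired-sort idiom used just above, in isolation (toy values): sort one
# list and carry a parallel list along in the same order.
_keys = [30, 10, 20]
_values = ['c', 'a', 'b']
_keys, _values = (list(t) for t in zip(*sorted(zip(_keys, _values),
                                               key=lambda pair: pair[0])))
# _keys == [10, 20, 30]; _values == ['a', 'b', 'c']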
def parse_gene_list(taxon, reference_sequence=None):
    gene_names = []
    start_positions = []
    end_positions = []
    promoter_start_positions = []
    promoter_end_positions = []
    gene_sequences = []
    strands = []
    genes = []
    features = []
    protein_ids = []

    filename= pt.get_path() + '/' + pt.get_ref_gbff_dict(taxon)
    gene_features = ['CDS', 'tRNA', 'rRNA', 'ncRNA', 'tmRNA']
    recs = list(SeqIO.parse(filename, "genbank"))
    count_riboswitch = 0
    for rec in recs:
        reference_sequence = rec.seq
        contig = rec.annotations['accessions'][0]
        for feat in rec.features:
            if 'pseudo' in list((feat.qualifiers.keys())):
                continue
            if (feat.type == "source") or (feat.type == "gene"):
                continue

            locations = re.findall(r"[\w']+", str(feat.location))
            if feat.type in gene_features:
                locus_tag = feat.qualifiers['locus_tag'][0]
            elif (feat.type=="regulatory"):
                locus_tag = feat.qualifiers["regulatory_class"][0] + '_' + str(count_riboswitch)
                count_riboswitch += 1
            else:
                continue
            # for frameshifts, split each CDS separately and merge later
            # Fix this for Deinococcus, it has a frameshift in three pieces
            split_list = []
            if 'join' in locations:
                location_str = str(feat.location)
                minus_position = []
                if '-' in location_str:
                    minus_position = [r.start() for r in re.finditer('-', location_str)]
                pos_position = []

                if '+' in location_str:
                    # '+' is a regex metacharacter, so scan by character
                    # rather than re.finditer('+'), which raises re.error
                    pos_position = [pos for pos, char in enumerate(location_str) if char == '+']


                if len(minus_position) + len(pos_position) == 2:
                    if len(minus_position) == 2:
                        strand_symbol_one = '-'
                        strand_symbol_two = '-'
                    elif len(pos_position) == 2:
                        strand_symbol_one = '+'
                        strand_symbol_two = '+'
                    else:
                        # I don't think this is possible, but might as well code it up
                        if minus_position[0] < pos_position[0]:
                            strand_symbol_one = '-'
                            strand_symbol_two = '+'
                        else:
                            strand_symbol_one = '+'
                            strand_symbol_two = '-'

                    start_one = int(locations[1])
                    stop_one = int(locations[2])
                    start_two = int(locations[3])
                    stop_two = int(locations[4])

                    locus_tag1 = locus_tag + '_1'
                    locus_tag2 = locus_tag + '_2'

                    split_list.append([locus_tag1, start_one, stop_one, strand_symbol_one])
                    split_list.append([locus_tag2, start_two, stop_two, strand_symbol_two])

                else:
                    if len(pos_position) == 3:
                        strand_symbol_one = '+'
                        strand_symbol_two = '+'
                        strand_symbol_three = '+'
                    else:
                        # assume a three-piece join on the reverse strand;
                        # mixed-strand joins are not handled
                        strand_symbol_one = '-'
                        strand_symbol_two = '-'
                        strand_symbol_three = '-'
                    start_one = int(locations[1])
                    stop_one = int(locations[2])
                    start_two = int(locations[3])
                    stop_two = int(locations[4])
                    start_three = int(locations[5])
                    stop_three = int(locations[6])

                    locus_tag1 = locus_tag + '_1'
                    locus_tag2 = locus_tag + '_2'
                    locus_tag3 = locus_tag + '_3'

                    split_list.append([locus_tag1, start_one, stop_one, strand_symbol_one])
                    split_list.append([locus_tag2, start_two, stop_two, strand_symbol_two])
                    split_list.append([locus_tag3, start_three, stop_three, strand_symbol_three])


            else:
                strand_symbol = str(feat.location)[-2]
                start = int(locations[0])
                stop = int(locations[1])
                split_list.append([locus_tag, start, stop, strand_symbol])

            for split_item in split_list:
                locus_tag = split_item[0]
                start = split_item[1]
                stop = split_item[2]
                strand_symbol = split_item[3]


                if feat.type == 'CDS':
                    #  why was a -1 there originally?
                    #gene_sequence = reference_sequence[start-1:stop]
                    gene_sequence = str(reference_sequence[start:stop])
                else:
                    gene_sequence = ""


                if 'gene' in list((feat.qualifiers.keys())):
                    gene = feat.qualifiers['gene'][0]
                else:
                    gene = ""

                if 'protein_id' in list((feat.qualifiers.keys())):
                    protein_id = feat.qualifiers['protein_id'][0]
                else:
                    protein_id = ""


                if strand_symbol == '+':
                    promoter_start = start - 100 # by arbitrary definition, we treat the 100bp upstream as promoters
                    promoter_end = start - 1
                    strand = 'forward'
                else:
                    promoter_start = stop+1
                    promoter_end = stop+100
                    strand = 'reverse'


                if gene_sequence != "" and len(gene_sequence) % 3 != 0:
                    print(locus_tag, start, "Not a multiple of 3")
                    continue

                # don't need to check if gene names are unique because we're
                # using locus tags

                start_positions.append(start)
                end_positions.append(stop)
                promoter_start_positions.append(promoter_start)
                promoter_end_positions.append(promoter_end)
                gene_names.append(locus_tag)
                gene_sequences.append(gene_sequence)
                strands.append(strand)
                genes.append(gene)
                features.append(feat.type)
                protein_ids.append(protein_id)

    # sort every per-gene list by gene start position
    gene_names, start_positions, end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids = (
        list(x) for x in zip(*sorted(
            zip(gene_names, start_positions, end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids),
            key=lambda pair: pair[1])))

    return gene_names, numpy.array(start_positions), numpy.array(end_positions), numpy.array(promoter_start_positions), numpy.array(promoter_end_positions), gene_sequences, strands, genes, features, protein_ids
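# The promoter convention above, in isolation: a fixed 100 bp window
# immediately upstream of the annotated gene, strand-aware (toy coordinates;
# the helper name is illustrative):
def promoter_window(start, stop, strand_symbol):
    # '+' genes: window precedes `start`; '-' genes: window follows `stop`
    if strand_symbol == '+':
        return start - 100, start - 1
    return stop + 1, stop + 100

# promoter_window(500, 800, '+') == (400, 499)
# promoter_window(500, 800, '-') == (801, 900)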
maple_annotation_dict = {}

kegg_maple_map_all_taxa = {}

treatment_count_dict = {}

for treatment in treatments:

    treatment_count_dict[treatment] = 0

    for taxon in taxa:

        protein_id_kegg_dict = {}

        protein_id_kegg = open(
            pt.get_path() +
            '/data/reference_assemblies_task2/MAPLE/%s_MAPLE_result/query.fst.ko'
            % taxon, 'r')
        # make protein ID => KEGG map, skipping unannotated ('K_NA') entries
        for line in protein_id_kegg:
            line = line.strip()
            items = line.split("\t")
            protein_id = items[0]
            if items[1] != 'K_NA':
                protein_id_kegg_dict[protein_id] = items[1]
        protein_id_kegg.close()
        significant_genes_path = pt.get_path(
        ) + '/data/timecourse_final/parallel_genes_%s.txt' % (treatment +
                                                              taxon)
        if not os.path.exists(significant_genes_path):
            continue
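# The mapping step above, in isolation: each line of query.fst.ko carries at
# least two tab-separated fields, protein ID then KO, and 'K_NA' marks
# proteins without a KEGG annotation (toy lines are illustrative):
toy_ko_lines = ["prot_1\tK00001", "prot_2\tK_NA", "prot_3\tK00845"]
toy_ko_dict = {}
for toy_line in toy_ko_lines:
    toy_protein_id, toy_ko = toy_line.strip().split("\t")
    if toy_ko != 'K_NA':
        toy_ko_dict[toy_protein_id] = toy_ko
# toy_ko_dict == {'prot_1': 'K00001', 'prot_3': 'K00845'}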
Example no. 9
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import cm

import parse_file
import timecourse_utils
import mutation_spectrum_utils
import phylo_tools as pt  # module name assumed from the `pt` usage below

np.random.seed(123456789)

treatments = pt.treatments
replicates = pt.replicates

color_range = np.linspace(0.0, 1.0, 10)
rgb_blue = cm.get_cmap('Blues')(color_range)
rgb_red = cm.get_cmap('Reds')(color_range)

path_IN = pt.get_path() + '/data/spore_assay/Sporulation_170912_long.txt'
IN = pd.read_csv(path_IN, sep='\t')
IN = IN.loc[IN['Time_hours'] <= 400]
# Day 100
IN_0B1_100 = IN.loc[(IN['Pop'] == '0B1') & (IN['Day'] == 100)]
IN_2B1_100 = IN.loc[(IN['Pop'] == '2B1') & (IN['Day'] == 100)]
IN_mean_0B1_100 = IN_0B1_100['Vegetative_percent'].groupby(
    IN_0B1_100['Time_hours']).mean().reset_index()
IN_mean_2B1_100 = IN_2B1_100['Vegetative_percent'].groupby(
    IN_2B1_100['Time_hours']).mean().reset_index()
IN_std_0B1_100 = IN_0B1_100['Vegetative_percent'].groupby(
    IN_0B1_100['Time_hours']).std().reset_index()
IN_std_2B1_100 = IN_2B1_100['Vegetative_percent'].groupby(
    IN_2B1_100['Time_hours']).std().reset_index()
# Day 500
IN_0B1_500 = IN.loc[(IN['Pop'] == '0B1') & (IN['Day'] == 500)]
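# The groupby pattern above, in isolation (toy frame; column names match the
# spore-assay table): mean vegetative percentage per timepoint across wells.
toy_spore = pd.DataFrame({'Time_hours': [0, 0, 24, 24],
                          'Vegetative_percent': [90.0, 92.0, 40.0, 44.0]})
toy_mean = toy_spore['Vegetative_percent'].groupby(
    toy_spore['Time_hours']).mean().reset_index()
# toy_mean has one row per Time_hours value: 91.0 at 0 h, 42.0 at 24 h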
Example no. 10
def plot_mutation_trajectory_taxon(taxon):

    if taxon == 'J':
        treatments = ['0', '2']
        sub_plot_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        sub_plot_count_step = 2
        dim = (6, 15)
    else:
        treatments = pt.treatments
        sub_plot_labels = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o'
        ]
        sub_plot_count_step = 3
        dim = (10, 15)

    sys.stderr.write("Loading mutation data...\n")

    mutation_trajectories = {}
    fixed_mutation_trajectories = {}
    delta_mutation_trajectories = {}
    #transit_times = {}
    median_trajectories = {}
    n_muts_trajectories = {}

    for treatment in treatments:
        for replicate in pt.replicates:

            population = treatment + taxon + replicate
            if population in pt.populations_to_ignore:
                continue

            sys.stderr.write("Processing %s...\t" % population)

            times, Ms, fixed_Ms = parse_file.get_mutation_fixation_trajectories(
                population)

            times_, medians_log10, num_muts = parse_file.get_mutation_fixation_trajectories_median_freq_and_mut_number(
                population)

            if isinstance(fixed_Ms, float):
                fixed_Ms = np.asarray([0] * len(times))

            fixed_mutation_trajectories[population] = (times, fixed_Ms)
            mutation_trajectories[population] = (times, np.log10(Ms))
            delta_mutation_trajectories[population] = (times[1:],
                                                       np.log10(Ms[1:] /
                                                                Ms[:-1]))

            median_trajectories[population] = (times_, medians_log10)
            n_muts_trajectories[population] = (times_, num_muts)

            sys.stderr.write("analyzed %d mutations!\n" % len(Ms))

    fig = plt.figure(figsize=dim)

    column_count = 0

    for treatment in treatments:

        ax_t_vs_M = plt.subplot2grid((5, len(treatments)), (0, column_count),
                                     colspan=1)
        ax_t_vs_delta_M = plt.subplot2grid((5, len(treatments)),
                                           (1, column_count),
                                           colspan=1)
        ax_t_vs_F = plt.subplot2grid((5, len(treatments)), (2, column_count),
                                     colspan=1)

        ax_t_vs_median_freq = plt.subplot2grid((5, len(treatments)),
                                               (3, column_count),
                                               colspan=1)
        ax_t_vs_number_muts = plt.subplot2grid((5, len(treatments)),
                                               (4, column_count),
                                               colspan=1)

        ax_t_vs_M.text(-0.1,
                       1.07,
                       sub_plot_labels[column_count],
                       fontsize=14,
                       fontweight='bold',
                       ha='center',
                       va='center',
                       transform=ax_t_vs_M.transAxes)
        ax_t_vs_delta_M.text(-0.1,
                             1.07,
                             sub_plot_labels[column_count +
                                             sub_plot_count_step],
                             fontsize=14,
                             fontweight='bold',
                             ha='center',
                             va='center',
                             transform=ax_t_vs_delta_M.transAxes)
        ax_t_vs_F.text(-0.1,
                       1.07,
                       sub_plot_labels[column_count + sub_plot_count_step * 2],
                       fontsize=14,
                       fontweight='bold',
                       ha='center',
                       va='center',
                       transform=ax_t_vs_F.transAxes)
        ax_t_vs_median_freq.text(-0.1,
                                 1.07,
                                 sub_plot_labels[column_count +
                                                 sub_plot_count_step * 3],
                                 fontsize=14,
                                 fontweight='bold',
                                 ha='center',
                                 va='center',
                                 transform=ax_t_vs_median_freq.transAxes)
        ax_t_vs_number_muts.text(-0.1,
                                 1.07,
                                 sub_plot_labels[column_count +
                                                 sub_plot_count_step * 4],
                                 fontsize=14,
                                 fontweight='bold',
                                 ha='center',
                                 va='center',
                                 transform=ax_t_vs_number_muts.transAxes)

        treatment_taxon_populations = []

        all_medians = []
        all_numbers = []

        for replicate in pt.replicates:

            population = treatment + taxon + replicate
            if population in pt.populations_to_ignore:
                continue

            Mts, Ms = mutation_trajectories[population]
            fixed_Mts, fixed_Ms = fixed_mutation_trajectories[population]
            deltaMts, deltaMs = delta_mutation_trajectories[population]

            median_trajectories_ts, median_trajectories_ = median_trajectories[
                population]
            n_muts_trajectories_ts, n_muts_trajectories_ = n_muts_trajectories[
                population]

            ax_t_vs_M.plot(Mts,
                           10**Ms,
                           'o-',
                           color=pt.get_colors(treatment),
                           marker=pt.plot_species_marker(taxon),
                           fillstyle=pt.plot_species_fillstyle(taxon),
                           alpha=1,
                           markersize=7,
                           linewidth=3,
                           markeredgewidth=1.5,
                           zorder=1)
            ax_t_vs_M.set_yscale('log', base=10)
            ax_t_vs_M.tick_params(axis='x', labelsize=8)

            # back transform to format plot axes
            ax_t_vs_delta_M.plot(deltaMts,
                                 10**deltaMs,
                                 color=pt.get_colors(treatment),
                                 marker=pt.plot_species_marker(taxon),
                                 fillstyle=pt.plot_species_fillstyle(taxon))
            ax_t_vs_delta_M.set_yscale('log', base=10)

            ax_t_vs_F.plot(fixed_Mts,
                           fixed_Ms,
                           'o-',
                           color=pt.get_colors(treatment),
                           marker=pt.plot_species_marker(taxon),
                           fillstyle=pt.plot_species_fillstyle(taxon),
                           alpha=1,
                           markersize=7,
                           linewidth=3,
                           markeredgewidth=1.5,
                           zorder=1)
            #ax_M_vs_F.set_xlabel('Days, ' + r'$t$', fontsize = 12)

            ax_t_vs_median_freq.plot(
                median_trajectories_ts,
                10**median_trajectories_,
                'o-',
                color=pt.get_colors(treatment),
                marker=pt.plot_species_marker(taxon),
                fillstyle=pt.plot_species_fillstyle(taxon),
                alpha=1,
                markersize=7,
                linewidth=3,
                markeredgewidth=1.5,
                zorder=1)
            ax_t_vs_median_freq.set_yscale('log', base=10)
            #ax_t_vs_median_freq.tick_params(axis='y', labelsize=6)

            ax_t_vs_median_freq.yaxis.set_tick_params(labelsize=8)

            all_medians.extend(median_trajectories_.tolist())

            ax_t_vs_number_muts.plot(
                n_muts_trajectories_ts,
                n_muts_trajectories_,
                'o-',
                color=pt.get_colors(treatment),
                marker=pt.plot_species_marker(taxon),
                fillstyle=pt.plot_species_fillstyle(taxon),
                alpha=1,
                markersize=7,
                linewidth=3,
                markeredgewidth=1.5,
                zorder=1)
            ax_t_vs_number_muts.set_yscale('log', base=10)

            ax_t_vs_number_muts.tick_params(axis='y', labelsize=8)

            all_numbers.extend(n_muts_trajectories_.tolist())

            treatment_taxon_populations.append(population)

        print(10**(min(all_medians)) * 0.8, 10**(max(all_medians)) * 1.2)

        ax_t_vs_median_freq.set_ylim(
            [10**(min(all_medians)) * 0.8, 10**(max(all_medians)) * 1.2])

        ax_t_vs_number_muts.set_ylim(
            [min(all_numbers) * 0.8,
             max(all_numbers) * 1.2])

        avg_Mts, avg_Ms = timecourse_utils.average_trajectories([
            mutation_trajectories[population]
            for population in treatment_taxon_populations
        ])

        avg_deltaMts, avg_deltaMs = timecourse_utils.average_trajectories([
            delta_mutation_trajectories[population]
            for population in treatment_taxon_populations
        ])

        ax_t_vs_delta_M.axhline(y=1, c='grey', linestyle=':', lw=3, zorder=1)
        ax_t_vs_M.plot(avg_Mts,
                       10**avg_Ms,
                       '--',
                       color='k',
                       marker=" ",
                       alpha=1,
                       linewidth=4,
                       zorder=2)
        ax_t_vs_delta_M.plot(avg_deltaMts,
                             10**avg_deltaMs,
                             '--',
                             color='k',
                             marker=" ",
                             alpha=1,
                             linewidth=4,
                             zorder=2)

        # keep them on the same y axes
        if taxon == 'C':
            ax_t_vs_delta_M.set_ylim([0.2, 42])
        elif taxon == 'D':
            ax_t_vs_delta_M.set_ylim([0.2, 20])

        if (column_count == 0):
            legend_elements = [
                Line2D([0], [0],
                       ls='--',
                       color='k',
                       lw=1.5,
                       label=r'$\overline{M}(t)$')
            ]
            ax_t_vs_M.legend(handles=legend_elements,
                             loc='lower right',
                             fontsize=8)

        ax_t_vs_M.set_title(str(10**int(treatment)) + '-day transfers',
                            fontsize=17)

        #if treatment == '2':
        #    ax_M_vs_F.yaxis.set_major_locator(MaxNLocator(integer=True))

        if column_count == 0:

            ax_t_vs_M.set_ylabel('Mutations, ' + r'$M(t)$', fontsize=15)
            ax_t_vs_F.set_ylabel('Fixed mutations', fontsize=15)
            ax_t_vs_delta_M.set_ylabel('Change in mutations,\n' +
                                       r'$M(t)/M(t-1)$',
                                       fontsize=15)

            ax_t_vs_median_freq.set_ylabel(
                'Median mutation freq.\nat time $t$', fontsize=15)
            ax_t_vs_number_muts.set_ylabel('Number of mutations\nat time $t$',
                                           fontsize=15)

        column_count += 1

    fig.text(0.53, 0.05, 'Days, ' + r'$t$', ha='center', fontsize=28)
    fig.suptitle(pt.latex_genus_dict[taxon], fontsize=30)
    fig_name = pt.get_path() + '/figs/rate_%s.pdf' % taxon
    fig.savefig(fig_name,
                format='pdf',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
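# The "change in mutations" panels above store log10 ratios of consecutive
# timepoints and back-transform with 10** at plot time; a toy series makes
# the convention explicit:
toy_Ms = np.array([1.0, 2.0, 8.0])
toy_delta_log10 = np.log10(toy_Ms[1:] / toy_Ms[:-1])
# 10**toy_delta_log10 == array([2., 4.]) : fold change per interval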
def plot_within_taxon_paralleliism(taxon, slope_null=1):

    fig = plt.figure(figsize=(12, 8))

    gene_data = parse_file.parse_gene_list(taxon)

    gene_names, gene_start_positions, gene_end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids = gene_data
    # to get the common gene names for each ID

    ax_multiplicity = plt.subplot2grid((2, 3), (0, 0), colspan=1)
    ax_mult_freq = plt.subplot2grid((2, 3), (0, 1), colspan=1)
    ax_venn = plt.subplot2grid((2, 3), (0, 2), colspan=1)

    ax_multiplicity.set_xscale('log', base=10)
    ax_multiplicity.set_yscale('log', base=10)
    ax_multiplicity.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_multiplicity.set_ylabel('Fraction mutations ' + r'$\geq m$',
                               fontsize=14)
    ax_multiplicity.text(-0.1,
                         1.07,
                         pt.sub_plot_labels[0],
                         fontsize=18,
                         fontweight='bold',
                         ha='center',
                         va='center',
                         transform=ax_multiplicity.transAxes)

    ax_multiplicity.set_ylim([0.001, 1.1])
    ax_multiplicity.set_xlim([0.07, 130])

    ax_mult_freq.set_xscale('log', base=10)
    ax_mult_freq.set_yscale('log', base=10)
    ax_mult_freq.set_xlabel('Gene multiplicity, ' + r'$m$', fontsize=14)
    ax_mult_freq.set_ylabel('Mean maximum allele frequency, ' +
                            r'$\overline{f}_{max}$',
                            fontsize=11)
    ax_mult_freq.text(-0.1,
                      1.07,
                      pt.sub_plot_labels[1],
                      fontsize=18,
                      fontweight='bold',
                      ha='center',
                      va='center',
                      transform=ax_mult_freq.transAxes)

    ax_venn.axis('off')
    ax_venn.text(-0.1,
                 1.07,
                 pt.sub_plot_labels[2],
                 fontsize=18,
                 fontweight='bold',
                 ha='center',
                 va='center',
                 transform=ax_venn.transAxes)

    alpha_treatment_dict = {'0': 0.5, '1': 0.5, '2': 0.8}

    significant_multiplicity_dict = {}

    significant_multiplicity_values_dict = {}

    multiplicity_dict = {}

    g_score_p_label_dict = {}

    all_mults = []
    all_freqs = []

    treatments_in_taxon = []

    label_y_axes = [0.3, 0.2, 0.1]

    for treatment_idx, treatment in enumerate(pt.treatments):

        significant_multiplicity_taxon_path = pt.get_path(
        ) + '/data/timecourse_final/parallel_genes_%s.txt' % (treatment +
                                                              taxon)
        if not os.path.exists(significant_multiplicity_taxon_path):
            continue
        treatments_in_taxon.append(treatment)
        significant_multiplicity_taxon = open(
            significant_multiplicity_taxon_path, "r")

        significant_multiplicity_list = []
        for i, line in enumerate(significant_multiplicity_taxon):
            if i == 0:
                continue  # header
            line = line.strip()
            items = line.split(",")
            significant_multiplicity_list.append(items[0])

            if items[0] not in significant_multiplicity_values_dict:
                significant_multiplicity_values_dict[items[0]] = {}
            significant_multiplicity_values_dict[
                items[0]][treatment] = float(items[-2])

        significant_multiplicity_dict[treatment] = significant_multiplicity_list

        populations = [
            treatment + taxon + replicate for replicate in pt.replicates
        ]

        # Load convergence matrix
        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % (treatment + taxon)))
        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, Lmin=100)
        #print(gene_parallelism_statistics)
        G, pvalue = mutation_spectrum_utils.calculate_total_parallelism(
            gene_parallelism_statistics)

        sys.stdout.write("Total parallelism for %s = %g (p=%g)\n" %
                         (treatment + taxon, G, pvalue))

        predictors = []
        responses = []

        gene_hits = []
        gene_predictors = []
        mean_gene_freqs = []

        Ls = []

        ax_mult_freqs_x = []
        ax_mult_freqs_y = []

        for gene_name in convergence_matrix.keys():

            Ls.append(convergence_matrix[gene_name]['length'])
            m = gene_parallelism_statistics[gene_name]['multiplicity']

            if gene_name not in multiplicity_dict:
                multiplicity_dict[gene_name] = {}
                multiplicity_dict[gene_name][treatment] = m
            else:
                multiplicity_dict[gene_name][treatment] = m

            n = 0
            nfixed = 0
            freqs = []
            nf_max = 0

            for population in populations:
                for t, L, f, f_max in convergence_matrix[gene_name][
                        'mutations'][population]:
                    fixed_weight = timecourse_utils.calculate_fixed_weight(
                        L, f)

                    predictors.append(m)
                    responses.append(fixed_weight)

                    n += 1
                    nfixed += fixed_weight

                    # get freqs for regression
                    #if L == parse_file.POLYMORPHIC:
                    #freqs.append(f_max)
                    nf_max += timecourse_utils.calculate_fixed_weight(L, f_max)

            if n > 0.5:
                gene_hits.append(n)
                gene_predictors.append(m)
                #mean_gene_freqs.append(np.mean(freqs))

                if nf_max > 0:
                    ax_mult_freqs_x.append(m)
                    ax_mult_freqs_y.append(nf_max / n)

        Ls = np.asarray(Ls)
        ntot = len(predictors)
        mavg = ntot * 1.0 / len(Ls)

        predictors, responses = (np.array(x) for x in zip(
            *sorted(zip(predictors, responses), key=lambda pair: (pair[0]))))

        gene_hits, gene_predictors = (np.array(x) for x in zip(*sorted(
            zip(gene_hits, gene_predictors), key=lambda pair: (pair[0]))))

        rescaled_predictors = np.exp(np.fabs(np.log(predictors / mavg)))

        null_survival_function = mutation_spectrum_utils.NullMultiplicitySurvivalFunction.from_parallelism_statistics(
            gene_parallelism_statistics)

        # default base is 10
        theory_ms = np.logspace(-2, 2, 100)
        theory_survivals = null_survival_function(theory_ms)
        theory_survivals /= theory_survivals[0]

        sys.stderr.write("Done!\n")

        ax_multiplicity.plot(theory_ms,
                             theory_survivals,
                             lw=3,
                             color=pt.get_colors(treatment),
                             alpha=0.8,
                             ls=':',
                             zorder=1)

        ax_multiplicity.plot(
            predictors, (len(predictors) - np.arange(0, len(predictors))) *
            1.0 / len(predictors),
            lw=3,
            color=pt.get_colors(treatment),
            alpha=0.8,
            ls='--',
            label=str(int(10**int(treatment))) + '-day',
            drawstyle='steps',
            zorder=2)

        #ax_multiplicity.text(0.2, 0.3, g_score_p_label_dict['0'], fontsize=25, fontweight='bold', ha='center', va='center', transform=ax_multiplicity.transAxes)
        #ax_multiplicity.text(0.2, 0.2, g_score_p_label_dict['1'], fontsize=25, fontweight='bold', ha='center', va='center', transform=ax_multiplicity.transAxes)
        #ax_multiplicity.text(0.2, 0.1, g_score_p_label_dict['2'], fontsize=25, fontweight='bold', ha='center', va='center', transform=ax_multiplicity.transAxes)

        if pvalue < 0.001:
            pretty_pvalue = r'$\ll 0.001$'
        else:
            pretty_pvalue = '=' + str(round(pvalue, 4))

        g_score_p_label = r'$\Delta \ell_{{{}}}=$'.format(
            str(10**int(treatment))) + str(round(
                G, 3)) + ', ' + r'$P$' + pretty_pvalue

        text_color = pt.lighten_color(pt.get_colors(treatment), amount=1.3)

        ax_multiplicity.text(0.26,
                             label_y_axes[treatment_idx],
                             g_score_p_label,
                             fontsize=7,
                             ha='center',
                             va='center',
                             color='k',
                             transform=ax_multiplicity.transAxes)

        ax_mult_freq.scatter(ax_mult_freqs_x,
                             ax_mult_freqs_y,
                             color=pt.get_colors(treatment),
                             edgecolors=pt.get_colors(treatment),
                             marker=pt.plot_species_marker(taxon),
                             alpha=alpha_treatment_dict[treatment])

        all_mults.extend(ax_mult_freqs_x)
        all_freqs.extend(ax_mult_freqs_y)

        #slope, intercept, r_value, p_value, std_err = stats.linregress(np.log10(ax_mult_freqs_x), np.log10(ax_mult_freqs_y))
        #print(slope, p_value)

    # make treatment pairs
    treatments_in_taxon.sort(key=float)

    for i in range(0, len(treatments_in_taxon)):

        for j in range(i + 1, len(treatments_in_taxon)):

            ax_mult_i_j = plt.subplot2grid((2, 3), (1, i + j - 1), colspan=1)
            ax_mult_i_j.set_xscale('log', base=10)
            ax_mult_i_j.set_yscale('log', base=10)
            ax_mult_i_j.set_xlabel(str(10**int(treatments_in_taxon[i])) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.set_ylabel(str(10**int(treatments_in_taxon[j])) +
                                   '-day gene multiplicity, ' + r'$m$',
                                   fontsize=14)
            ax_mult_i_j.plot([0.05, 200], [0.05, 200],
                             lw=3,
                             c='grey',
                             ls='--',
                             zorder=1)
            ax_mult_i_j.set_xlim([0.05, 200])
            ax_mult_i_j.set_ylim([0.05, 200])

            ax_mult_i_j.text(-0.1,
                             1.07,
                             pt.sub_plot_labels[2 + i + j],
                             fontsize=18,
                             fontweight='bold',
                             ha='center',
                             va='center',
                             transform=ax_mult_i_j.transAxes)

            multiplicity_pair = [
                (multiplicity_dict[gene_name][treatments_in_taxon[i]],
                 multiplicity_dict[gene_name][treatments_in_taxon[j]])
                for gene_name in sorted(multiplicity_dict)
                if (multiplicity_dict[gene_name][treatments_in_taxon[i]] > 0)
                and (multiplicity_dict[gene_name][treatments_in_taxon[j]] > 0)
            ]
            significant_multiplicity_pair = [
                (significant_multiplicity_values_dict[gene_name][
                    treatments_in_taxon[i]],
                 significant_multiplicity_values_dict[gene_name][
                     treatments_in_taxon[j]])
                for gene_name in sorted(significant_multiplicity_values_dict)
                if (treatments_in_taxon[i] in
                    significant_multiplicity_values_dict[gene_name]) and (
                        treatments_in_taxon[j] in
                        significant_multiplicity_values_dict[gene_name])
            ]

            # get mean colors
            ccv = ColorConverter()

            color_1 = np.array(
                ccv.to_rgb(pt.get_colors(treatments_in_taxon[i])))
            color_2 = np.array(
                ccv.to_rgb(pt.get_colors(treatments_in_taxon[j])))

            mix_color = 0.7 * (color_1 + color_2)
            mix_color = np.min([mix_color, [1.0, 1.0, 1.0]], 0)

            if (treatments_in_taxon[i] == '0') and (treatments_in_taxon[j]
                                                    == '1'):
                #mix_color = pt.lighten_color(mix_color, amount=2.8)
                mix_color = 'gold'

            mult_i = [x[0] for x in multiplicity_pair]
            mult_j = [x[1] for x in multiplicity_pair]

            ax_mult_i_j.scatter(mult_i,
                                mult_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color,
                                edgecolors='none',
                                alpha=0.8,
                                s=90,
                                zorder=2)

            mult_significant_i = [x[0] for x in significant_multiplicity_pair]
            mult_significant_j = [x[1] for x in significant_multiplicity_pair]
            ax_mult_i_j.scatter(mult_significant_i,
                                mult_significant_j,
                                marker=pt.plot_species_marker(taxon),
                                facecolors=mix_color,
                                edgecolors='k',
                                lw=1.5,
                                alpha=0.7,
                                s=90,
                                zorder=3)

            #slope_mult, intercept_mult, r_value_mult, p_value_mult, std_err_mult = stats.linregress(np.log10(mult_significant_i), np.log10(mult_significant_j))

            mult_ij = mult_significant_i + mult_significant_j + mult_i + mult_j

            ax_mult_i_j.set_xlim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])
            ax_mult_i_j.set_ylim([min(mult_ij) * 0.5, max(mult_ij) * 1.5])

            # null slope of 1
            #ratio = (slope_mult - slope_null) / std_err_mult
            #p_value_mult_new_null = stats.t.sf(np.abs(ratio), len(mult_significant_j)-2)*2

            #if p_value_mult_new_null < 0.05:
            #    x_log10_fit_range =  np.linspace(np.log10(min(mult_i) * 0.5), np.log10(max(mult_i) * 1.5), 10000)

            #    y_fit_range = 10 ** (slope_mult*x_log10_fit_range + intercept_mult)
            #    ax_mult_i_j.plot(10**x_log10_fit_range, y_fit_range, c='k', lw=3, linestyle='--', zorder=4)

            #ax_mult_i_j.text(0.05, 0.9, r'$\beta_{1}=$'+str(round(slope_mult,3)), fontsize=12, transform=ax_mult_i_j.transAxes)
            #ax_mult_i_j.text(0.05, 0.82, r'$r^{2}=$'+str(round(r_value_mult**2,3)), fontsize=12, transform=ax_mult_i_j.transAxes)
            #ax_mult_i_j.text(0.05, 0.74, pt.get_p_value_latex(p_value_mult_new_null), fontsize=12, transform=ax_mult_i_j.transAxes)

    #if taxon == 'F':
    #    subset_tuple = (len( significant_multiplicity_dict['0']), \
    #                    len( significant_multiplicity_dict['1']), \
    #                    len(set(significant_multiplicity_dict['0']) & set(significant_multiplicity_dict['1'])))

    #    venn = venn2(subsets = subset_tuple, ax=ax_venn, set_labels=('', '', ''), set_colors=(pt.get_colors('0'), pt.get_colors('1')))
    #    c = venn2_circles(subsets=subset_tuple, ax=ax_venn, linestyle='dashed')

    subset_tuple = (len( significant_multiplicity_dict['0']), \
                    len( significant_multiplicity_dict['1']), \
                    len(set(significant_multiplicity_dict['0']) & set(significant_multiplicity_dict['1'])), \
                    len(significant_multiplicity_dict['2']), \
                    len(set(significant_multiplicity_dict['0']) & set(significant_multiplicity_dict['2'])), \
                    len(set(significant_multiplicity_dict['1']) & set(significant_multiplicity_dict['2'])),  \
                    len(set(significant_multiplicity_dict['0']) & set(significant_multiplicity_dict['1']) & set(significant_multiplicity_dict['2'])))

    venn = venn3(subsets=subset_tuple,
                 ax=ax_venn,
                 set_labels=('', '', ''),
                 set_colors=(pt.get_colors('0'), pt.get_colors('1'),
                             pt.get_colors('2')))
    c = venn3_circles(subsets=subset_tuple, ax=ax_venn, linestyle='dashed')

    ax_mult_freq.set_xlim([min(all_mults) * 0.5, max(all_mults) * 1.5])
    ax_mult_freq.set_ylim([min(all_freqs) * 0.5, max(all_freqs) * 1.5])

    fig.suptitle(pt.latex_dict[taxon], fontsize=30)

    fig.subplots_adjust(wspace=0.3)  #hspace=0.3, wspace=0.5
    fig_name = pt.get_path() + "/figs/multiplicity_%s.jpg" % taxon
    fig.savefig(fig_name,
                format='jpg',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
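# The empirical survival curve drawn in ax_multiplicity above, in isolation:
# for multiplicities sorted ascending, the fraction of mutations with
# multiplicity >= m_i steps down by 1/n at each point (toy values):
toy_predictors = np.sort(np.array([0.5, 1.0, 1.0, 2.5]))
toy_survival = (len(toy_predictors) -
                np.arange(0, len(toy_predictors))) * 1.0 / len(toy_predictors)
# toy_survival == array([1.  , 0.75, 0.5 , 0.25])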
Example no. 12
        if treatment + taxon in pt.treatment_taxa_to_ignore:
            sys.stderr.write(
                "Skipping %s, too few surviving replicates ...\n" %
                (treatment + taxon))
            continue

        populations = [
            treatment + taxon + replicate for replicate in pt.replicates
        ]

        sys.stderr.write("Analyzing %s level parallelism for %s...\n" %
                         (level, treatment + taxon))

        # Load convergence matrix
        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % (treatment + taxon)))

        # Calculate basic parallellism statistics
        gene_parallelism_statistics_minor = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, fmax_max=0.5)
        gene_parallelism_statistics_major = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, fmax_min=0.5)

        # Do same thing for multiplicity statistic
        pooled_multiplicities_minor = numpy.array([
            gene_parallelism_statistics_minor[gene_name]['multiplicity']
            for gene_name in gene_parallelism_statistics_minor.keys()
        ])
        pooled_multiplicities_minor.sort()
        pooled_multiplicities_major = numpy.array([
            gene_parallelism_statistics_major[gene_name]['multiplicity']
            for gene_name in gene_parallelism_statistics_major.keys()
        ])
        pooled_multiplicities_major.sort()
Example no. 13
def calculate_divergence_correlations():

    sys.stdout.write("Starting divergence tests...\n")

    divergence_dict = {}

    for treatment_pair_idx, treatment_pair in enumerate(treatment_pairs):

        treatment_pair_set = (treatment_pair[0], treatment_pair[1])

        divergence_dict[treatment_pair_set] = {}


        if '1' in treatment_pair:
            taxa = ['B','C','D','F','P']
        else:
            taxa = pt.taxa


        for taxon in taxa:

            #result = [(x[treatment_pair[0]],x[treatment_pair[1]]) for x in significant_multiplicity_dict[taxon].values() if (treatment_pair[0] in x) and (treatment_pair[1] in x)]
            #result = [(x[treatment_pair[0]],x[treatment_pair[1]], x) for x in significant_n_mut_dict[taxon].values() if (treatment_pair[0] in x) and (treatment_pair[1] in x)]
            result = [(dicts[treatment_pair[0]],dicts[treatment_pair[1]], keys) for keys, dicts in significant_n_mut_dict[taxon].items() if (treatment_pair[0] in dicts) and (treatment_pair[1] in dicts)]

            n_x = [int(x[0]) for x in result]
            n_y = [int(x[1]) for x in result]
            gene_names = [x[2] for x in result]

            gene_sizes_taxon_treatment_pair = [gene_size_dict[taxon][gene_i] for gene_i in gene_names]
            gene_sizes_taxon_treatment_pair = np.asarray(gene_sizes_taxon_treatment_pair)
            taxon_Lmean = gene_mean_size_dict[taxon]

            n_matrix = np.asarray([n_x, n_y])
            mult_matrix = n_matrix * (taxon_Lmean / gene_sizes_taxon_treatment_pair)
            rel_mult_matrix = mult_matrix/mult_matrix.sum(axis=1)[:,None]
            pearsons_corr = np.corrcoef(rel_mult_matrix[0,:], rel_mult_matrix[1,:])[1,0]
            pearsons_corr_squared = pearsons_corr**2

            pearsons_corr_squared_null = []
            for k in range(permutations_divergence):

                if (k % 2000 == 0) and (k>0):

                    sys.stdout.write("%d iterations\n" % (k))

                n_matrix_random = phik.simulation.sim_2d_data_patefield(n_matrix)
                mult_matrix_random = n_matrix_random * (taxon_Lmean / gene_sizes_taxon_treatment_pair)
                rel_mult_matrix_random = mult_matrix_random/mult_matrix_random.sum(axis=1)[:,None]
                pearsons_corr_random = np.corrcoef(rel_mult_matrix_random[0,:], rel_mult_matrix_random[1,:])[1,0]
                pearsons_corr_squared_random = pearsons_corr_random**2

                pearsons_corr_squared_null.append(pearsons_corr_squared_random)

            pearsons_corr_squared_null = np.asarray(pearsons_corr_squared_null)

            Z_corr = (pearsons_corr_squared - np.mean(pearsons_corr_squared_null)) / np.std(pearsons_corr_squared_null)

            # fraction of null replicates below the observed value, with the
            # standard +1 permutation-test correction
            P_corr = (len(pearsons_corr_squared_null[pearsons_corr_squared_null<pearsons_corr_squared])+1) / (permutations_divergence+1)

            divergence_dict[treatment_pair_set][taxon] = {}
            divergence_dict[treatment_pair_set][taxon]['pearsons_corr_squared'] = pearsons_corr_squared
            divergence_dict[treatment_pair_set][taxon]['P_value'] = P_corr
            divergence_dict[treatment_pair_set][taxon]['Z_corr'] = Z_corr

            sys.stdout.write("%d vs %d-day, %s: rho^2=%f, P=%f, Z=%f\n" % (10**int(treatment_pair[0]), 10**int(treatment_pair[1]), taxon, pearsons_corr_squared, P_corr, Z_corr))


    sys.stdout.write("Dumping pickle......\n")
    with open(pt.get_path()+'/data/divergence_pearsons.pickle', 'wb') as handle:
        pickle.dump(divergence_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    sys.stdout.write("Done!\n")
Example no. 14


    if taxon == 'J':
        treatments_convergence = ['0', '1']

    else:
        treatments_convergence = ['0', '1', '2']


    for treatment in treatments_convergence:

        genes_significant_file_path = pt.get_path() +'/data/timecourse_final/' +  ("parallel_%ss_%s.txt" % ('gene', treatment+taxon))
        genes_nonsignificant_file_path = pt.get_path() +'/data/timecourse_final/' +  ("parallel_not_significant_%ss_%s.txt" % ('gene', treatment+taxon))

        if not os.path.exists(genes_significant_file_path):
            continue

        genes_significant_file = open(genes_significant_file_path, 'r')
        first_line_significant = genes_significant_file.readline()  # header

        N_significant_genes = 0

        genes = []

        for line in genes_significant_file:
            line_split = line.strip().split(', ')
            gene_name = line_split[0]
Example no. 15
fig.text(0.5,
         -0.01,
         'Maximum observed allele frequency, ' + r'$f_{max}$',
         ha='center',
         va='center',
         fontsize=18)
fig.text(-0.01,
         0.5,
         r'$pN/pS$' + ' for mutations ' + r'$\geq f_{max}$',
         ha='center',
         va='center',
         rotation='vertical',
         fontsize=18)

fig.subplots_adjust(hspace=0.4, wspace=0.6)  #hspace=0.3, wspace=0.5
fig.tight_layout()
fig.savefig(pt.get_path() + '/figs/dn_ds_fmax.pdf',
            format='pdf',
            bbox_inches="tight",
            pad_inches=0.4,
            dpi=600)
plt.close()

record_strs = [
    ",".join(
        ['treatment_pair', 'taxon', 'tree_name', 'mean_absolute_difference'])
]

msd_dict = {}
for taxon in taxa:

    if taxon == 'J':
        treatments = ['0', '2']
    else:
        treatments = pt.treatments
# loop through taxa and get M(700) for all reps in each treatment
fmax_dict = {}
for treatment in pt.treatments:

    fmax_dict[treatment] = {}

for taxon in taxa:

    if taxon == 'J':
        treatments = ['0', '2']
    else:
        treatments = pt.treatments

    for treatment in treatments:

        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % (treatment + taxon)))

        f_max_all = []

        #for population in populations:
        for replicate in pt.replicates:
            population = treatment + taxon + replicate

            if population in pt.populations_to_ignore:
                continue

            for gene_name in sorted(convergence_matrix.keys()):

                for t, L, f, f_max in convergence_matrix[gene_name][
                        'mutations'][population]:
                    f_max_all.append(f_max)

gene_dict = {}

gene_data = parse_file.parse_gene_list('B')
gene_names, gene_start_positions, gene_end_positions, promoter_start_positions, promoter_end_positions, gene_sequences, strands, genes, features, protein_ids = gene_data

locus_tag_to_gene_dict = {}
for gene_name_idx, gene_name in enumerate(gene_names):
    gene = genes[gene_name_idx]
    if gene == '':
        continue
    locus_tag_to_gene_dict[gene_name] = genes[gene_name_idx]

for taxon in taxa:
    for treatment in treatments:

        genes_significant_file_path = pt.get_path(
        ) + '/data/timecourse_final/' + ("parallel_%ss_%s.txt" %
                                         ('gene', treatment + taxon))
        output_notsignificant_file_path = pt.get_path(
        ) + '/data/timecourse_final/' + (
            "parallel_not_significant_%ss_%s.txt" %
            ('gene', treatment + taxon))

        if not os.path.exists(genes_significant_file_path):
            continue

        genes_significant_file = open(genes_significant_file_path, 'r')
        genes_notsignificant_file = open(output_notsignificant_file_path, 'r')
        first_line_significant = genes_significant_file.readline()  # header
        first_line_notsignificant = genes_notsignificant_file.readline()  # header

        for line in genes_significant_file:
def run_analyses():
    r2s_obs_dict = {}
    #r2s_null_dict = {}
    for treatment in ['0', '1', '2']:
        r2s_obs_dict[treatment] = {}
        for taxon in taxa:
            r2s_all = []
            ratio_f_all = []
            abs_delta_f_all = []
            for replicate in replicates:

                population = treatment + taxon + replicate
                sys.stderr.write("Processing %s...\n" % population)

                mutations, depth_tuple = parse_file.parse_annotated_timecourse(
                    population)
                population_avg_depth_times, population_avg_depths, clone_avg_depth_times, clone_avg_depths = depth_tuple
                state_times, state_trajectories = parse_file.parse_well_mixed_state_timecourse(
                    population)

                times = mutations[0][12]
                Ms = np.zeros_like(times) * 1.0
                fixed_Ms = np.zeros_like(times) * 1.0

                for mutation_idx_i in range(0, len(mutations)):

                    location_i, gene_name_i, allele_i, var_type_i, codon_i, position_in_codon_i, AAs_count_i, test_statistic_i, pvalue_i, cutoff_idx_i, depth_fold_change_i, depth_change_pvalue_i, times_i, alts_i, depths_i, clone_times_i, clone_alts_i, clone_depths_i = mutations[
                        mutation_idx_i]

                    state_Ls_i = state_trajectories[mutation_idx_i]
                    good_idx_i, filtered_alts_i, filtered_depths_i = timecourse_utils.mask_timepoints(
                        times_i, alts_i, depths_i, var_type_i, cutoff_idx_i,
                        depth_fold_change_i, depth_change_pvalue_i)
                    freqs_i = timecourse_utils.estimate_frequencies(
                        filtered_alts_i, filtered_depths_i)

                    masked_times_i = times[good_idx_i]
                    masked_freqs_i = freqs_i[good_idx_i]
                    masked_state_Ls_i = state_Ls_i[good_idx_i]

                    P_idx_i = np.where(masked_state_Ls_i == 3)[0]
                    if len(P_idx_i) < min_trajectory_length:
                        continue
                    first_P_i = P_idx_i[0]
                    last_P_i = P_idx_i[-1]

                    masked_freqs_P_i = masked_freqs_i[first_P_i:last_P_i + 1]
                    masked_times_P_i = masked_times_i[first_P_i:last_P_i + 1]

                    delta_masked_freqs_P_i = masked_freqs_P_i[
                        1:] - masked_freqs_P_i[:-1]
                    delta_masked_times_P_i = masked_times_P_i[:-1]

                    #abs_delta_f = np.absolute(freqs_i[1:] - freqs_i[:-1])
                    #freqs_i_no_zero = freqs_i[freqs_i>0]
                    # we want to get the ratio of freqs

                    for freqs_i_k, freqs_i_l in zip(freqs_i[1:], freqs_i[:-1]):
                        if (freqs_i_k == 0) or (freqs_i_l == 0):
                            continue
                        abs_delta_f_all.append(
                            np.absolute(freqs_i_k - freqs_i_l))
                        ratio_f_all.append(freqs_i_k / freqs_i_l)

                    #ratio_f = freqs_i_no_zero[]

                    for mutation_idx_j in range(mutation_idx_i + 1,
                                                len(mutations)):

                        location_j, gene_name_j, allele_j, var_type_j, codon_j, position_in_codon_j, AAs_count_j, test_statistic_j, pvalue_j, cutoff_jdx_j, depth_fold_change_j, depth_change_pvalue_j, times_j, alts_j, depths_j, clone_times_j, clone_alts_j, clone_depths_j = mutations[
                            mutation_idx_j]

                        state_Ls_j = state_trajectories[mutation_idx_j]
                        good_idx_j, filtered_alts_j, filtered_depths_j = timecourse_utils.mask_timepoints(
                            times_j, alts_j, depths_j, var_type_j,
                            cutoff_jdx_j, depth_fold_change_j,
                            depth_change_pvalue_j)
                        freqs_j = timecourse_utils.estimate_frequencies(
                            filtered_alts_j, filtered_depths_j)

                        masked_times_j = times[good_idx_j]
                        masked_freqs_j = freqs_j[good_idx_j]
                        masked_state_Ls_j = state_Ls_j[good_idx_j]

                        P_jdx_j = np.where(masked_state_Ls_j == 3)[0]
                        if len(P_jdx_j) < min_trajectory_length:
                            continue
                        first_P_j = P_jdx_j[0]
                        last_P_j = P_jdx_j[-1]

                        masked_freqs_P_j = masked_freqs_j[first_P_j:last_P_j +
                                                          1]
                        masked_times_P_j = masked_times_j[first_P_j:last_P_j +
                                                          1]

                        delta_masked_freqs_P_j = masked_freqs_P_j[
                            1:] - masked_freqs_P_j[:-1]
                        # delta_f = f_t_plus_1 - f_t
                        delta_masked_times_P_j = masked_times_P_j[:-1]

                        intersect_times = np.intersect1d(
                            delta_masked_times_P_i, delta_masked_times_P_j)

                        if len(intersect_times) >= 3:

                            intersect_idx_i = [
                                np.where(delta_masked_times_P_i ==
                                         intersect_time)[0][0]
                                for intersect_time in intersect_times
                            ]
                            intersect_delta_i = delta_masked_freqs_P_i[
                                intersect_idx_i]

                            intersect_idx_j = [
                                np.where(delta_masked_times_P_j ==
                                         intersect_time)[0][0]
                                for intersect_time in intersect_times
                            ]
                            intersect_delta_j = delta_masked_freqs_P_j[
                                intersect_idx_j]

                            if len(intersect_delta_i) != len(
                                    intersect_delta_j):
                                print(len(intersect_delta_i),
                                      len(intersect_delta_j))

                            r2 = stats.pearsonr(intersect_delta_i,
                                                intersect_delta_j)[0]**2
                            r2s_all.append(r2)

            r2s_all = np.asarray(r2s_all)
            ratio_f_all = np.asarray(ratio_f_all)
            abs_delta_f_all = np.asarray(abs_delta_f_all)

            r2s_obs_dict[treatment][taxon] = {}
            r2s_obs_dict[treatment][taxon]['r2'] = r2s_all
            r2s_obs_dict[treatment][taxon]['ratio_f'] = ratio_f_all
            r2s_obs_dict[treatment][taxon]['abs_delta_f'] = abs_delta_f_all

    with open(pt.get_path() + '/data/mutation_dynamics.pickle',
              'wb') as handle:
        pickle.dump(r2s_obs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


#run_analyses()
with open(pt.get_path() + '/data/mutation_dynamics.pickle', 'rb') as handle:
    r2s_obs_dict = pickle.load(handle)

analyses = ['abs_delta_f', 'ratio_f', 'r2']
# get KS distance
ks_dict = {}
p_value_list = []
for analysis in analyses:
    ks_dict[analysis] = {}
    for treatment_idx, treatment in enumerate(pt.treatments):

        ks_dict[analysis][treatment] = {}
        D, p_value = stats.ks_2samp(r2s_obs_dict[treatment]['B'][analysis],
                                    r2s_obs_dict[treatment]['S'][analysis])
        ks_dict[analysis][treatment]['D'] = D
        ks_dict[analysis][treatment]['p_value'] = p_value
        p_value_list.append(p_value)
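
# A minimal follow-up sketch (not in the original): correct the KS p-values
# collected above for multiple testing, using the same Benjamini-Hochberg
# routine this document applies elsewhere.
from statsmodels.stats import multitest

reject, pvals_corrected, alphac_sidak, alphac_bonf = multitest.multipletests(
    p_value_list, alpha=0.05, method='fdr_bh')
print(pvals_corrected)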
Example n. 20
        replist, repnum = scipy.stats.find_repeats(X[i])
        for t in repnum:
            ties += t * (t * t - 1)

    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = scipy.stats.chi2.sf(Q, ddof1)

    # Create output dataframe
    stats = pd.DataFrame({'Source': within,
                          'ddof1': ddof1,
                          'Q': np.round(Q, 3),
                          'p-unc': p_unc,
                          }, index=['Friedman'])

    col_order = ['Source', 'ddof1', 'Q', 'p-unc']

    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)

    return stats
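
# Sanity-check sketch (not in the original): scipy.stats.friedmanchisquare
# applies the same tie correction as the code above, so its statistic should
# match the tie-corrected Q returned above. Wide format is assumed here:
# rows = subjects, columns = repeated measurements.
import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(10, 3))  # 10 subjects, 3 conditions
Q_demo, p_demo = scipy.stats.friedmanchisquare(X_demo[:, 0], X_demo[:, 1],
                                               X_demo[:, 2])
print(Q_demo, p_demo)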



data = pd.read_csv(pt.get_path() +'/data/rm_anova.csv', sep=',' )

print(data)
Example n. 21
ntot_subsample = 50
subsamples = 10000
# ntot_subsample minimum number of mutations

G_subsample_dict = {}

G_all_mutations_dict = {}

for taxon in ['B', 'S']:

    for treatment in treatments:

        # Load convergence matrix
        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % (treatment + taxon)))

        populations = [
            treatment + taxon + replicate for replicate in replicates
        ]

        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, Lmin=100)

        G_subsample_list = []
        for i in range(subsamples):

            G_subsample = mutation_spectrum_utils.calculate_subsampled_total_parallelism(
                gene_parallelism_statistics, ntot_subsample=ntot_subsample)
            G_subsample_list.append(G_subsample)
def run_simulation():
    import copy  # used when sampling timepoints below

    #4,292,969
    # mutation rate from Lynch paper, assume 10% of sites are beneficial
    #mu = (3.28*10**-10 ) * 0.1
    #L =  4292969
    # keep the order of magnitude for convenience
    mu = (1.0 * 10**-10)
    L = 1000000

    N = 10**6
    M = 10
    K = N / M
    c = 0.00001
    s_scale = 10**-3

    # average time in a dormant state = M

    n_active_to_dormant = int(c * N)
    n_dormant_to_active = int(c * K * M)
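    # since K = N/M, c*K*M == c*N, so the two counts can differ only through
    # the int() truncation above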

    if n_active_to_dormant != n_dormant_to_active:
        print("Unequal number of individuals switching states!!")

    # rate of entering dormancy, per-capita = c
    # rate of exiting dormancy, per-capita = c*K
    #d = (c* K) / N
    #r = c / M

    # double mutants slow the simulation so we're assuming single mutants
    # e.g., at the largest lineage size (10**6) we generate L*mu*N (~100) new mutants per generation
    # and the probability that a given individual gets two mutations is ~(mu*L)**2/2 ~= 5*10**-9
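    # quick check of that assumption (a sketch, not in the original):
    from scipy import stats as sp_stats
    p_double_mutant = sp_stats.poisson.sf(1, mu * L)  # P(k >= 2) under Poisson
    print('P(double mutant per individual per generation) =', p_double_mutant)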

    generations_to_sample = [330 * i for i in range(1, 11)]

    sampled_timepoints = {}

    generations = 3300

    n_clone_lineages = 0

    clone_size_dict = {}
    clone_size_dict[n_clone_lineages] = {}
    clone_size_dict[n_clone_lineages]['n_clone_active'] = N
    clone_size_dict[n_clone_lineages]['n_clone_dormant'] = M
    clone_size_dict[n_clone_lineages]['s'] = 1
    clone_size_dict[n_clone_lineages]['mutations'] = set([])

    # pre-assign fitness benefits to all sites
    all_sites = set(range(L))
    fitness_effects = numpy.random.exponential(scale=s_scale, size=L)

    # dict of what clones have a given mutation
    for generation in range(generations):
        # generate dormancy transition rates for all lineages
        # get keys and make sure they're in the same order
        #clones_active = [ clone_i for clone_i in clone_size_dict.keys() if ('n_clone_active' in clone_size_dict[clone_i]) and (clone_size_dict[clone_i]['n_clone_active'] > 0) ]
        #clones_active.sort()
        #clones_dormant = [ clone_i for clone_i in clone_size_dict.keys() if ('n_clone_dormant' in clone_size_dict[clone_i]) and (clone_size_dict[clone_i]['n_clone_dormant'] > 0)  ]
        #clones_dormant.sort()

        # get array of clone labels, the number of times each label is in the array is the size of the lineage
        clone_labels_active = [[int(clone_i)] *
                               clone_size_dict[clone_i]['n_clone_active']
                               for clone_i in clone_size_dict.keys()]
        clone_labels_dormant = [
            [int(clone_i)] * clone_size_dict[clone_i]['n_clone_dormant']
            for clone_i in clone_size_dict.keys()
            if ('n_clone_dormant' in clone_size_dict[clone_i]) and (
                clone_size_dict[clone_i]['n_clone_dormant'] > 0)
        ]

        clone_labels_active = numpy.concatenate(clone_labels_active).ravel()
        clone_labels_dormant = numpy.concatenate(clone_labels_dormant).ravel()

        clone_labels_active = clone_labels_active.astype(int)
        clone_labels_dormant = clone_labels_dormant.astype(int)

        # sanity check: the total number of dormant individuals should stay constant
        print(generation, len(clone_labels_active), len(clone_labels_dormant))
        active_to_dormant_sample = numpy.random.choice(
            clone_labels_active, size=n_active_to_dormant, replace=False)
        active_to_dormant_sample_bincount = numpy.bincount(
            active_to_dormant_sample)
        active_to_dormant_sample_bincount_nonzero = numpy.nonzero(
            active_to_dormant_sample_bincount)[0]

        dormant_to_active_sample = numpy.random.choice(
            clone_labels_dormant, size=n_dormant_to_active, replace=False)
        dormant_to_active_sample_bincount = numpy.bincount(
            dormant_to_active_sample)
        dormant_to_active_sample_bincount_nonzero = numpy.nonzero(
            dormant_to_active_sample_bincount)[0]

        for active_to_dormant_clone_i, active_to_dormant_n_clone_i in zip(
                active_to_dormant_sample_bincount_nonzero,
                active_to_dormant_sample_bincount[
                    active_to_dormant_sample_bincount_nonzero]):

            clone_size_dict[active_to_dormant_clone_i][
                'n_clone_active'] -= active_to_dormant_n_clone_i

            if 'n_clone_dormant' not in clone_size_dict[
                    active_to_dormant_clone_i]:
                clone_size_dict[active_to_dormant_clone_i][
                    'n_clone_dormant'] = 0

            clone_size_dict[active_to_dormant_clone_i][
                'n_clone_dormant'] += active_to_dormant_n_clone_i

        for dormant_to_active_clone_i, dormant_to_active_n_clone_i in zip(
                dormant_to_active_sample_bincount_nonzero,
                dormant_to_active_sample_bincount[
                    dormant_to_active_sample_bincount_nonzero]):

            clone_size_dict[dormant_to_active_clone_i][
                'n_clone_dormant'] -= dormant_to_active_n_clone_i

            if 'n_clone_active' not in clone_size_dict[
                    dormant_to_active_clone_i]:
                clone_size_dict[dormant_to_active_clone_i][
                    'n_clone_active'] = 0

            clone_size_dict[dormant_to_active_clone_i][
                'n_clone_active'] += dormant_to_active_n_clone_i

        # now move on to evolution
        for clone_i in list(clone_size_dict):

            if (clone_size_dict[clone_i]['n_clone_dormant'] == 0):

                if (clone_size_dict[clone_i]['n_clone_active'] == 0):
                    del clone_size_dict[clone_i]
                    continue

                else:
                    continue

            #print(clone_size_dict.keys())

            n_clone_i = clone_size_dict[clone_i]['n_clone_active']

            # mutation step#
            # lineage size can't be negative
            n_mutations_clone = min(numpy.random.poisson(mu * L * n_clone_i),
                                    n_clone_i)
            if n_mutations_clone == 0:
                continue
            # remove these individuals from the clone
            clone_size_dict[clone_i]['n_clone_active'] -= n_mutations_clone
            # all individuals in the clone have the same mutations
            # so just sample from nonmutated sites in the ancestral clone
            non_mutated_sites = all_sites - clone_size_dict[clone_i][
                'mutations']

            # sample without replacement
            #mutated_sites = random.sample(non_mutated_sites, n_mutations_clone)
            mutated_sites = numpy.random.choice(list(non_mutated_sites),
                                                size=n_mutations_clone,
                                                replace=False)
            #print(mutated_sites)
            #unique, counts = numpy.unique(mutated_sites, return_counts=True)
            for mutated_site in mutated_sites:

                n_clone_lineages += 1

                clone_size_dict[n_clone_lineages] = {}
                clone_size_dict[n_clone_lineages]['n_clone_active'] = 1
                clone_size_dict[n_clone_lineages]['n_clone_dormant'] = 0
                clone_size_dict[n_clone_lineages]['s'] = clone_size_dict[
                    clone_i]['s'] + fitness_effects[mutated_site]
                clone_size_dict[n_clone_lineages][
                    'mutations'] = clone_size_dict[clone_i]['mutations'].copy(
                    )
                clone_size_dict[n_clone_lineages]['mutations'].add(
                    mutated_site)

            #if (clone_size_dict[clone_i]['n_clone_active'] == 0) and (clone_size_dict[clone_i]['n_clone_dormant'] == 0):
            #    del clone_size_dict[clone_i]

        #sampling_numerator = numpy.asarray( [ clone_size_dict[clone_i]['n_clone']*numpy.exp(clone_size_dict[clone_i]['s']) for clone_i in sorted(clone_size_dict.keys())] )
        sampling_numerator = numpy.asarray([
            clone_size_dict[clone_i]['n_clone_active'] *
            numpy.exp(clone_size_dict[clone_i]['s'])
            for clone_i in clone_size_dict.keys()
        ])
        sampling_probability = sampling_numerator / sum(sampling_numerator)
        clone_sizes_after_selection = numpy.random.multinomial(
            N, sampling_probability)

        for clone_i_idx, clone_i in enumerate(list(clone_size_dict)):
            clone_i_size = clone_sizes_after_selection[clone_i_idx]

            #if clone_i_size == 0:
            #    del clone_size_dict[clone_i]
            #else:
            clone_size_dict[clone_i]['n_clone_active'] = clone_i_size

        if generation % 100 == 0:

            sys.stderr.write("%d generations...\n" % generation)

        if generation in generations_to_sample:
            # deep copy: a shallow dict.copy() would alias the per-clone dicts,
            # which keep changing in later generations
            sampled_timepoints[generation] = copy.deepcopy(clone_size_dict)

        N = sum([
            clone_size_dict[x]['n_clone_active']
            for x in clone_size_dict.keys()
        ])
        M = sum([
            clone_size_dict[x]['n_clone_dormant']
            for x in clone_size_dict.keys()
        ])

        print(generation, N, M)

    saved_data_file = '%s/data/simulations/test2.dat' % (pt.get_path())

    with open(saved_data_file, 'wb') as outfile:
        pickle.dump(sampled_timepoints,
                    outfile,
                    protocol=pickle.HIGHEST_PROTOCOL)
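
# Sketch (not in the original): reload the sampled timepoints written above;
# the path mirrors the one used at the end of run_simulation().
def load_sampled_timepoints():
    saved_data_file = '%s/data/simulations/test2.dat' % (pt.get_path())
    with open(saved_data_file, 'rb') as infile:
        return pickle.load(infile)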
from collections import Counter
from itertools import combinations

import os
import json

import scipy.stats as stats
import pandas as pd

import parse_file
import timecourse_utils
import mutation_spectrum_utils
import phylo_tools as pt

json_path = pt.get_path() + '/data/rebreseq_json/'

coverages_all = []
for filename in os.listdir(json_path):

    if filename.endswith(".json"):

        filepath = '%s%s' % (json_path, filename)

        with open(filepath) as f:
            data = json.load(f)

        #print(data.keys())

        #print()
Example n. 24
           markersize=10,
           color='w',
           markerfacecolor=pt.colors_dict['1']),
    Line2D([0], [0],
           marker='o',
           markersize=10,
           color='w',
           markerfacecolor=pt.colors_dict['2'])
]

axes[0].legend(custom_lines, ['1-day', '10-days', '100-days'],
               loc='upper right')

fig.subplots_adjust(hspace=0.4, wspace=0.6)  #hspace=0.3, wspace=0.5
fig.tight_layout()
fig.savefig(pt.get_path() + '/figs/mutation_spectra_pca.pdf',
            format='pdf',
            bbox_inches="tight",
            pad_inches=0.4,
            dpi=600)
plt.close()

from statsmodels.stats import multitest

reject, pvals_corrected, alphacSidak, alphacBonf = multitest.multipletests(
    anova_pvalues, alpha=0.05, method='fdr_bh')

fig = plt.figure(figsize=(9, 6))
gs = gridspec.GridSpec(nrows=2, ncols=3)
all_subplot_counts = 0
dn_ds_count = 0
for taxon_list_idx, taxon_list in enumerate([['B', 'C', 'D'], ['F', 'J',
                                                               'P']]):
Example n. 25
fmax_cutoffs = np.asarray([0, 0.2, 0.4, 0.6, 0.8])
G_dict_all = {}
taxa = ['B', 'C', 'D', 'F', 'J', 'P']
treatments = ['0', '1']
ntotal_dict = {}
for taxon in taxa:

    sys.stdout.write("Sub-sampling taxon: %s\n" % (taxon))

    G_dict_all[taxon] = {}
    if taxon == 'J':
        ntotal = 50
    else:
        # calculate ntot for all frequency cutoffs
        convergence_matrix = parse_file.parse_convergence_matrix(
            pt.get_path() + '/data/timecourse_final/' +
            ("%s_convergence_matrix.txt" % ('1' + taxon)))
        populations = ['1' + taxon + replicate for replicate in pt.replicates]
        gene_parallelism_statistics = mutation_spectrum_utils.calculate_parallelism_statistics(
            convergence_matrix, populations, fmax_min=max(fmax_cutoffs))
        ntotal = 0
        for gene_i, gene_parallelism_statistics_i in gene_parallelism_statistics.items():
            ntotal += gene_parallelism_statistics_i['observed']
    ntotal_dict[taxon] = ntotal
    for treatment in treatments:
        if treatment + taxon in pt.treatment_taxa_to_ignore:
            continue

        G_dict_all[taxon][treatment] = {}
Example n. 26
for treatment in treatments:
    pvalues = []
    for replicate in replicates:
        population = treatment + taxon + replicate

        if population in pt.populations_to_ignore:
            continue

        if population in pt.samples_to_remove:
            times_to_ignore = pt.samples_to_remove[population]
        else:
            times_to_ignore = None
        #file = open(input_filename_template % population,"r")
        likelihood_filename = '%s_likelihood_timecourse.bz' % (population)
        likelihood_timecourse_path = pt.get_path(
        ) + '/data/timecourse_likelihood/' + likelihood_filename
        file = bz2.open(likelihood_timecourse_path, "rt")
        file.readline()  # depth line!
        for line in file:

            items = line.split(",")
            location = int(items[1])

            total_times = np.array(
                [float(subitem) for subitem in items[3].split()])
            total_alts = np.array(
                [float(subitem) for subitem in items[4].split()])
            total_depths = np.array(
                [float(subitem) for subitem in items[5].split()])

            if 'None' in items[pvalue_idx].split()[0]:
                all_poly_list.append((position, allele))

                num_processed_mutations += 1

                t = timecourse_utils.calculate_appearance_time(
                    masked_times, masked_freqs, masked_state_Ls)

                convergence_matrix[identifier]['mutations'][population].append(
                    (t, masked_state_Ls[-1], masked_freqs[-1],
                     max(masked_freqs)))

            sys.stderr.write("processed %d mutations!\n" %
                             num_processed_mutations)

        # Print it out
        output_filename = pt.get_path() + '/data/timecourse_final/' + (
            "%s_convergence_matrix.txt" % (treatment + taxon))

        convergence_matrix_file = open(output_filename, "w")

        # Header
        convergence_matrix_file.write(
            ", ".join(["Identifier"] + ["Size"] +
                      [population for population in populations]))

        for identifier in sorted(convergence_matrix.keys()):

            length = convergence_matrix[identifier]['length']
            mutations = convergence_matrix[identifier]['mutations']

            convergence_matrix_file.write("\n")
Example n. 28
def calculate_genome_length(taxon=None):
    reference_sequence = pt.classFASTA(pt.get_path() + '/' + pt.get_ref_fna_dict()[taxon]).readFASTA()
    return sum([len(contig[1]) for contig in reference_sequence])
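
# Usage sketch (not in the original; 'B' is one of the taxon keys used
# throughout this document):
print(calculate_genome_length(taxon='B'))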
Example n. 29
ax.set_xticklabels(['1-day', '10-days', '100-days'],
                   fontweight='bold',
                   fontsize=12)

legend_elements = [
    Line2D([0], [0],
           color='none',
           marker='o',
           label=pt.latex_dict['B'],
           markerfacecolor='k',
           markersize=13),
    Line2D([0], [0],
           marker='o',
           color='none',
           label=pt.latex_dict['S'],
           markerfacecolor='w',
           markersize=13,
           markeredgewidth=2)
]
# Add the legend
ax.legend(handles=legend_elements, loc='upper right')

fig.subplots_adjust(hspace=0.3, wspace=0.5)
fig_name = pt.get_path() + '/figs/plot_dn_ds.jpg'
fig.savefig(fig_name,
            format='jpg',
            bbox_inches="tight",
            pad_inches=0.4,
            dpi=600)
plt.close()
Example n. 30
def merge_metadata():
    # first get dictionary for barcodes one and two for GSF2124, GSF2056
    GSF_files = [
        'GSF2056-run1-plates1-2-demultiplexing-summary',
        'GSF2056-run2-plates3-4-demultiplexing-summary',
        'SampleSheet-GSF2124-run3-plates1-2',
        'GSF2124 Lennon Run 3 Plates 3-4 Run Summary Sorted',
        'GSF2124-run5-plates5-6-demultiplexing-summay'
    ]
    ignore_lines = [
        'Undetermined', 'Sample', 'Lane', 'Sample_ID', ' Chemistry',
        'Description', 'Assay', 'Application', 'Workflow', 'Date',
        'Experiment Name', 'IEMFileVersion', 'Lane Summary',
        '"GSF2124 Lennon Plates 5-6', 'GSF2124-plates5-6-run5 Summary', '',
        'Chemistry', 'GSF2056-run2-plates3-4 Lennon Summary',
        'GSF2056-run1-plates1-2 Lennon/Shoemaker Summary'
    ]

    GSF_bc_dict = {}
    df_out = open(
        pt.get_path() + '/data/library_metadata/' + 'new_sample_names.txt',
        'w')
    meta_path = open(
        pt.get_path() + '/data/library_metadata/' + 'sample_names.txt', 'r')
    for GSF_file in GSF_files:
        GSF_file_ = open(
            pt.get_path() + '/data/library_metadata/' + GSF_file + '.csv', 'r')
        for GSF_line in GSF_file_:
            GSF_line = GSF_line.strip()  #.split(',')
            if len(GSF_line) < 20:
                continue
            GSF_line = GSF_line.split(',')
            if GSF_line[0] in ignore_lines:
                continue
            if GSF_line[2] == 'Undetermined':
                continue
            if GSF_line[0] == '1':
                GSF_line = GSF_line[2:]

            GSF_bc_dict[GSF_line[0]] = {}
            if '+' in GSF_line[1]:
                BC_split = GSF_line[1].split('+')
                GSF_BC1 = BC_split[0]
                GSF_BC2 = BC_split[1]
            else:
                GSF_BC1 = GSF_line[5]
                GSF_BC2 = GSF_line[7]
            GSF_bc_dict[GSF_line[0]]['BC1'] = GSF_BC1
            GSF_bc_dict[GSF_line[0]]['BC2'] = GSF_BC2

    for line in meta_path:
        line = line.strip()
        line_dash = line.split('/')
        run = line_dash[0]
        if 'GSF' not in run:
            run = 'HCGS' + run
        if '_' in run:
            run = run.replace('_', '-')
        file_name = line_dash[-1]
        file_name_spl = re.split('-|_', file_name)
        if file_name_spl[0] == 'GSF2124':
            gsf_bc_key = file_name.rsplit('_', 3)[0]
            BC1 = GSF_bc_dict[gsf_bc_key]['BC1']
            BC2 = GSF_bc_dict[gsf_bc_key]['BC2']
            if len(file_name_spl) == 9:
                pop = file_name_spl[4]
                day = file_name_spl[5][1:]
                R = file_name_spl[-2]

            elif len(file_name_spl) == 10:
                pop = file_name_spl[3] + file_name_spl[4] + file_name_spl[5]
                day = file_name_spl[6]
                R = file_name_spl[-2]

            elif len(file_name_spl) == 11:
                pop = file_name_spl[4] + file_name_spl[5] + file_name_spl[6]
                day = file_name_spl[7]
                R = file_name_spl[-2]

        elif file_name_spl[0] == 'GSF2056':
            gsf_bc_key = file_name.rsplit('_', 3)[0]
            BC1 = GSF_bc_dict[gsf_bc_key]['BC1']
            BC2 = GSF_bc_dict[gsf_bc_key]['BC2']
            if len(file_name_spl) == 13:
                pop = file_name_spl[3] + file_name_spl[7] + file_name_spl[8]
                day = file_name_spl[9]
                R = file_name_spl[-2]
                end = file_name_spl[-1]

            elif len(file_name_spl) == 14:
                pop = file_name_spl[4] + file_name_spl[8] + file_name_spl[9]
                day = file_name_spl[10]
                R = file_name_spl[-2]
                end = file_name_spl[-1]

        elif 'HCGS' in run:
            if len(file_name_spl) == 8:
                pop = file_name_spl[0]
                day = file_name_spl[1]
                BC1 = file_name_spl[3]
                BC2 = file_name_spl[4]
                R = file_name_spl[-2]
                end = file_name_spl[-1]

            elif len(file_name_spl) == 9:
                pop = file_name_spl[1] + file_name_spl[2]
                day = file_name_spl[3][1:]
                BC1 = file_name_spl[4]
                BC2 = file_name_spl[5]
                R = file_name_spl[-2]
                end = file_name_spl[-1]

            elif len(file_name_spl) == 6:
                pop = file_name_spl[0][1:]
                day = '100'
                BC1 = file_name_spl[1]
                BC2 = file_name_spl[2]
                R = file_name_spl[4]
                end = file_name_spl[-1]

            elif len(file_name_spl) == 7:
                pop = file_name_spl[0]
                day = file_name_spl[1]
                BC1 = file_name_spl[2]
                BC2 = file_name_spl[3]
                R = file_name_spl[5]
                end = file_name_spl[-1]
        if 'L' in pop:
            pop = pop.replace('L', '')
        new_name = '_'.join([run, pop, day, BC1, BC2, R, end])