Example #1
def annotate_significant_genes():
    total_parallelism = open(lt.get_path() + '/data/breseq/gene_annotation.txt', "w")
    total_parallelism.write("\t".join(["Species", "locus_tag", "refseq_id", "annotation"]) +"\n" )
    taxa = ['ATCC13985', 'KBS0702', 'KBS0707', 'KBS0711', 'KBS0715',
                    'KBS0721', 'KBS0722', 'KBS0724', 'KBS0801']
    for taxon in taxa:
        locus_tags = []
        for line in open(lt.get_path() + '/data/breseq/mult_genes_nonsyn_sig/' + taxon + '.txt', 'r'):
            line_split = line.strip().split(',')
            if line_split[0] == 'Gene':
                continue
            locus_tags.append(line_split[0])
        # the refseq annotations don't map to KEGG annotated genes in the maple pathways
        # can't complete that analysis, just focus on refseq annotations
        # make refseq => KEGG dict
        #refseq_kegg_dict = {}
        #for line in open(lt.get_path() + '/data/genomes/genomes_ncbi_maple/' + taxon + '_MAPLE_result/query.fst.ko', 'r'):
        #    line_split = line.strip().split('\t')
        #    refseq_kegg_dict[line_split[0]] = line_split[1]
        ## get list of MAPLe modules to keep
        #df_modules = pd.read_csv(lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/' + taxon +'_maple_modules.txt', sep = '\t')
        #df_modules_mcr = df_modules.loc[df_modules['query(coverage)'] >= MCR]
        #modules_to_keep = df_modules_mcr.Pathway_ID.tolist()
        ## make KEGG => MAPLE dict
        #kegg_maple_dict = KO_to_module(taxon, modules_to_keep)
        # make locus tag  => refseq dict
        locus_tag_refseq_dict = {}
        for subdir, dirs, files in os.walk(lt.get_path() + '/data/genomes/genomes_ncbi/' + taxon):
            for file in files:
                if file.endswith('.gbff'):
                    with open(os.path.join(subdir, file), "rU") as input_handle:
                        for record in SeqIO.parse(input_handle, "genbank"):
                            for feature in record.features:
                                if feature.type != 'CDS':
                                    continue
                                if 'incomplete' in feature.qualifiers['note'][0]:
                                    continue
                                if 'frameshifted' in feature.qualifiers['note'][0]:
                                    continue
                                if 'internal stop' in feature.qualifiers['note'][0]:
                                    continue
                                gene_name = feature.qualifiers['locus_tag'][0]
                                inference = feature.qualifiers['inference'][0]
                                product = feature.qualifiers['product'][0]
                                if 'RefSeq' in inference:
                                    locus_tag_refseq_dict[gene_name] = [inference.split(':')[-1], product]

        # finally, get the RefSeq annotation for each gene with a significant number of mutations
        for locus_tag in locus_tags:
            if locus_tag not in locus_tag_refseq_dict:
                continue
            refseq_annotation = locus_tag_refseq_dict[locus_tag]

            refseq_name = refseq_annotation[0].replace("_", "")

            total_parallelism.write("\t".join([taxon, locus_tag, refseq_name, refseq_annotation[1]]) + '\n')
    total_parallelism.close()
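
# One fragility in the GenBank parsing above: not every CDS feature carries a
# 'note' qualifier, so feature.qualifiers['note'][0] can raise KeyError on some
# files. A defensive variant of that filter (a sketch, not the project's code):
def cds_is_flagged(qualifiers):
    note = qualifiers.get('note', [''])[0]
    return any(flag in note for flag in ('incomplete', 'frameshifted', 'internal stop'))

# e.g. cds_is_flagged({'note': ['incomplete; partial on complete genome']}) -> True
# e.g. cds_is_flagged({'locus_tag': ['ABC_0001']}) -> False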
Example #2
def get_sites_to_remove(taxon):
    to_keep_samples = get_breseq_samples_to_keep()
    taxon_sites = []
    taxon_samples = [ x for x in to_keep_samples if x.startswith(taxon) ]
    fixed = []
    # first list all sites that are fixed in all replicate populations
    # these are most likely fixed in the ancestor
    for taxon_sample in taxon_samples:
        taxon_sample_sites = []
        for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
            line_split = line.strip().split('\t')
            if line_split[0] in output_to_keep:  # output_to_keep is a module-level list, e.g. ['INS', 'DEL', 'SNP']
                # a lot of mutations at the first base of each contig, ignore these
                if line_split[4] == '1':
                    continue
                freq = float([x for x in line_split if 'frequency=' in x][0].split('=')[1])
                if freq == 1:
                    fixed.append(line_split[3] + '_' + str(line_split[4]))
                taxon_sample_sites.append( line_split[3] + '_' + str(line_split[4]))

        taxon_sites.extend(list(set( taxon_sample_sites )))

    count_fixed = Counter(fixed)
    count_fixed_all_reps = dict((k, v) for k, v in count_fixed.items() if v == len(taxon_samples))
    sites_to_remove_all_fixed = list(count_fixed_all_reps.keys())

    # see how many fixations have VARIANT_STRAND_COVERAGE flag
    # copy dict
    flag_fixed = copy.deepcopy(count_fixed)
    flag_fixed = {key:val for key, val in flag_fixed.items() if val < len(taxon_samples)-1}

    for taxon_sample in taxon_samples:
        for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
            line_split = line.strip().split('\t')
            if line_split[0] == 'RA':
                if ('VARIANT_STRAND_COVERAGE' in line) or ('SURROUNDING_HOMOPOLYMER' in line):
                    contig_site = line_split[3] + '_' + str(line_split[4])
                    if contig_site in flag_fixed:
                        del flag_fixed[contig_site]

    # everything breseq is calling as a fixed mutation has one of these flags

    counts_all = Counter(taxon_sites)
    count_dict_to_remove = dict((k, v) for k, v in counts_all.items() if (v > 1 ) )
    sites_to_remove = list(count_dict_to_remove.keys())
    sites_to_remove_all = list(set(sites_to_remove + sites_to_remove_all_fixed))
    #print(taxon + ' proportion sites removed ' + str(round(len(sites_to_remove)/ len(counts_all.keys()), 3 )) )
    return sites_to_remove_all
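
# A toy illustration of the Counter logic above (self-contained sketch): a site
# only counts as ancestrally fixed if it is fixed in every replicate population.
from collections import Counter

fixed_demo = ['contig1_500', 'contig1_500', 'contig2_42']  # two replicate populations
count_fixed_demo = Counter(fixed_demo)
# contig1_500 is fixed in both replicates, so only it would be removed
assert [k for k, v in count_fixed_demo.items() if v == 2] == ['contig1_500']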
Example #3
def piecewise_regression():

    df = pd.read_csv(
        lt.get_path() +
        '/data/demography/longtermdormancy_20190528_nocomments.csv',
        sep=',')
    # KBS0721 rep 3
    df['N'] = (df['Colonies'] + 1) * (1000 /
                                      df['Inoculum']) * (10**(df['Dilution']))

    df['Dormstart_date'] = pd.to_datetime(df['Dormstart_date'],
                                          format='%d-%b-%y')
    df['Firstread_date'] = pd.to_datetime(df['Firstread_date'],
                                          format='%d-%b-%y')
    df['Days'] = df['Firstread_date'].sub(df['Dormstart_date'], axis=0)
    df['Days'] = df['Days'].dt.days.astype('int')

    df_test = df[(df["Strain"] > 'KBS0721') & (df["Rep"] == 3)]

    x = df_test.Days.values
    y = np.log10(df_test.N.values)
    my_pwlf = pwlf.PiecewiseLinFit(x, y)

    # fit the data with two line segments
    res = my_pwlf.fit(2)

    # predict for the determined points
    xHat = np.linspace(min(x), max(x), num=10000)
    yHat = my_pwlf.predict(xHat)
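
    # The fit also exposes breakpoints and per-segment slopes, which downstream
    # survival analyses typically need (a sketch of the pwlf API; names as above):
    # breaks = my_pwlf.fit(2)          # breakpoint locations, endpoints included
    # slopes = my_pwlf.calc_slopes()   # one slope per fitted segment
    # r_sq = my_pwlf.r_squared()       # goodness of fit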
Example #4
def clean_iRep(cutoff=2.5):  # note: the cutoff parameter is currently unused in the body
    # very low coverage for these taxa
    to_remove = ['KBS0705', 'KBS0706']
    directory = os.fsencode(lt.get_path() + '/data/iRep')
    df_out = open(lt.get_path() + '/data/iRep_clean.txt', 'w')
    header = ['Sample', 'Species', 'rep' ,'iRep']
    df_out.write('\t'.join(header) + '\n')
    iRep_corrected_dict = {}
    iRep_uncorrected_dict = {}
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith('.tsv'):
            iRep_path = os.path.join(str(directory, 'utf-8'), filename)
            strain = re.split(r'[.-]+', filename)[0]
            strain_rep = re.split(r'[.]+', filename)[0]
            if strain in to_remove:
                continue
            if 'W' in strain_rep:
                continue
            strain_rep = strain_rep[:-1] + str(lt.rename_rep()[strain_rep[-1]])
            if strain_rep == 'ATCC13985-4':
                continue
            for i, line in enumerate(open(iRep_path, 'r')):
                if i == 2:
                    last_item = line.strip().split()[-1]
                    if last_item == 'n/a':
                        iRep_corrected = float('nan')
                    else:
                        iRep_corrected = float(last_item)
                    iRep_corrected_dict[strain_rep] = [iRep_corrected]
                elif i == 6:
                    iRep_uncorrected = float(line.strip().split()[-1])
                    iRep_uncorrected_dict[strain_rep] = [iRep_uncorrected]
    for key, value in iRep_corrected_dict.items():
        value.extend(iRep_uncorrected_dict[key])
    for key, value in iRep_corrected_dict.items():
        if value[1] > 11:
            continue

        if math.isnan(value[0]):
            iRep = value[1]
        else:
            iRep = value[0]
        out_line = [key, key.split('-')[0], key.split('-')[1], str(iRep)]
        df_out.write('\t'.join(out_line) + '\n')

    df_out.close()
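
# A toy illustration of the filename parsing above (self-contained sketch;
# 'KBS0711-A.sorted.tsv' is a hypothetical file name): splitting on '.' and '-'
# separates the strain from the strain-replicate prefix.
import re

assert re.split(r'[.-]+', 'KBS0711-A.sorted.tsv')[0] == 'KBS0711'
assert re.split(r'[.]+', 'KBS0711-A.sorted.tsv')[0] == 'KBS0711-A'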
Example #5
def get_assembly_coverage():
    df_out = open(lt.get_path() + '/data/genomes/assembly_coverage.txt', 'w')
    df_out.write('\t'.join(['Species', 'mean_coverage']) + '\n')
    assembly_path = lt.get_path() + '/data/genomes/nanopore_hybrid/'
    for file in os.listdir(assembly_path):
        filename = os.fsdecode(file)
        if filename.endswith('.fasta'):
            strain = filename.split('.')[0]
            print(strain)
            fa = lt.classFASTA(assembly_path+filename).readFASTA()
            # SPAdes-style headers (NODE_<n>_length_<len>_cov_<cov>); keep contigs > 200 bp
            fa_headers = [x[0].split('_') for x in fa]
            fa_headers = [x for x in fa_headers if int(x[3]) > 200]
            size = sum(int(x[3]) for x in fa_headers)
            weighted_mean_cov = (sum( [int(x[3]) * float(x[5]) for x in fa_headers]) / size)
            df_out.write('\t'.join([ strain, str(round(weighted_mean_cov, 3)) ]) + '\n')

    df_out.close()
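
# Worked toy example of the length-weighted mean coverage above, for two
# hypothetical contigs with (length, coverage) = (1000, 50.0) and (3000, 10.0):
#   size = 1000 + 3000 = 4000
#   weighted_mean_cov = (1000*50.0 + 3000*10.0) / 4000 = 20.0
# i.e., long low-coverage contigs dominate the mean.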
Example #6
def get_16S_copy_number():
    genome_path = lt.get_path() + '/data/genomes/genomes_ncbi/'
    df_out = open(lt.get_path() + '/data/count_16S.txt', 'w')
    header = ['Species', 'Number_16S']
    df_out.write('\t'.join(header) + '\n')
    for subdir, dirs, files in os.walk(genome_path):
        for file in files:
            if file.endswith('.gbff'):
                strain = subdir.split('/')[-1]
                count_16S = 0
                with open(os.path.join(subdir, file), "rU") as input_handle:
                    for record in SeqIO.parse(input_handle, "genbank"):
                            for feature in record.features:
                                if feature.type == 'rRNA':
                                    if feature.qualifiers['product'][0] == '16S ribosomal RNA':
                                        count_16S+=1
                df_out.write('\t'.join([strain, str(count_16S)]) + '\n')
    df_out.close()
Example #7
def merge_maple(strain):
    maple_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple/'
    IN_maple_sign_path = maple_path + strain + '_MAPLE_result/' + 'module_signature.tsv'
    IN_maple_sign = pd.read_csv(IN_maple_sign_path, sep = '\t')
    IN_maple_cmplx_path = maple_path + strain + '_MAPLE_result/' + 'module_complex.tsv'
    IN_maple_cmplx = pd.read_csv(IN_maple_cmplx_path, sep = '\t')
    IN_maple_pthwy_path = maple_path + strain + '_MAPLE_result/' + 'module_pathway.tsv'
    IN_maple_pthwy = pd.read_csv(IN_maple_pthwy_path, sep = '\t')
    IN_maple_fxn_path = maple_path + strain + '_MAPLE_result/' + 'module_function.tsv'
    IN_maple_fxn = pd.read_csv(IN_maple_fxn_path, sep = '\t')
    df_list = [IN_maple_fxn, IN_maple_cmplx, IN_maple_pthwy, IN_maple_sign]
    df_merged = pd.concat(df_list)  # DataFrame.append was removed in pandas 2.0
    # add column with pathway ID
    df_merged['Pathway_ID'] = df_merged['ID'].apply(lambda x: x.split('_')[0])
    df_merged_no_dup = df_merged.drop_duplicates(subset='Pathway_ID', keep="last")
    df_merged_no_dup = df_merged_no_dup.reset_index(drop=True)
    # median = median MCR
    OUT_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/' + strain + '_maple_modules.txt'
    df_merged_no_dup.to_csv(OUT_path, sep = '\t', index = False)
Example #8
def merge_maple_all_strains(MCR = 0.8):
    dfs = []
    maple_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/'
    for filename in os.listdir(maple_path):
        if filename.endswith("_maple_modules.txt"):
            df = pd.read_csv(maple_path + filename, sep = '\t')
            strain = filename.split('_')[0]
            df['Strain'] = strain
            dfs.append(df)

    dfs_concat = pd.concat(dfs)
    dfs_concat = dfs_concat.reset_index(drop=True)
    # remove rows that are less than 80% complete
    # query(coverage) = MCR % (ITR)
    # query(coverage/max) = MCR % (WC)
    # query(coverage/mode) = Q-value
    dfs_concat_mcr = dfs_concat.loc[dfs_concat['query(coverage)'] >= MCR]
    module_by_taxon = pd.crosstab(dfs_concat_mcr.Pathway_ID, dfs_concat_mcr.Strain)
    module_by_taxon_no_redundant = module_by_taxon[(module_by_taxon.T != 1).any()]  # note: currently unused
    OUT_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple.txt'
    module_by_taxon.to_csv(OUT_path, sep = '\t', index = True)
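
# A toy illustration of the pd.crosstab call above (self-contained sketch):
# rows are pathway modules, columns are strains, entries are counts.
import pandas as pd

demo = pd.DataFrame({'Pathway_ID': ['M00001', 'M00001', 'M00002'],
                     'Strain': ['KBS0702', 'KBS0711', 'KBS0702']})
demo_tab = pd.crosstab(demo.Pathway_ID, demo.Strain)
# demo_tab:
# Strain      KBS0702  KBS0711
# Pathway_ID
# M00001            1        1
# M00002            1        0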
Example #9
def KO_to_module(strain, modules_to_keep = None):
    kaas_directory = lt.get_path() + '/data/genomes/genomes_ncbi_maple/' + strain + '_MAPLE_result/KAAS'
    bad_chars = '()+,-'  # '-' placed last so the regex character class below treats it literally
    rgx = re.compile('[%s]' % bad_chars)
    kegg_maple_dict = {}
    for filename in os.listdir(kaas_directory):
        if filename.endswith("_matrix.txt"):
            for line in open((os.path.join(kaas_directory, filename)), 'r'):
                line_strip_split = line.strip().split()
                if len(line_strip_split) > 2 and 'M' in line_strip_split[0]:
                    if '_' in line_strip_split[0]:
                        pathway = line_strip_split[0].split('_')[0]
                    else:
                        pathway = line_strip_split[0]
                    # ignore modules that don't meet the MCR threshold
                    if modules_to_keep is not None:
                        if pathway not in modules_to_keep:
                            continue
                    ko_genes = line_strip_split[2:]
                    for ko_gene in ko_genes:
                        test_set_member = [bad_char for bad_char in bad_chars if bad_char in ko_gene]
                        if len(test_set_member) > 0:
                            ko_gene_clean = rgx.sub('', ko_gene)
                            ko_gene_clean_split =  ['K' + e for e in ko_gene_clean.split('K') if e]
                            for split_gene in ko_gene_clean_split:
                                if 'M' in split_gene:
                                    continue
                                if split_gene in kegg_maple_dict:
                                    kegg_maple_dict[split_gene].append(pathway)
                                else:
                                    kegg_maple_dict[split_gene] = [pathway]
                        else:
                            if 'K' in ko_gene:
                                if ko_gene in kegg_maple_dict:
                                    kegg_maple_dict[ko_gene].append(pathway)
                                else:
                                    kegg_maple_dict[ko_gene] = [pathway]

    return kegg_maple_dict
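
# A toy illustration of the KO-string cleanup above (self-contained sketch):
# matrix entries can fuse several KO ids with punctuation, e.g. '(K00001,K00002)';
# stripping the punctuation and re-splitting on 'K' recovers the individual ids.
import re

rgx_demo = re.compile('[%s]' % '()+,-')
cleaned = rgx_demo.sub('', '(K00001,K00002)')
assert ['K' + e for e in cleaned.split('K') if e] == ['K00001', 'K00002']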
Example #10
def get_breseq_samples_to_keep(cov_min=50):
    json_path = lt.get_path() + '/data/breseq/summary/'
    to_keep = []
    for filename in os.listdir(json_path):
        if filename.endswith(".json") == False:
            continue
        if 'ATCC43928' in filename:
            continue
        if 'KBS0727' in filename:
            continue
        with open(json_path + filename) as f:
            data = json.load(f)
            contigs = list(data['references']['reference'].keys())
            coverages = []
            for contig in contigs:
                if data['references']['reference'][contig]['length'] < 300:
                    continue
                coverages.append(data['references']['reference'][contig]['coverage_average'] )
            mean_cov = np.mean(coverages)
            if mean_cov > cov_min:
                to_keep.append(filename.split('.')[0])
    return to_keep
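
# A toy illustration of the coverage filter above (self-contained sketch; the
# nested keys mirror the ones the function accesses, the numbers are made up):
import numpy as np

data_demo = {'references': {'reference': {
    'contig_1': {'length': 5000, 'coverage_average': 80.0},
    'short_frag': {'length': 120, 'coverage_average': 5.0},  # skipped: < 300 bp
}}}
covs_demo = [v['coverage_average']
             for v in data_demo['references']['reference'].values()
             if v['length'] >= 300]
assert np.mean(covs_demo) > 50  # this sample would be kept with cov_min=50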
Example #11
# (this excerpt begins mid-call; the opening of the ax_mttd.text(...) call was
# truncated — the coordinates are assumed to match the ax_ext call below)
ax_mttd.text(-0.1,
             1.07,
             'c',
             fontsize=13,
             fontweight='bold',
             ha='center',
             va='center',
             transform=ax_mttd.transAxes)
ax_ext.text(-0.1,
            1.07,
            'd',
            fontsize=13,
            fontweight='bold',
            ha='center',
            va='center',
            transform=ax_ext.transAxes)

df_weibull = pd.read_csv(lt.get_path() +
                         '/data/demography/weibull_results_clean.csv',
                         sep=',')
df_CIs = pd.read_csv(lt.get_path() + '/data/demography/model_CIs.csv', sep=',')

model_features = open(lt.get_path() + '/data/demography/model_features.csv',
                      'r')
model_features.readline()
model_features_dict = {}
for line in model_features:
    line = line.strip().replace('"', '').split(',')
    model_features_dict[line[0]] = float(line[1])
model_features.close()

taxa = list(set(df_weibull.strain.to_list()))
Example #12
import matplotlib.ticker
import datetime as dt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant g scores and at least 100 mutations

df_irep = pd.read_csv(lt.get_path() + '/data/iRep_clean.txt', sep='\t')
df_irep = df_irep.rename(columns={'Species': 'strain'})
df_weib = pd.read_csv(lt.get_path() +
                      '/data/demography/weibull_results_clean.csv',
                      sep=',')
df_merged = df_weib.merge(df_irep, on=['strain', 'rep'])
taxa = list(set(df_merged.strain.to_list()))

df_merged['alpha_log10'] = np.log10(df_merged.alpha)

mf = smf.mixedlm("alpha_log10 ~ iRep", df_merged, groups=df_merged["strain"])
mf_fit = mf.fit()
print(mf_fit.summary())

irep_mean_list = []
shape_mean_list = []
Example #13
# (this excerpt begins inside a loop whose header was truncated; the body
# suggests it iterates over candidate death rates d1 with index d1_idx,
# assuming d1_list, N1, N2, d2, colors, and calculate_bi_exponential are
# defined earlier in the script)
for d1_idx, d1 in enumerate(d1_list):
    N_list = []
    t_list = list(range(1000))

    for t in t_list:

        N_list.append(calculate_bi_exponential(N1, N2, d1, d2, t))

    print(colors[d1_idx])

    plt.plot(t_list,
             N_list,
             zorder=2,
             ls='--',
             label=r'$d_{1}/d_{2}=$' + str(d1 / d2),
             c=colors[d1_idx],
             lw=2)
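
# calculate_bi_exponential is not shown in this excerpt. A minimal sketch
# consistent with the call above and the d1/d2 legend label — two
# subpopulations decaying at different rates (an assumption, not the project's
# verified implementation; in a runnable script it would be defined before the
# loop above):
def calculate_bi_exponential(N1, N2, d1, d2, t):
    # N(t) = N1 * exp(-d1 * t) + N2 * exp(-d2 * t)
    return N1 * np.exp(-d1 * t) + N2 * np.exp(-d2 * t)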

plt.xlabel('Days, ' + r'$t$', fontsize=16)
plt.ylabel('Population size, ' + '$N(t)$', fontsize=16)
plt.yscale('log', base=10)

plt.legend(loc='upper right', prop={'size': 8})

#fig.savefig(lt.get_path() + '/figs/spoiie_death_curve.pdf', format='pdf', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
fig.savefig(lt.get_path() + '/figs/test_exponential.pdf',
            format='pdf',
            bbox_inches="tight",
            pad_inches=0.4,
            dpi=600)

plt.close()
Example #14
import matplotlib.ticker
import datetime as dt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant g scores and at least 100 mutations

df = pd.read_csv(lt.get_path() + '/data/staining.all.new.txt', sep='\t')
df = df[df.strain != "KBS0725"]
taxa = list(set(df.strain.to_list()))
to_remove = ['KBS0711W', 'KBS0727', 'KBS0714', 'KBS0701']
taxa = [x for x in taxa if x not in to_remove]
df_anc = df.loc[df['hist'] == 'anc']
df_der = df.loc[df['hist'] == 'der']
fig = plt.figure()
plt.axvline(1, color='dimgrey', lw=2, ls='--', zorder=1)
# first sort by mean
mean_list = []
for taxon in taxa:
    df_der_dead = df_der.loc[df_der['strain'] == taxon].dead.values
    delta_dead = df_der_dead - df_anc.loc[df_anc['strain'] ==
                                          taxon].dead.values[0]
    delta_dead_mean = np.mean(delta_dead)
Example #15
import matplotlib.lines as mlines

import matplotlib.ticker
import datetime as dt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

df_colors = pd.read_csv(lt.get_path() + '/data/colors.csv', sep=',')

import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


#  weibull
def log_weibull(t, d_0, k):
    t = np.asarray(t)
    return np.exp(-1 * ((t * d_0)**k))
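
# A useful corollary: setting exp(-(t * d_0)**k) = 1/2 and solving gives the
# half-life t_half = (log(2)**(1/k)) / d_0, handy for sanity-checking fitted
# (d_0, k) pairs. E.g. d_0 = 0.01, k = 1.0 gives t_half = log(2)/0.01 ≈ 69.3 days.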


class log_weibull_model(GenericLikelihoodModel):
    def __init__(self, endog, exog, **kwds):
        # (body truncated in this excerpt; GenericLikelihoodModel subclasses
        # conventionally just forward to the parent constructor)
        super(log_weibull_model, self).__init__(endog, exog, **kwds)
Example #16
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

to_remove_KBS0711 = [10, 11, 12]
all_taxa = [
    'KBS0703', 'ATCC13985', 'ATCC43928', 'KBS0701', 'KBS0702', 'KBS0705',
    'KBS0706', 'KBS0707', 'KBS0710', 'KBS0711', 'KBS0712', 'KBS0713',
    'KBS0714', 'KBS0715', 'KBS0721', 'KBS0722', 'KBS0724', 'KBS0725',
    'KBS0801', 'KBS0802', 'KBS0812'
]
df_counts = pd.read_csv(
    lt.get_path() +
    '/data/demography/longtermdormancy_20190528_nocomments.csv',
    sep=',')
df_counts['Abund'] = (df_counts.Colonies.values + 1) * (
    1000 / df_counts.Inoculum.values) * (10**df_counts.Dilution.values)
df_counts['Dormstart_date'] = pd.to_datetime(df_counts['Dormstart_date'])
df_counts['Firstread_date'] = pd.to_datetime(df_counts['Firstread_date'])
df_counts['Days'] = df_counts['Firstread_date'] - df_counts[
    'Dormstart_date'] + dt.timedelta(days=1)

fig, ax = plt.subplots(figsize=(4, 4))
fig.subplots_adjust(hspace=0.35, wspace=0.35)

for taxon in all_taxa:
    #taxon = 'KBS0812'
    # (statement truncated in this excerpt; it looks up the plotting color for
    # the current taxon from lt.df_colors; completed minimally here)
    taxon_color = lt.df_colors.loc[lt.df_colors['strain'] == taxon]
Example #17
# (the head of this taxa list was truncated; it is assumed to match the list
# used in Example #1)
taxa = ['ATCC13985', 'KBS0702', 'KBS0707', 'KBS0711', 'KBS0715', 'KBS0721',
    'KBS0722', 'KBS0724', 'KBS0801'
]
maple_types = ['signature', 'complex', 'pathway', 'function']

MCR = 0.8

kegg_dict_count = {}
maple_dict_count = {}
maple_annotation_dict = {}

treatment_count_dict = {}

for taxon in taxa:
    # make refseq to protein ID dict
    refseq_to_protein_dict = {}
    for subdir, dirs, files in os.walk(lt.get_path() +
                                       '/data/genomes/genomes_ncbi/' + taxon):
        for file in files:
            if file.endswith('.gbff'):
                with open(os.path.join(subdir, file), "rU") as input_handle:
                    for record in SeqIO.parse(input_handle, "genbank"):
                        for feature in record.features:
                            if feature.type != 'CDS':
                                continue
                            if 'incomplete' in feature.qualifiers['note'][0]:
                                continue
                            if 'frameshifted' in feature.qualifiers['note'][0]:
                                continue
                            if 'internal stop' in feature.qualifiers['note'][0]:
                                continue
Example #18
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant g scores and at least 100 mutations

n_bins = 20

afs_taxa_reps = [
    x.split('.')[0]
    for x in os.listdir(lt.get_path() + '/data/breseq/allele_freq_spec/')
    if x.endswith(".txt")
]
afs_taxa = list(
    set([
        x.split('.')[0].split('-')[0]
        for x in os.listdir(lt.get_path() + '/data/breseq/allele_freq_spec/')
        if x.endswith(".txt")
    ]))
_nsre = re.compile('([0-9]+)')


def natural_sort_key(s):
    return [
        int(text) if text.isdigit() else text.lower()
        for text in re.split(_nsre, s)
    ]
Example #19
# (this excerpt begins mid-call; the opening of the panel-label text call was
# truncated — the coordinates and the letter 'c' are assumptions by analogy
# with the ax_likelihood call below)
ax_KBS0812.text(-0.1,
                1.07,
                'c',
                fontsize=13,
                fontweight='bold',
                ha='center',
                va='center',
                transform=ax_KBS0812.transAxes)
ax_likelihood.text(-0.1,
                   1.07,
                   'd',
                   fontsize=13,
                   fontweight='bold',
                   ha='center',
                   va='center',
                   transform=ax_likelihood.transAxes)

df_counts = pd.read_csv(
    lt.get_path() +
    '/data/demography/longtermdormancy_20190528_nocomments.csv',
    sep=',')
df_counts['Abund'] = (df_counts.Colonies.values + 1) * (
    1000 / df_counts.Inoculum.values) * (10**df_counts.Dilution.values)
df_counts['Dormstart_date'] = pd.to_datetime(df_counts['Dormstart_date'])
df_counts['Firstread_date'] = pd.to_datetime(df_counts['Firstread_date'])
df_counts['Days'] = df_counts['Firstread_date'] - df_counts[
    'Dormstart_date'] + dt.timedelta(days=1)
df_stats = pd.read_csv(lt.get_path() +
                       '/data/demography/weibull_results_clean.csv',
                       sep=',')

df_counts_KBS0714 = df_counts.loc[((df_counts['Strain'] == 'KBS0714') &
                                   (df_counts['Rep'] == 4))]
df_counts_KBS0703 = df_counts.loc[((df_counts['Strain'] == 'KBS0703') &
Example #20
def get_diversity_stats(afs_cutoff=30, mean_mut_cutoff=20):
    df_out = open(lt.get_path() + '/data/breseq/genetic_diversity.txt', 'w')
    df_out_header = ['Species', 'sample', 'rep', 'mean_freq', 'max_freq', \
                    'pi', 'theta', 'tajimas_d', 'dn_ds_total', \
                    'mean_N_mut', 'mean_binary_divisions', 'mean_gen_per_day', 'mean_birth_per_death', \
                    'max_N_mut', 'max_binary_divisions', 'max_gen_per_day', 'max_birth_per_death']

    df_out.write('\t'.join(df_out_header) + '\n')
    # pass a nested list with frequency, coverage of major, coverage of minor, taxon
    #output_to_keep = ['INS', 'DEL', 'SNP']
    to_keep_samples = get_breseq_samples_to_keep()
    to_keep_taxa = get_breseq_taxa_to_keep()
    # all the diversity measures
    taxa_all = []
    n_muts_all = []
    mean_freq_list_all = []
    max_freq_list_all = []
    pi_list_all = []
    theta_list_all = []
    TD_list_all = []
    dnds_total_list_all = []
    tt_all = []
    p_value_all = []
    n_reps_all = []
    n_syn_non_muts_all = []

    # for tajimas d file
    n_reps_td_all = []
    tt_td_all = []
    p_value_td_all = []


    mean_N_mut_all = []
    max_N_mut_all = []
    binary_divisions_mean_all = []
    binary_divisions_max_all = []
    b_div_d_mean_all = []
    b_div_d_max_all = []
    mean_gen_per_day_all =  []
    max_gen_per_day_all = []
    for taxon in to_keep_taxa:
        if taxon == 'KBS0727':
            continue
        print(taxon)
        #effective_gene_lengths, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        taxon_samples = [ x for x in to_keep_samples if x.startswith(taxon) ]
        sites_to_remove = get_sites_to_remove(taxon)
        genome_size = lt.get_genome_size_dict()[taxon]
        # list of diversity statistics
        mean_freq_list = []
        max_freq_list = []
        pi_list = []
        theta_list = []
        TD_list = []
        dnds_total_list = []
        n_muts_list = []
        n_syn_non_muts_list = []

        mean_N_mut_list = []
        max_N_mut_list = []
        binary_divisions_mean_list = []
        binary_divisions_max_list = []
        b_div_mean_d_list = []
        b_div_max_d_list = []
        mean_gen_per_day_list =  []
        max_gen_per_day_list = []
        for taxon_sample in taxon_samples:
            if taxon_sample == 'KBS0711-K':
                continue
            n_0_c, n_c = lt.get_init_final_pop_size(taxon_sample)
            # get SNP identifiers
            SNP_IDs = []
            fixed_SNP_IDs = []
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/output/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                if line_split[0] == 'SNP':
                    if line_split[3] + '_' + line_split[4] in sites_to_remove:
                        continue

                    # these are fixed in the ancestor, don't count as real fixations
                    # fixed mutations don't count towards polymorphisms
                    if float(line_split[6].split('=')[1]) == float(1):
                        fixed_SNP_IDs.append(line_split[2])
                    else:
                        SNP_IDs.append(line_split[2])

            # go back through the file again and get the coverage info from RA lines
            freq_list = []
            n_muts = 0
            print(taxon_sample, len(fixed_SNP_IDs), fixed_SNP_IDs)
            #for i, line in enumerate(open(lt.get_path() + '/data/breseq/output/' + taxon_sample + '.gd', 'r')):
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                #if (line_split[0] == 'RA') and (line_split[1] in SNP_IDs):
                if (line_split[0] in output_to_keep) and (line_split[2] in SNP_IDs):
                    #major_cov = int(line_split[15].split('=')[1].split('/')[0]) + int(line_split[15].split('=')[1].split('/')[1])
                    #minor_cov = int(line_split[18].split('=')[1].split('/')[0]) + int(line_split[18].split('=')[1].split('/')[1])
                    #total_cov = int(line_split[-1].split('=')[1].split('/')[0]) + int(line_split[-1].split('=')[1].split('/')[1])
                    #freq = float(line_split[20].split('=')[1])
                    #freq_list.append([freq, type, total_cov, major_cov, minor_cov])

                    freq = float([j for j in line_split if 'frequency=' in j][0].split('=')[1])
                    mut_type = [j for j in line_split if 'mutation_category=' in j][0].split('=')[1]
                    freq_list.append([freq, mut_type])
                    n_muts += 1

            # only look at the AFS in pops with at least afs_cutoff mutations
            if len(freq_list) >= afs_cutoff:
                # print allele frequencies to a file
                df_out_freq_taxa = open(lt.get_path() + '/data/breseq/allele_freq_spec/' + str(taxon_sample) + '.txt', 'w')
                #df_out_freq_taxa.write('\t'.join(['freq', 'total_cov', 'major_cov', 'minor_cov']) + '\n')
                df_out_freq_taxa.write('\t'.join(['frequency', 'mutation_category']) + '\n')
                for freq_list_i in freq_list:
                    #df_out_freq_taxa.write('\t'.join([str(freq_list_i[0]), str(freq_list_i[1]), str(freq_list_i[2]), str(freq_list_i[3])]) + '\n')
                    df_out_freq_taxa.write('\t'.join([str(freq_list_i[0]), str(freq_list_i[1])]) + '\n')
                df_out_freq_taxa.close()

            # only compute mean properties for pops with at least mean_mut_cutoff mutations
            if len(freq_list) < mean_mut_cutoff:
                continue

            n_muts_list.append(n_muts)
            pi = lt.get_pi(freq_list, n_c=n_c, size=genome_size)
            theta = lt.get_theta(freq_list, n_c=n_c, size=genome_size)
            mean_freq = np.mean([ float(i[0]) for i in  freq_list])
            max_freq = max([ float(i[0]) for i in  freq_list])
            # print all frequencies to a file

            # genome size cancels out during the TD calculation
            tajimas_d = lt.get_TD(freq_list=freq_list, pi=pi*genome_size, theta=theta*genome_size, n_c=n_c)
            non_total = 0
            syn_total = 0
            non_fixed = 0
            syn_fixed = 0
            n_syn_non_muts = 0
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                # don't count mutations that may be ancestral
                # don't count mutations in non-coding regions or pseudogene regions
                if (line_split[0] != 'SNP') or ('frequency' in line_split[6]) or (line_split[3] + '_' + line_split[4] in sites_to_remove):
                    continue
                freq = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                n_syn_non_muts += 1
                if freq == float(1):
                    if line_split[6].split('=')[1] == line_split[8].split('=')[1]:
                        syn_fixed += 1
                    else:
                        non_fixed += 1
                if line_split[6].split('=')[1] == line_split[8].split('=')[1]:
                    syn_total += 1
                else:
                    non_total += 1
            # add pseudocount of 1 to each dN/dS term
            n_syn_non_muts_list.append(n_syn_non_muts)
            dnds_total = ((non_total+1)/(syn_total+1))/((Lnon+1)/(Lsyn+1))
            dnds_fixed = ((non_fixed+1)/(syn_fixed+1))/((Lnon+1)/(Lsyn+1))

            mean_freq_list.append(mean_freq)
            max_freq_list.append(max_freq)
            pi_list.append(pi)
            theta_list.append(theta)
            TD_list.append(tajimas_d)
            dnds_total_list.append(dnds_total)
            # number divisions
            mean_N_mut = n_c*mean_freq
            max_N_mut = n_c*max_freq
            binary_divisions_mean = sum([2**i for i in range(int( math.floor(np.log2(mean_N_mut)) ))]) / 2
            binary_divisions_max = sum([2**i for i in range(int( math.floor(np.log2(max_N_mut)) ))]) / 2
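
            # reasoning for the sums above: growing one founder cell into 2^m
            # cells takes sum_{i=0}^{m-1} 2^i = 2^m - 1 binary divisions, with
            # m = floor(log2(N_mut)); the code then halves this count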

            binary_divisions_mean_list.append(binary_divisions_mean)
            binary_divisions_max_list.append(binary_divisions_max)

            mean_N_mut_list.append(mean_N_mut)
            max_N_mut_list.append(max_N_mut)

            #b_div_mean_d = binary_divisions_mean / (n_0_c - n_c)
            b_div_mean_d = binary_divisions_mean /  n_c
            b_div_mean_d_list.append(b_div_mean_d)
            #b_div_max_d = binary_divisions_max / (n_0_c - n_c)
            b_div_max_d = binary_divisions_max /  n_c
            b_div_max_d_list.append(b_div_max_d)

            rep_num = lt.rename_rep()[taxon_sample.split('-')[1]]
            time = lt.get_total_time(taxon_sample)

            # mean_N_mut and max_N_mut were computed above; reuse them here
            mean_gens_per_day = np.log2(mean_N_mut)/time
            max_gens_per_day = np.log2(max_N_mut)/time



            mean_gen_per_day_list.append(mean_gens_per_day)
            max_gen_per_day_list.append(max_gens_per_day)

            df_out_data_list = [taxon, taxon_sample, str(rep_num), str(mean_freq), str(max_freq), \
                                str(pi), str(theta), str(tajimas_d), str(dnds_total), \
                                str(mean_N_mut), str(binary_divisions_mean), str(mean_gens_per_day), str(b_div_mean_d),
                                str(max_N_mut), str(binary_divisions_max), str(max_gens_per_day), str(b_div_max_d)]

            df_out.write('\t'.join(df_out_data_list) + '\n')

        # get taxon level stats
        print(str(len(dnds_total_list)) + " reps")
        # only examine dn/ds for taxa with at least three reps
        if len(dnds_total_list) < 3:
            continue

        n_muts_all.append(np.mean(n_muts_list))
        n_syn_non_muts_all.append(np.mean(n_syn_non_muts_list))
        mean_freq_list_all.append(np.mean(mean_freq_list))
        max_freq_list_all.append(np.mean(max_freq_list))
        mean_N_mut_all.append(np.mean(mean_N_mut_list) )
        max_N_mut_all.append(np.mean(max_N_mut_list) )

        pi_list_all.append(np.mean(pi_list))
        theta_list_all.append(np.mean(theta_list))
        TD_list_all.append(np.mean(TD_list))

        mean_dnds_total = np.mean(dnds_total_list)
        dnds_total_list_all.append(mean_dnds_total)
        binary_divisions_mean_all.append(np.mean(binary_divisions_mean_list))
        binary_divisions_max_all.append(np.mean(binary_divisions_max_list))
        b_div_d_mean_all.append(np.mean(b_div_mean_d_list))
        b_div_d_max_all.append(np.mean(b_div_max_d_list))

        mean_gen_per_day_all.append(np.mean(mean_gen_per_day_list))
        max_gen_per_day_all.append(np.mean(max_gen_per_day_list))

        taxa_all.append(taxon)

        # t > 0, right-tailed t test, use survival function
        # t < 0, left-tailed t test, use CDF
        # or just take absolute value of t and use SF

        tt = (mean_dnds_total-1)/ (np.std(dnds_total_list) / np.sqrt(float(len(dnds_total_list))))
        p_val = t.sf(np.abs(tt), len(dnds_total_list)-1) # one-sided test using |t| and the survival function (see note above)
        n_reps_all.append(len(dnds_total_list))
        tt_all.append(tt)
        p_value_all.append(p_val)


        tt_td = (np.mean(TD_list))/ (np.std(TD_list) / np.sqrt(float(len(TD_list))))
        p_val_td = t.sf(np.abs(tt_td), len(TD_list)-1) # one-sided test using |t| and the survival function (see note above)
        n_reps_td_all.append(len(dnds_total_list))
        tt_td_all.append(tt_td)
        p_value_td_all.append(p_val_td)


    df_out.close()

    reject, pvals_corrected, alphacSidak, alphacBonf = mt.multipletests(p_value_all, alpha=0.05, method='fdr_bh')

    reject_td, pvals_corrected_td, alphacSidak_td, alphacBonf_td = mt.multipletests(p_value_td_all, alpha=0.05, method='fdr_bh')

    # three output files: birth estimates, dN/dS, and Tajima's D per taxon
    df_out_taxa = open(lt.get_path() + '/data/breseq/birth_estimate_taxa.txt', 'w')
    df_out_taxa_header = ['Species', 'mean_n_muts', 'mean_freq', 'max_freq', 'Theta', 'Pi', 'Tajimas_D', \
                        'mean_N_mut', 'mean_binary_divisions', 'mean_gen_per_day', 'mean_birth_per_death', \
                        'max_N_mut', 'max_binary_divisions', 'max_gen_per_day', 'max_birth_per_death']

    df_out_taxa.write('\t'.join(df_out_taxa_header) + '\n')
    for i in range(len(taxa_all)):
        out_list_i = [taxa_all[i], str(n_muts_all[i]), str(mean_freq_list_all[i]), \
            str(max_freq_list_all[i]), str(theta_list_all[i]), str(pi_list_all[i]), str(TD_list_all[i]), \
            str(mean_N_mut_all[i]), str(binary_divisions_mean_all[i]), str(mean_gen_per_day_all[i]), str(b_div_d_mean_all[i]), \
            str(max_N_mut_all[i]), str(binary_divisions_max_all[i]), str(max_gen_per_day_all[i]), str(b_div_d_max_all[i]) ]
        df_out_taxa.write('\t'.join(out_list_i) + '\n')
    df_out_taxa.close()

    df_dNdS_taxa = open(lt.get_path() + '/data/breseq/dN_dS_taxa.txt', 'w')
    df_dNdS_taxa.write('\t'.join(['Species', 'n_reps', 'n_syn_non_muts', 'dN_dS_total', 't_stat', 'p_BH']) + '\n')
    for i in range(len(taxa_all)):
        df_dNdS_taxa.write('\t'.join([taxa_all[i], str(n_reps_all[i]), str(n_syn_non_muts_all[i]), str(dnds_total_list_all[i]), str(tt_all[i]), str(pvals_corrected[i])]) + '\n')
    df_dNdS_taxa.close()


    df_td_taxa = open(lt.get_path() + '/data/breseq/tajimas_d_taxa.txt', 'w')
    df_td_taxa.write('\t'.join(['Species', 'n_reps', 'n_muts', 'tajimas_d', 't_stat', 'p_BH']) + '\n')
    for i in range(len(taxa_all)):
        df_td_taxa.write('\t'.join([taxa_all[i], str(n_reps_td_all[i]), str(n_muts_all[i]), str(TD_list_all[i]), str(tt_td_all[i]), str(pvals_corrected_td[i])]) + '\n')
    df_td_taxa.close()
Example #21
import matplotlib.ticker
import datetime as dt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant g scores and at least 100 mutations

df_taxa = pd.read_csv(lt.get_path() + '/data/breseq/tajimas_d_taxa.txt',
                      sep='\t')
df_taxa = df_taxa.sort_values(by=['tajimas_d'])
taxa_to_keep = df_taxa.Species.to_list()
df_taxa_samples = pd.read_csv(lt.get_path() +
                              '/data/breseq/genetic_diversity.txt',
                              sep='\t',
                              index_col=None)
fig = plt.figure()
for i, taxon in enumerate(list(taxa_to_keep)):  # iterate over a copy; items are removed below
    #print(taxon)
    #print(df_taxa_samples.loc[df_taxa_samples['Species'] == taxon])
    x_i = df_taxa_samples.loc[df_taxa_samples['Species'] ==
                              taxon].tajimas_d.values
    if len(x_i) < 3:
        taxa_to_keep.remove(taxon)
Example #22
            d_0_start = 0.01
            k_start = 1
            z_start = 0.8

            start_params = np.array([d_0_start, k_start, z_start])

        return super(log_weibull_model, self).fit(start_params=start_params,
                                                  maxiter=maxiter,
                                                  method=method,
                                                  maxfun=maxfun,
                                                  **kwds)


# inoculum volume: 100 uL
inoccula = 100
df = pd.read_csv(lt.get_path() + '/data/demography/spoIIE_DC_assay.csv',
                 sep=',')

#df = pd.read_csv(lt.get_path() + '/data/demography/spo0IIE_assay.csv', sep = ',')

df['N_spores'] = df['HT'] * (1000 /
                             inoccula) * (10**(df['dilution_S'])) * 50  #(mL)
df['N_total'] = df['NT'] * (1000 /
                            inoccula) * (10**(df['dilution_V'])) * 50  #(mL)
df['N_viable'] = df['N_total'] - df['N_spores']
df['days'] = df['hours'] / 24
df = df.sort_values('days')

df_wt = df[(df['strain'] == 'wt')]
df_spoiie = df[(df['strain'] == 'SpoIIE')]
Example #23
def run_parallelism_analysis(nmin_reps=3, nmin=2, FDR=0.05, n_nonsyn_min=50):
    # pass a nested list with frequency, coverage of major, coverage of minor, taxon
    #output_to_keep = ['INS', 'DEL', 'SNP', 'SUB']
    to_keep_samples = get_breseq_samples_to_keep()
    to_keep_taxa = get_breseq_taxa_to_keep()
    p_star_dict = {}
    G_score_list = []
    for taxon in to_keep_taxa:
        print(taxon)
        effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        taxon_sites = []
        taxon_samples = [ x for x in to_keep_samples if x.startswith(taxon) ]
        sites_to_remove = get_sites_to_remove(taxon)
        # keep insertion, deletions, and nonsynonymous SNPs
        # get size_dict
        gene_count_dict = {}
        gene_count_syn_dict = {}
        #print(sites_to_remove)
        for taxon_sample in taxon_samples:
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                if line_split[0].startswith('#='):  # skip all GenomeDiff metadata lines, not just the first
                    continue
                if (line_split[3] + '_' + line_split[4] in sites_to_remove):
                    continue
                if (line_split[0] not in output_to_keep): #or ('frequency' in line_split[6]) or (line_split[3] + '_' + line_split[4] in sites_to_remove):
                    continue
                if line_split[0] == 'SNP':
                    if [s for s in line_split if 'snp_type=' in s][0].split('=')[1] == 'nonsynonymous':
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):
                                if locus_tag_j not in gene_count_dict:
                                    gene_count_dict[locus_tag_j] = {}
                                    gene_count_dict[locus_tag_j]['freqs'] = []
                                    gene_count_dict[locus_tag_j]['n_mut'] = 0

                                gene_count_dict[locus_tag_j]['n_mut'] += 1
                                gene_count_dict[locus_tag_j]['freqs'].append(frequency)

                        else:
                            if locus_tag not in gene_count_dict:
                                #gene_count_dict[locus_tag] = 1
                                gene_count_dict[locus_tag] = {}
                                gene_count_dict[locus_tag]['freqs'] = []
                                gene_count_dict[locus_tag]['n_mut'] = 0

                            gene_count_dict[locus_tag]['n_mut'] += 1
                            gene_count_dict[locus_tag]['freqs'].append(frequency)


                    elif [s for s in line_split if 'snp_type=' in s][0].split('=')[1] == 'synonymous':
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):
                                if locus_tag_j not in gene_count_syn_dict:
                                    gene_count_syn_dict[locus_tag_j] = {}
                                    gene_count_syn_dict[locus_tag_j]['freqs'] = []
                                    gene_count_syn_dict[locus_tag_j]['n_mut'] = 0

                                gene_count_syn_dict[locus_tag_j]['n_mut'] += 1
                                gene_count_syn_dict[locus_tag_j]['freqs'].append(frequency)

                        else:
                            if locus_tag not in gene_count_syn_dict:
                                gene_count_syn_dict[locus_tag] = {}
                                gene_count_syn_dict[locus_tag]['freqs'] = []
                                gene_count_syn_dict[locus_tag]['n_mut'] = 0

                            gene_count_syn_dict[locus_tag]['n_mut'] += 1
                            gene_count_syn_dict[locus_tag]['freqs'].append(frequency)
                    else:
                        continue
                else:
                    if len([s for s in line_split if 'gene_position=coding' in s]) >= 1:
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):

                                if locus_tag_j not in gene_count_dict:
                                    gene_count_dict[locus_tag_j] = {}
                                    gene_count_dict[locus_tag_j]['freqs'] = []
                                    gene_count_dict[locus_tag_j]['n_mut'] = 0

                                gene_count_dict[locus_tag_j]['freqs'].append(frequency)
                                gene_count_dict[locus_tag_j]['n_mut'] += 1

                        else:
                            if locus_tag not in gene_count_dict:
                                #gene_count_dict[locus_tag] = 1
                                gene_count_dict[locus_tag] = {}
                                gene_count_dict[locus_tag]['freqs'] = []
                                gene_count_dict[locus_tag]['n_mut'] = 0

                            gene_count_dict[locus_tag]['freqs'].append(frequency)
                            gene_count_dict[locus_tag]['n_mut'] += 1

        gene_parallelism_statistics = {}
        for gene_i, length_i in effective_gene_lengths.items():
            gene_parallelism_statistics[gene_i] = {}
            gene_parallelism_statistics[gene_i]['length'] = length_i
            gene_parallelism_statistics[gene_i]['observed'] = 0
            gene_parallelism_statistics[gene_i]['multiplicity'] = 0

        gene_parallelism_statistics_syn = {}
        for gene_i, length_i in effective_gene_lengths_syn.items():
            gene_parallelism_statistics_syn[gene_i] = {}
            gene_parallelism_statistics_syn[gene_i]['length'] = length_i
            gene_parallelism_statistics_syn[gene_i]['observed'] = 0
            gene_parallelism_statistics_syn[gene_i]['multiplicity'] = 0

        # save number of mutations for multiplicity
        for locus_tag_i, locus_tag_i_dict in gene_count_dict.items():
            gene_parallelism_statistics[locus_tag_i]['observed'] = locus_tag_i_dict['n_mut']
            gene_parallelism_statistics[locus_tag_i]['mean_freq'] = np.mean(locus_tag_i_dict['freqs'])

        # same thing for synonymous
        for locus_tag_i, locus_tag_i_dict in gene_count_syn_dict.items():
            gene_parallelism_statistics_syn[locus_tag_i]['observed'] = locus_tag_i_dict['n_mut']
            gene_parallelism_statistics_syn[locus_tag_i]['mean_freq'] = np.mean(locus_tag_i_dict['freqs'])

        L_mean = np.mean(list(effective_gene_lengths.values()))
        L_tot = sum(list(effective_gene_lengths.values()))
        n_tot = sum([ x['n_mut'] for x in gene_count_dict.values() ])
        # don't include taxa with fewer than n_nonsyn_min nonsynonymous mutations
        print("N_total = " + str(n_tot))
        if n_tot < n_nonsyn_min:
            continue
        # go back over and calculate multiplicity
        for locus_tag_i in gene_parallelism_statistics.keys():
            # double check the measurements from this
            gene_parallelism_statistics[locus_tag_i]['multiplicity'] = gene_parallelism_statistics[locus_tag_i]['observed'] *1.0/ effective_gene_lengths[locus_tag_i] * L_mean
            gene_parallelism_statistics[locus_tag_i]['expected'] = n_tot*gene_parallelism_statistics[locus_tag_i]['length']/L_tot

        # get multiplicity for synonymous mutations
        L_mean_syn = np.mean(list(effective_gene_lengths_syn.values()))
        L_tot_syn = sum(list(effective_gene_lengths_syn.values()))
        n_tot_syn = sum([ x['n_mut'] for x in gene_count_syn_dict.values() ])

        # go back over and calculate multiplicity
        for locus_tag_i in gene_parallelism_statistics_syn.keys():
            # double check the measurements from this
            gene_parallelism_statistics_syn[locus_tag_i]['multiplicity'] = gene_parallelism_statistics_syn[locus_tag_i]['observed'] *1.0/ effective_gene_lengths_syn[locus_tag_i] * L_mean_syn
            gene_parallelism_statistics_syn[locus_tag_i]['expected'] = n_tot_syn*gene_parallelism_statistics_syn[locus_tag_i]['length']/L_tot_syn

        pooled_multiplicities = np.array([gene_parallelism_statistics[gene_name]['multiplicity'] for gene_name in gene_parallelism_statistics.keys() if gene_parallelism_statistics[gene_name]['multiplicity'] >=1])
        pooled_multiplicities.sort()

        pooled_tuple_multiplicities = np.array([(gene_parallelism_statistics[gene_name]['multiplicity'], gene_parallelism_statistics[gene_name]['observed']) for gene_name in gene_parallelism_statistics.keys() if gene_parallelism_statistics[gene_name]['multiplicity'] >=1])
        pooled_tuple_multiplicities = sorted(pooled_tuple_multiplicities, key=lambda x: x[0])
        pooled_tuple_multiplicities_x = [i[0] for i in pooled_tuple_multiplicities]
        pooled_tuple_multiplicities_y = [i[1] for i in pooled_tuple_multiplicities]
        pooled_tuple_multiplicities_y = [sum(pooled_tuple_multiplicities_y[i:]) / sum(pooled_tuple_multiplicities_y) for i in range(len(pooled_tuple_multiplicities_y))]

        null_multiplicity_survival = lt.NullGeneMultiplicitySurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics )
        #observed_ms_test, observed_multiplicity_survival_test = lt.calculate_unnormalized_survival_from_vector(pooled_multiplicities)
        null_multiplicity_survival_copy = null_multiplicity_survival(pooled_multiplicities)
        null_multiplicity_survival_copy = [sum(null_multiplicity_survival_copy[i:]) / sum(null_multiplicity_survival_copy) for i in range(len(null_multiplicity_survival_copy)) ]
        #threshold_idx = numpy.nonzero((null_multiplicity_survival(observed_ms)*1.0/observed_multiplicity_survival)<FDR)[0][0]
        mult_survival_dict = {'Mult': pooled_multiplicities, 'Obs_fract': pooled_tuple_multiplicities_y, 'Null_fract': null_multiplicity_survival_copy}
        mult_survival_df = pd.DataFrame(mult_survival_dict)
        mult_survival_df_out = lt.get_path() + '/data/breseq/mult_survival_curves/' + taxon + '.txt'
        mult_survival_df.to_csv(mult_survival_df_out, sep = '\t', index = True)

        # get likelihood score and null test
        observed_G, pvalue = lt.calculate_total_parallelism(gene_parallelism_statistics)
        G_score_list.append((taxon, observed_G, pvalue))
        print(observed_G, pvalue)
        if pvalue >= 0.05:
            continue
        # Give each gene a p-value, get distribution
        gene_logpvalues = lt.calculate_parallelism_logpvalues(gene_parallelism_statistics)
        pooled_pvalues = []
        for gene_name in gene_logpvalues.keys():
            if (gene_parallelism_statistics[gene_name]['observed']>= nmin) and (float(gene_logpvalues[gene_name]) >= 0):
                pooled_pvalues.append( gene_logpvalues[gene_name] )

        pooled_pvalues = np.array(pooled_pvalues)
        pooled_pvalues.sort()
        if len(pooled_pvalues) == 0:
            continue

        null_pvalue_survival = lt.NullGeneLogpSurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics, nmin=nmin)
        observed_ps, observed_pvalue_survival = lt.calculate_unnormalized_survival_from_vector(pooled_pvalues, min_x=-4)
        # p-value version: drop negative -log p values
        neg_p_idx = np.where(observed_ps>=0)
        observed_ps_copy = observed_ps[neg_p_idx]
        observed_pvalue_survival_copy = observed_pvalue_survival[neg_p_idx]
        pvalue_pass_threshold = np.nonzero(null_pvalue_survival(observed_ps_copy)*1.0/observed_pvalue_survival_copy<FDR)[0]
        if len(pvalue_pass_threshold) == 0:
            continue
        threshold_idx = pvalue_pass_threshold[0]
        pstar = observed_ps_copy[threshold_idx] # lowest value where this is true
        num_significant = observed_pvalue_survival[threshold_idx]
        # make it log base 10
        logpvalues_dict = {'P_value': observed_ps/math.log(10), 'Obs_num': observed_pvalue_survival, 'Null_num': null_pvalue_survival(observed_ps)}
        logpvalues_df = pd.DataFrame(logpvalues_dict)
        logpvalues_df_out = lt.get_path() + '/data/breseq/logpvalues/' + taxon + '.txt'
        logpvalues_df.to_csv(logpvalues_df_out, sep = '\t', index = True)

        p_star_dict[taxon] = (num_significant, pstar/math.log(10))

        output_mult_gene_filename = lt.get_path() + '/data/breseq/mult_genes_nonsyn_sig/' + taxon + '.txt'
        output_mult_gene = open(output_mult_gene_filename,"w")
        output_mult_gene.write(",".join(["Gene", "Length", "Observed", "Expected", "Multiplicity", "-log10(P)"]))
        for gene_name in sorted(gene_parallelism_statistics, key=lambda x: gene_parallelism_statistics.get(x)['observed'],reverse=True):
            if gene_logpvalues[gene_name] >= pstar and gene_parallelism_statistics[gene_name]['observed']>=nmin:
                output_mult_gene.write("\n")
                # log base 10 transform the p-values here as well
                output_mult_gene.write("%s, %0.1f, %d, %0.2f, %0.2f, %g" % (gene_name, gene_parallelism_statistics[gene_name]['length'],  gene_parallelism_statistics[gene_name]['observed'], gene_parallelism_statistics[gene_name]['expected'], gene_parallelism_statistics[gene_name]['multiplicity'], abs(gene_logpvalues[gene_name])/math.log(10) ))
        output_mult_gene.close()

        output_mult_syn_filename = lt.get_path() + '/data/breseq/mult_genes_all/' + taxon + '.txt'
        output_mult_syn = open(output_mult_syn_filename,"w")
        output_mult_syn.write(",".join(["Gene", "mult", "mult_syn", "mean_freq", "mean_freq_syn"]))
        for locus_tag_i in gene_parallelism_statistics.keys():
            mult_i = gene_parallelism_statistics[locus_tag_i]['multiplicity']
            mult_i_syn = gene_parallelism_statistics_syn[locus_tag_i]['multiplicity']
            if (mult_i > 0) and (mult_i_syn > 0):
                freq_i = gene_parallelism_statistics[locus_tag_i]['mean_freq']
                freq_i_syn = gene_parallelism_statistics_syn[locus_tag_i]['mean_freq']
                output_mult_syn.write("\n")
                output_mult_syn.write("%s, %f, %f, %f, %f" % (locus_tag_i, mult_i,  mult_i_syn, freq_i, freq_i_syn))
        output_mult_syn.close()

    G_score_list_p_values = [i[2] for i in G_score_list]
    reject, pvals_corrected, alphacSidak, alphacBonf = mt.multipletests(G_score_list_p_values, alpha=0.05, method='fdr_bh')
    total_parallelism_path = lt.get_path() + '/data/breseq/total_parallelism.txt'
    total_parallelism = open(total_parallelism_path,"w")
    total_parallelism.write("\t".join(["Taxon", "G_score", "p_value", "p_value_BH"]))
    for i in range(len(pvals_corrected)):
        taxon_i = G_score_list[i][0]
        G_score_i = G_score_list[i][1]
        p_value_i = G_score_list[i][2]
        pvals_corrected_i = pvals_corrected[i]

        total_parallelism.write("\n")
        total_parallelism.write("\t".join([taxon_i, str(G_score_i), str(p_value_i), str(pvals_corrected_i)]))

    total_parallelism.close()
    with open(lt.get_path() + '/data/breseq/p_star.txt', 'wb') as file:
        file.write(pickle.dumps(p_star_dict)) # use `pickle.loads` to do the reverse
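
# A minimal sketch (hypothetical helper, not part of the original script) of
# reading the pickled dictionary back, as the `pickle.loads` comment above
# suggests:
def load_p_star():
    with open(lt.get_path() + '/data/breseq/p_star.txt', 'rb') as f:
        # entries map taxon -> (number of significant genes, log10 p*)
        return pickle.loads(f.read())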
Beispiel #24
0
from scipy import stats
from scipy.stats import t
from scipy.integrate import odeint
from decimal import Decimal
import _pickle as pickle

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

import ltde_tools as lt

filepath = lt.get_path() + '/data/demography/weibull_results_clean.csv'

half_life_dict = {}

count = 0
for line in open(filepath, 'r'):

    if count == 0:
        count += 1
        continue

    line_split = line.strip().split(',')

    taxon = line_split[1]

    #print(line_split[3])
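    # the example truncates here; presumably the loop filled half_life_dict.
    # The column index below is an assumption based on the commented print above:
    half_life_dict[taxon] = float(line_split[3])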
import os
import pwlf
import numpy as np
import pandas as pd
import ltde_tools as lt

df_colors = pd.read_csv(lt.get_path() + '/data/colors.csv', sep=',')


def piecewise_regression():
    df = pd.read_csv(
        os.path.expanduser("~/GitHub/LTDE") +
        '/data/demography/longtermdormancy_20190528_nocomments.csv',
        sep=',')
    # KBS0721 rep
    df['N'] = (df['Colonies'] + 1) * (1000 /
                                      df['Inoculum']) * (10**(df['Dilution']))

    df['Dormstart_date'] = pd.to_datetime(df['Dormstart_date'],
                                          format='%d-%b-%y')
    df['Firstread_date'] = pd.to_datetime(df['Firstread_date'],
                                          format='%d-%b-%y')
    df['Days'] = df['Firstread_date'].sub(df['Dormstart_date'], axis=0)
    df['Days'] = df['Days'].dt.days.astype('int')
    N_dead_list = []
    delta_slope_list = []
    time_split = []
    slope1_list = []
    slope2_list = []
    taxa = list(set(df.Strain.to_list()))
    #fig_all = plt.figure()
    flux_list = []
    slope2_scale_list = []

    df_out = open(lt.get_path() + '/data/demography/piecewise_regression.txt',
                  'w')
    df_out.write('\t'.join([
        'Species', 'rep', 'N0', 'slope1', 'slope2', 'time_split', 'N_split'
    ]) + '\n')

    for taxon in taxa:
        print(taxon)
        # taxa excluded from the piecewise fits
        if taxon in ('KBS0714', 'KBS0719', 'KBS0711W',
                     'KBS0816', 'KBS0727', 'KBS0704'):
            continue

        #if taxon == 'KBS0725':
        #    continue
        #if taxon == 'KBS0702':
        #    continue
        #if taxon == 'KBS0712':
        #    continue
        #if taxon == 'KBS0703':
        #    continue
        #if taxon == 'KBS0706':
        #    continue

        taxon_color = df_colors.loc[df_colors['strain'] ==
                                    taxon].Color.to_list()[0]

        #if taxon != 'KBS0710':
        #    continue
        #fig = plt.figure()
        df_taxon = df[(df["Strain"] == taxon)]
        reps = list(set(df_taxon.Rep.to_list()))
        for rep in reps:
            df_taxon_rep = df_taxon[(df_taxon["Rep"] == rep)]
            # sort_values returns a new DataFrame, so reassign it
            df_taxon_rep = df_taxon_rep.sort_values('Days')

            x = df_taxon_rep.Days.values
            if len(x) < 20:
                continue
            y = np.log10(df_taxon_rep.N.values)

            N0 = df_taxon_rep.N.values[0]

            my_pwlf = pwlf.PiecewiseLinFit(x, y)

            # fit the data with two line segments
            res = my_pwlf.fit(2)
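            # the fit returns the breakpoints [x_min, t_break, x_max],
            # so res[1] is the day at which the decay slope changes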

            # predict for the determined points
            xHat = np.linspace(min(x), max(x), num=10000)
            yHat = my_pwlf.predict(xHat)

            #print(taxon, len(res))

            # calc_slopes() also populates my_pwlf.intercepts, so call it
            # before reading the intercepts
            slopes = my_pwlf.calc_slopes()

            # log10 abundance at the breakpoint, extrapolated from the
            # first segment's intercept and slope
            N_switch = my_pwlf.intercepts[0] + (slopes[0] * res[1])

            N_dead = (10**max(y)) - (10**N_switch)
            N_dead_flux = ((10**max(y)) * slopes[0])
            #angle_switch = np.arctan(np.absolute( (slopes[1]-slopes[0]) /(1+ (slopes[0]*slopes[1])) ))
            delta_slope = (slopes[1] - slopes[0])
            #print(taxon, rep, N_switch, delta_slope)

            N_dead_list.append(N_dead / res[1])
            slope1_list.append(slopes[0])
            slope2_list.append(slopes[1])
            delta_slope_list.append(delta_slope)

            time_split.append(res[1])

            #taxon_color
            #print(delta_slope)
            #max(10**y) * slopes[0]

            #plt.scatter(np.log10(np.abs(N_dead_flux / res[1] ) ), np.log10(delta_slope), c = taxon_color)
            #plt.scatter(np.log10(np.abs(slopes[0]) ), np.log10(slopes[1]), c = taxon_color)
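            # rough magnitude of the initial death flux: the first segment's
            # intercept (back-transformed to cells) times its log10 slope,
            # i.e. proportional to dN/dt at t = 0 up to a factor of ln(10)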
            flux = np.abs((10**my_pwlf.intercepts[0]) * slopes[0])
            flux_list.append(flux)

            #if taxon == 'KBS0812':
            #    print(flux, my_pwlf.intercepts[0], slopes[1])

            slope2_scale_list.append(slopes[1] - slopes[0])


            df_out.write('\t'.join([
                taxon,
                str(rep),
                str(N0),
                str(slopes[0]),
                str(slopes[1]),
                str(res[1]),
                str(10**N_switch)
            ]) + '\n')

            #plt.scatter(np.log10(flux ), np.log10(slopes[1] ), c = taxon_color)

            #plt.scatter(slopes[0], slopes[1], c = taxon_color)

            #plt.scatter(x, y)
            #plt.plot(xHat, yHat, '-')

        #plt.xscale('log',basex=10)
        #plt.yscale('log',basey=10)
        #plt.xlim(10**-2,10**10)
        #plt.xlabel('Number of dead cells', fontsize = 12)
        #plt.ylabel('absolute value of second slope, log10', fontsize = 12)
        #fig.savefig(lt.get_path() + '/figs/taxon_piece/'+taxon+'.png', bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
        #plt.close()

    df_out.close()
Beispiel #27
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ltde_tools as lt

import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa with significant G scores and at least 100 mutations

fig = plt.figure()
fig.subplots_adjust(hspace=0.35, wspace=0.35)
for i in range(0, len(lt.taxa_to_plot)):
    taxon = lt.taxa_to_plot[i]
    taxon_color = lt.df_colors.loc[lt.df_colors['strain'] ==
                                   taxon].Color.to_list()[0]
    df = pd.read_csv(lt.get_path() + '/data/breseq/mult_genes_all/' + taxon +
                     '.txt',
                     sep=',')

    ax = fig.add_subplot(3, 3, i + 1)
    x = df.mult_syn.values
    y = df.mult.values
    x_log10 = np.log10(x)
    y_log10 = np.log10(y)

    ax.scatter(x, y, c=taxon_color, marker = 'o', s = 70, \
        linewidth = 0.6, alpha = 0.5, zorder=1, edgecolors='none')
    min_range = min([min(x), min(y)]) * 0.5
    max_range = max([max(x), max(y)]) * 2
    ax.set_xlim([min_range, max_range])
    ax.set_ylim([min_range, max_range])
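
    # The example truncates here. A hedged sketch of how such a panel is
    # typically finished (one-to-one guide line, log-log axes); the figure
    # filename below is an assumption, not the original:
    ax.plot([min_range, max_range], [min_range, max_range],
            c='grey', ls='--', zorder=0)
    ax.set_xscale('log')
    ax.set_yscale('log')

fig.savefig(lt.get_path() + '/figs/mult_syn_nonsyn.png',
            bbox_inches="tight", pad_inches=0.4, dpi=600)
plt.close()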
Beispiel #28
0
import matplotlib.ticker
import datetime as dt
import pandas as pd
import ltde_tools as lt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel



df = pd.read_csv(lt.get_path() + '/data/mzML_Files_forR/Bacillus_AA_Conc_1000d.csv', sep = ',')
df = df.set_index('Name')
aa_list = ['Ala', 'Gly', 'Val', 'Leu', 'Ile', 'Pro', 'Met', 'Ser', 'Thr', 'Phe',
        'Asp', 'Glu', 'Orn', 'Lys', 'His', 'Tyr', 'Cys-Cys']
aa_dict = {'Ala':'Alanine', 'Gly':'Glycine', 'Val':'Valine', 'Leu':'Leucine',
            'Ile': 'Isoleucine', 'Pro':'Proline', 'Met':'Methionine',
            'Ser':'Serine', 'Thr':'Threonine', 'Phe':'Phenylalanine',
            'Asp':'Aspartic Acid', 'Glu':'Glutamic acid', 'Orn':'Arginine',
            'Lys':'Lysine', 'His':'Histidine', 'Tyr':'Tyrosine','Cys-Cys':'Cystine'}
molar_mass_dict = {'Ala':89.094, 'Gly':75.07, 'Val':117.2, 'Leu':113.2,
                    'Ile':113.2, 'Pro':115.1, 'Met':149.2, 'Ser':105.1,
                    'Thr':119.1, 'Phe':165.2, 'Asp':133.1, 'Glu':147.1,
                    'Orn':174.2, 'Lys':146.2, 'His':155.2, 'Tyr':181.2,
                    'Cys-Cys':240.1}
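# NOTE: 'Orn' is mapped to arginine above, and the 174.2 g/mol entry is
# arginine's molar mass (ornithine's is ~132.2 g/mol); presumably arginine
# was quantified via its ornithine derivative in the assay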

bio_reps = ['KBS0812A', 'KBS0812B', 'KBS0812C', 'KBS0812D']
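
# A hedged sketch of the presumable next step: converting each amino acid's
# mass concentration to molar units with the molar masses above, averaged
# over the four biological replicates. The DataFrame layout (replicates as
# rows, amino acids as columns) and the input units are assumptions:
for aa in aa_list:
    conc = df.loc[bio_reps, aa].astype(float)
    conc_molar = conc / molar_mass_dict[aa]  # e.g. ug/mL -> umol/mL
    print(aa_dict[aa], conc_molar.mean(), conc_molar.sem())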
Beispiel #29
0
import matplotlib.lines as mlines

import matplotlib.ticker
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ltde_tools as lt

#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf

from Bio import SeqIO

from statsmodels.base.model import GenericLikelihoodModel

df_taxa = pd.read_csv(lt.get_path() + '/data/breseq/dN_dS_taxa.txt', sep='\t')
df_taxa = df_taxa.sort_values(by=['dN_dS_total'])
taxa_to_keep = df_taxa.Species.to_list()
df_taxa_samples = pd.read_csv(lt.get_path() +
                              '/data/breseq/genetic_diversity.txt',
                              sep='\t',
                              index_col=None)
fig = plt.figure()
for i, taxon in enumerate(taxa_to_keep):
    #print(taxon)
    #print(df_taxa_samples.loc[df_taxa_samples['Species'] == taxon])
    dn_ds = df_taxa_samples.loc[df_taxa_samples['Species'] ==
                                taxon].dn_ds_total.values
    dn_ds_mean = np.mean(dn_ds)
    #print(dn_ds)
    dn_ds_sem = np.std(dn_ds) / np.sqrt(len(dn_ds))
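    # the example truncates here; a minimal sketch of the presumable panel
    # (mean dN/dS per taxon with +/- 1 SEM error bars; styling is an assumption):
    plt.errorbar(i, dn_ds_mean, yerr=dn_ds_sem, fmt='o')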
Beispiel #30
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import ltde_tools as lt


def fig2():
    df = lt.get_mean_time_death()
    fig = plt.figure()
    # alpha kde
    alpha = df.alpha.values
    grid_alpha = GridSearchCV(KernelDensity(),
                              {'bandwidth': np.linspace(0.1, 10, 50)},
                              cv=20)  # 20-fold cross-validation
    grid_alpha.fit(alpha[:, None])
    x_grid_alpha = np.linspace(0, 2.5, 1000)
    kde_alpha = grid_alpha.best_estimator_
    pdf_alpha = np.exp(kde_alpha.score_samples(x_grid_alpha[:, None]))
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
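    # rescale the sampled densities so they sum to one across the grid
    # (turns the KDE curve into per-bin probabilities for plotting)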
    pdf_alpha = [x / sum(pdf_alpha) for x in pdf_alpha]
    ax1.plot(x_grid_alpha, pdf_alpha, alpha=0.8, lw=2,
             color='#1f77b4')  #, marker='o')
    ax1.axvline(x=1, color='darkgrey', linestyle='--', lw=2.5)
    ax1.axvline(x=np.mean(alpha), color='#1f77b4', linestyle='--', lw=2.5)
    ax1.set_xlim([-0.1, 2.6])
    ax1.set_xlabel("Scale parameter, " + r'$\alpha$', fontsize=14)
    ax1.set_ylabel("Probability density", fontsize=14)
    # half life kde
    half_life = np.log10(df.half_life.values)
    grid_half_life = GridSearchCV(KernelDensity(),
                                  {'bandwidth': np.linspace(0.1, 10, 50)},
                                  cv=20)  # 20-fold cross-validation
    grid_half_life.fit(half_life[:, None])
    x_grid_half_life = np.linspace(-10, 15, 1000)
    kde_half_life = grid_half_life.best_estimator_
    pdf_half_life = np.exp(
        kde_half_life.score_samples(x_grid_half_life[:, None]))
    pdf_half_life = [x / sum(pdf_half_life) for x in pdf_half_life]
    ax2 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    ax2.plot(x_grid_half_life, pdf_half_life, color="orange", alpha=0.8)
    ax2.axvline(x=np.mean(half_life), color='orange', linestyle='--', lw=2.5)
    ax2.set_xlim([-11, 11])
    ax2.set_xlabel("Half-life " + r'$\mathrm{(d), \, log_{10}} $',
                   fontsize=14)
    ax2.set_ylabel("Probability density", fontsize=14)

    ax3 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
    strains = list(set(df.strain.values))
    half_lives = [
        np.log10(df.loc[df['strain'] == strain].half_life.values)
        for strain in strains
    ]
    mean_half_life = [
        np.median(np.log10(df.loc[df['strain'] == strain].half_life.values))
        for strain in strains
    ]

    zipped_half_lives = list(zip(strains, half_lives, mean_half_life))
    # sort strains by their median log10 half-life
    zipped_half_lives_sorted = sorted(zipped_half_lives, key=lambda x: x[2])
    strains = [x[0] for x in zipped_half_lives_sorted]
    half_lives = [x[1] for x in zipped_half_lives_sorted]
    mean_half_life = [x[2] for x in zipped_half_lives_sorted]

    strain_dict = lt.get_strain_genus_dict()
    genera_labels = list(reversed([strain_dict[x] for x in strains]))
    strain_labels = list(reversed([' sp. ' + x for x in strains]))
    #genera
    ax3.boxplot(half_lives, vert=False)
    ax3.yaxis.set_major_formatter(plt.NullFormatter())
    ax3.set_xlim([-6, 11])
    #ax3.yaxis.set_ticks_position('none')
    #ax3.gca().xaxis.set_major_locator(plt.NullLocator())
    #ax3.axis('off')
    ax3.set_xlabel("Half-life " + r'$\mathrm{(d), \, log_{10}} $',
                   fontsize=14)
    for i in range(len(genera_labels)):
        genera_label = genera_labels[i]
        strain_label = strain_labels[i]
        y = len(genera_labels) - i - 0.1
        if i == 0:
            ax3.text(-5,
                     y,
                     r"${" + genera_label + r"} \, \mathrm{" + strain_label +
                     r"}$",
                     fontsize=5.5)
        else:
            if genera_label == 'Janthinobacterium':
                fontsize = 5.2
            else:
                fontsize = 5.5
            ax3.text(3.2,
                     y,
                     r"${" + genera_label + r"} \, \mathrm{" + strain_label +
                     r"}$",
                     fontsize=fontsize)
    plt.tight_layout()
    fig_name = lt.get_path() + '/figs/fig2.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()