# modules required by the loader functions below
import glob
import numpy as np
import test_flu

def load_polarizer_data(params, metric = 'nuc'):
    '''
    load the polarizer predictions for every year, average over the replicates,
    and normalize the distances to those of a random pick
    '''
    analysis_folder = test_flu.flu_analysis_folder
    # construct file mask and load files
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder+'_'.join([base_name, 'polarizer', metric])+'.dat'
    print "loading", fmask
    flist = glob.glob(fmask)
    predictions = []
    for fname in flist:
        year = int(fname.split('_')[len(analysis_folder.split('_'))+1])
        predictions.append((year, np.loadtxt(fname)))

    # sort predictions by year
    predictions.sort(key = lambda x:x[0])
    # make a list of all years for which we have predictions
    years = np.array([a[0] for a in predictions])

    # calculate averages over the 50 replicates done for each year and normalize to random picks
    mean_distance = np.array([a[1][:,0].mean() for a in predictions])
    minimal_distance = np.array([a[1][:,1].mean() for a in predictions])/mean_distance
    average_distances = np.array([a[1].mean(axis=0)/mean_distance[ai] for ai,a in enumerate(predictions)])

    # normalize the observed distances such that random=1 and best = 0
    normed_distances = ((average_distances[:,1:].T - minimal_distance)/(1-minimal_distance)).T

    return years, average_distances, normed_distances
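
# Usage sketch (not part of the original module; the call pattern is an
# assumption based on the analysis scripts further down):
#
#   parser = test_flu.make_flu_parser()
#   params = parser.parse_args()
#   years, average_distances, normed_distances = load_polarizer_data(params, metric='nuc')
#
# years is sorted by prediction year; normed_distances is scaled such that a
# random pick maps to 1 and the best possible pick maps to 0.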
def load_date_distribution(params, top_strain_method):
    '''
    returns the sampling dates of all predicted strains measured in days
    relative to Jan 1st of the year preceding the prediction year
    '''

    from datetime import date
    # construct file mask and load files
    analysis_folder = test_flu.flu_analysis_folder
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder+'_'.join([base_name, name_mod, top_strain_method, 'topstrains.dat'])
    flist = glob.glob(fmask)
    sampling_dates = {}
    for fname in flist:
        tmp_dates = []
        year = int(fname.split('_')[len(analysis_folder.split('_'))+1])
        base_line = date(year-1,1,1).toordinal()
        with open(fname, 'r') as infile:
            for line in infile:
                strain_year, strain_month, strain_day = map(int, line.split()[-1].split('-'))
                tmp_dates.append(date(strain_year, strain_month,strain_day).toordinal()-base_line)
        sampling_dates[year] = tmp_dates
    return sampling_dates
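
# Usage sketch (assumption, mirroring the analysis scripts further down): pool
# the per-year date lists into a histogram of sampling dates, measured in days
# since Jan 1st of the year preceding each prediction year:
#
#   dates_by_year = load_date_distribution(params, top_strain_method='mean_fitness')
#   all_dates = np.concatenate([np.array(d) for d in dates_by_year.values()])
#   counts, bin_edges = np.histogram(all_dates, bins=20)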
def load_prediction_data(params, metric = 'nuc'):
    '''
    load the prediction results for each ranking method, average over the
    replicates for each year, and normalize to the distance of a random pick
    '''
    analysis_folder = test_flu.flu_analysis_folder
    boost = params.boost
    # construct file mask and load files
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder+'_'.join([base_name, name_mod, metric])+'.dat'
    print "loading", fmask
    flist = glob.glob(fmask)
    predictions = []
    for fname in flist:
        year = int(fname.split('_')[len(analysis_folder.split('_'))+1])
        predictions.append((year, np.loadtxt(fname)))

    # sort predictions by year
    predictions.sort(key = lambda x:x[0])
    # make a list of all years for which we have predictions
    years = np.array([a[0] for a in predictions])

    # calculate averages over the 50 replicates done for each year and normalize to random picks
    average_distance = np.array([a[1][:,0].mean()       for a in predictions])
    minimal_distance = np.array([a[1][:,1].mean()       for a in predictions])/average_distance
    prediction_distances = {(method,boost,label):np.array([a[1][:,methodi].mean()
                            for a in predictions])/average_distance
                            for methodi, method,label in methods}
    prediction_distances[('average',boost,'average')] = average_distance
    prediction_distances[('minimal',boost,'minimal')] = minimal_distance

    # normalize the observed distances such that random=1 and best=0
    normed_distances = {method:(np.mean((preds - minimal_distance)/(1-minimal_distance)),
                                boot_strap((preds - minimal_distance)/(1-minimal_distance),1000))
                       for method,preds in prediction_distances.iteritems()}

    # the L&L predictions sit in column 2; their evaluation is restricted to
    # the years for which L&L predictions are available
    LL_key = ('L&L', boost, 'L\&L')
    prediction_distances[LL_key] = np.array([a[1][:,2].mean() for a in predictions])/average_distance

    LL_normed = ((prediction_distances[LL_key] - minimal_distance)/(1-minimal_distance))[laessig_years(years)]
    normed_distances[LL_key] = (np.mean(LL_normed), boot_strap(LL_normed))

    return years, prediction_distances, normed_distances
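
# Usage sketch (assumption): load_prediction_data additionally relies on the
# module-level names `methods`, `boot_strap`, and `laessig_years`, which are
# defined elsewhere in this module. Given those, a typical call is
#
#   years, prediction_distances, normed_distances = load_prediction_data(params, metric='nuc')
#
# prediction_distances maps (method, boost, label) keys to per-year distances
# normalized to a random pick; normed_distances maps the same keys to
# (mean, bootstrap) tuples on the random=1, best=0 scale.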
import tree_utils
import numpy as np
from scipy import stats
import glob, pickle, gzip, os, argparse
from datetime import date
# assumed imports for the helper modules referenced below
import test_flu, flu

analysis_folder = test_flu.flu_analysis_folder
# parse the commandline arguments
parser = test_flu.make_flu_parser()
params = parser.parse_args()
params.pred = params.pred.replace('^', ' ')
params.test = params.test.replace('^', ' ')
params.subsample = 0.7

# get run specific file names
fname_base, name_mod = test_flu.get_fname(params)

top_strain_method = 'mean_fitness'
# allocate arrays to save the predictions
nuc_dist_array = np.zeros((params.nreps, 12))
epi_dist_array = np.zeros((params.nreps, 12))
top_strains = []
for ii in xrange(params.nreps):
    # set up the prediction and pass all parameters to the wrapper function
    prediction = test_flu.predict_params([
        'mean_fitness', 'expansion_score', 'depth', 'polarizer',
        flu.combined_ranking_internal, flu.combined_ranking_external
    ], params)

    # define the methods for which the predictions are to be evaluated
    methods = [('mean_fitness', '_ext', prediction.terminals),

from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import glob, pickle, gzip, os, argparse
from datetime import date
# assumed import for the helper module referenced below
import test_flu

plt.rcParams.update(test_flu.mpl_params)

tree_figure_folder = '../figures_trees/'
analysis_folder = test_flu.flu_analysis_folder
# parse the commandline arguments
parser = test_flu.make_flu_parser()
parser.add_argument('--tau', default = 1.0, type = float, help= 'memory time scale of the tree polarizer')
params=parser.parse_args()
# get name snippets to link output files to run parameters
base_name, name_mod = test_flu.get_fname(params)
params.gamma=1.0
params.diffusion=1.0

# set up the prediction and pass all parameters to the wrapper function
prediction = test_flu.predict_params(['polarizer'], params)
prediction.calculate_polarizers(params.tau)

# define the methods for which the predictions are to be evaluated
methods = [ ('polarizer', '_ext', prediction.terminals),
            ('polarizer', '_int', prediction.non_terminals)]
distances, distances_epi, test_data = test_flu.evaluate(prediction, methods, params)

# calculate the fitness differentials for each internal branch and associate with 
# different types of mutations that happen on these branches
dfit = []