def add_epi_dates(df):
    '''
    Adds epi_week and epi_year to dataframe.
    '''
    df['epi_week'] = df.date.apply(lambda x: Week.fromdate(x).week)
    df['epi_year'] = df.date.apply(lambda x: Week.fromdate(x).year)

    df = df[['epi_week', 'epi_year', 'date', 'location', 'location_name', 
             'cum_death', 'inc_death', 'cum_case', 'inc_case']]
    return df
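
A minimal usage sketch (the data frame `truth` and its values are hypothetical; assumes pandas and epiweeks are installed):

import pandas as pd
from epiweeks import Week

truth = pd.DataFrame({
    'date': pd.to_datetime(['2020-04-04', '2020-04-11']),
    'location': ['US', 'US'],
    'location_name': ['United States', 'United States'],
    'cum_death': [8500, 20000],
    'inc_death': [1300, 1800],
    'cum_case': [310000, 520000],
    'inc_case': [33000, 30000],
})
print(add_epi_dates(truth)[['date', 'epi_year', 'epi_week']])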
Example #2
def get_newunit(value):
    # Note: `unit`, `weekasdate`, and `time_cols` are expected to be defined
    # in the enclosing scope (e.g. module globals or a closure).
    if value[0].isdecimal():
        date = pd.to_datetime(value)
        if unit == 'week':
            epiweek = str(Week.fromdate(date, system="cdc"))  # get epiweeks
            year, week = epiweek[:4], epiweek[-2:]
            if weekasdate in ['start', 'end']:
                if weekasdate == 'start':
                    epiweek = str(Week(int(year), int(week)).startdate())
                else:
                    epiweek = str(Week(int(year), int(week)).enddate())
            else:
                epiweek = year + '_' + 'EW' + week
            if epiweek not in time_cols:
                time_cols.append(epiweek)
            return epiweek
        elif unit == 'month':
            year_month = date.strftime("%Y-%m")
            if year_month not in time_cols:
                time_cols.append(year_month)
            return year_month
        elif unit == 'year':
            year = date.strftime("%Y")
            if year not in time_cols:
                time_cols.append(year)
            return year
        elif unit == 'full':
            return 'total'
    else:
        if unit == 'full':
            return 'total'
        else:
            return value
Example #3
def _date_to_api_string(date: date, time_type: str = "day") -> str:  # pylint: disable=W0621
    """Convert a date object to a YYYYMMDD or YYYYWW string expected by the API."""
    if time_type == "day":
        date_str = date.strftime("%Y%m%d")
    elif time_type == "week":
        date_str = Week.fromdate(date).cdcformat()
    else:
        raise ValueError(f"unknown time_type: {time_type}")
    return date_str
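
A quick sanity check of both branches (a sketch; the dates are illustrative):

from datetime import date
from epiweeks import Week

print(_date_to_api_string(date(2020, 3, 15)))           # 20200315
print(_date_to_api_string(date(2020, 3, 15), "week"))   # 202012 (CDC epi week 12 of 2020)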
Example #4
def export_csv(df, geo_name, sensor, export_dir, start_date):
    """Export data set in format expected for injestion by the API.

    Parameters
    ----------
    df: pd.DataFrame
        data frame with columns "geo_id", "timestamp", and "val"
    geo_name: str
        name of the geographic region, such as "state" or "hrr"
    sensor: str
        name of the sensor; only used for naming the output file
    export_dir: str
        path to location where the output CSV files to be uploaded should be stored
    start_date: datetime.datetime
        The first date to report
    end_date: datetime.datetime
        The last date to report
    """
    df = df.copy()
    df = df[df["timestamp"] >= start_date]

    for date in df["timestamp"].unique():
        t = Week.fromdate(pd.to_datetime(str(date)))
        date_short = "weekly_" + str(t.year) + str(t.week).zfill(2)
        export_fn = f"{date_short}_{geo_name}_{sensor}.csv"
        result_df = df[df["timestamp"] == date][[
            "geo_id", "val", "se", "sample_size"
        ]]
        result_df.to_csv(f"{export_dir}/{export_fn}",
                         index=False,
                         float_format="%.8f")
Example #5
    def parse_cities_request(self, response):
        cities = json.loads(response.body)

        today = date_utils.today()
        current_week = Week.fromdate(today)

        # We have to do different passes for 2019 and 2020, since the specific days of
        # the epidemiological weeks differ.
        #
        # The API seems to return the data from the current year as "2020" and the previous one as "2019",
        # so we'll exploit that to extract the data only from the "2020" chart.

        for city in cities:
            for year in [2020, 2019]:
                for weeknum in range(1, current_week.week):
                    ep_week = Week(year, weeknum)

                    # Cache more than 4 weeks ago
                    should_cache = (current_week.week - weeknum) > 4
                    yield self.make_registral_request(
                        city=city,
                        ep_week=ep_week,
                        callback=self.parse_registral_request,
                        dont_cache=not should_cache,
                    )
Example #6
def plot_cummulative_sampling_fraction( df ):
    df["epiweek"] = df["date"].apply( lambda x: Week.fromdate(x).startdate() )
    plot_df = df.groupby( "epiweek" ).agg( new_cases = ("new_cases", "sum"), new_sequences = ("new_sequences", "sum" ) )
    plot_df = plot_df.loc[plot_df["new_sequences"]>0]
    plot_df["fraction"] = plot_df["new_sequences"] / plot_df["new_cases"]
    plot_df = plot_df.reset_index()

    fig = go.Figure()
    fig.add_trace( go.Scattergl( x=plot_df["epiweek"], y=plot_df["fraction"],
                                 mode='lines',
                                 name='Fraction',
                                 line={ "color" : '#767676', "width" : 4 } ) )

    _add_date_formating( fig )

    fig.update_layout( yaxis_tickformat='.1%' )

    cleaned_array = np.log10( plot_df.loc[plot_df["fraction"] > 0, "fraction"] )
    cleaned_array = cleaned_array[~np.isinf( cleaned_array )]

    min_lim = np.floor( cleaned_array.min() )
    max_lim = np.ceil( cleaned_array.max() )

    fig.update_yaxes( type="log", title="<b>Cases sequenced (%)</b>" )
    fig.update_xaxes( range=get_date_limits( plot_df["epiweek"] ) )

    return fig
Example #7
def parse_filtered_metadata(metadata_file, tip_to_tree, label_fields, tree_fields, table_fields, database_date_column):
    
    query_dict = {}
    query_id_dict = {}

    closest_seqs = set()

    tree_to_tip = defaultdict(list)

    with open(metadata_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        headers = reader.fieldnames   

    with open(metadata_file, "r", encoding="utf-8") as f:
        in_data = csv.DictReader(f)
        for sequence in in_data:
            
            country = sequence["country"]
            query_id = sequence['query_id']
            query_name = sequence['query']
            closest_name = sequence["closest"]
            
            sample_date = sequence[database_date_column] #this may need to be flexible if using a different background database

            closest_distance = sequence["SNPdistance"]
            snps = sequence['SNPs']

            if query_id not in query_id_dict: #it's in the fasta file and in the db, this should take the db
               
                new_taxon = taxon(query_name, country, label_fields, tree_fields, table_fields)

                new_taxon.query_id = query_id

                if query_name == closest_name: #if it's in database, get its sample date
                    new_taxon.in_db = True
                    new_taxon.sample_date = sample_date
                    new_taxon.epiweek = Week.fromdate(convert_date(sample_date))
                    new_taxon.closest = "NA"
                else:
                    new_taxon.closest = closest_name
                    new_taxon.closest_distance = closest_distance
                    new_taxon.snps = snps
                    closest_seqs.add(closest_name)
                    
                if query_name in tip_to_tree:
                    relevant_tree = tip_to_tree[query_name]
                else:
                    relevant_tree = "NA"
                new_taxon.tree = relevant_tree

                tree_to_tip[relevant_tree].append(new_taxon)
            
                query_dict[query_name] = new_taxon
                query_id_dict[query_id] = new_taxon
            
    return query_dict, query_id_dict, tree_to_tip, closest_seqs
Example #8
def get_epiweeks(value):
    # Note: `ew_cols` is expected to be a list defined in the enclosing scope.
    if value[0].isdecimal():
        date = pd.to_datetime(value)
        epiweek = str(Week.fromdate(date, system="cdc"))  # get epiweeks
        epiweek = epiweek[:4] + '_' + 'EW' + epiweek[-2:]
        if epiweek not in ew_cols:
            ew_cols.append(epiweek)
        return epiweek
    else:
        return value
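
A hedged spot check (assumes ew_cols = [] is defined at module scope, and that pandas and epiweeks are imported as pd and Week):

ew_cols = []
print(get_epiweeks("2020-03-15"))   # expected: 2020_EW12
print(get_epiweeks("total"))        # non-date values pass through: total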
Example #9
    def addEpiWeek(self):
        if 'epiweek' in self.data:
            self.data.epiweek = self.data.epiweek.astype(str)
            return self
        from epiweeks import Week
        epiweeks = []
        for dt in self.data.date:
            # Week.fromdate() takes a date object, not (year, month, day)
            epiweek = Week.fromdate(dt)
            epiweeks.append("{:04d}{:02d}".format(epiweek.year, epiweek.week))
        self.data['epiweek'] = epiweeks
        return self
Example #10
    def get_week_just_from_date(self, date):
        year_date = date.year
        # the epi year can differ from the calendar year near year boundaries
        if self.dates[year_date] <= date < self.dates[year_date + 1]:
            year = year_date
        else:
            year = year_date + 1

        leap = self.get_leap(year)
        epi_date = date + timedelta(days=leap.days)
        epi_week = Week.fromdate(epi_date)
        return epi_week.week
Example #11
def regulation_release(state, grid, config, parameters, current_time):
    # compute the expected monthly release based on Biemans (2011)

    # TODO this is still written assuming monthly, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month
    streamflow_time_name = config.get(
        'water_management.reservoirs.streamflow_time_resolution')

    # initialize to the average flow
    state.reservoir_release = grid.reservoir_streamflow_schedule.mean(
        dim=streamflow_time_name).values

    # TODO what is k
    k = state.reservoir_storage_operation_year_start / (
        parameters.reservoir_regulation_release_parameter *
        grid.reservoir_storage_capacity)

    # TODO what is factor
    factor = np.where(
        grid.reservoir_runoff_capacity >
        parameters.reservoir_runoff_capacity_condition,
        (2.0 / grid.reservoir_runoff_capacity)**2.0, 0)

    # release is some combination of prerelease, average flow in the time period, and total average flow
    state.reservoir_release = np.where(
        (grid.reservoir_use_electricity > 0) |
        (grid.reservoir_use_irrigation > 0),
        np.where(
            grid.reservoir_runoff_capacity <= 2.0,
            k *
            grid.reservoir_prerelease_schedule.sel({
                streamflow_time_name: month
            }).values,
            k * factor *
            grid.reservoir_prerelease_schedule.sel({
                streamflow_time_name: month
            }).values + (1 - factor) * grid.reservoir_streamflow_schedule.sel({
                streamflow_time_name:
                month
            }).values),
        np.where(
            grid.reservoir_runoff_capacity <= 2.0,
            k * grid.reservoir_streamflow_schedule.mean(
                dim=streamflow_time_name).values,
            k * factor * grid.reservoir_streamflow_schedule.mean(
                dim=streamflow_time_name).values +
            (1 - factor) * grid.reservoir_streamflow_schedule.sel({
                streamflow_time_name:
                month
            }).values))
Example #12
def reservoir_release(state, grid, config, parameters, current_time):
    # compute release from reservoirs

    # TODO so much logic was dependent on monthly, so still assuming monthly for now, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month

    # if it's the start of the operational year for the reservoir, set its start-of-year storage to the current storage
    state.reservoir_storage_operation_year_start = np.where(
        state.reservoir_month_start_operations == month,
        state.reservoir_storage, state.reservoir_storage_operation_year_start)

    regulation_release(state, grid, config, parameters, current_time)

    storage_targets(state, grid, config, parameters, current_time)
Example #13
def make_objects(metadata_file):

    #epiweeks = time.make_epiweeks()

    lineage_objects = []
    taxa = []
    tax_dict = {}
    tax_with_dates = []
    lineages_to_taxa = defaultdict(list)
    lin_obj_dict = {}

    with open(metadata_file) as f:
        next(f)
        for l in f:

            toks = l.strip("\n").split(",")

            tax_name = toks[0]
            country = toks[1]
            date = toks[3]
            epiweek = toks[4]
            lin_string = toks[5]

            metadata = [country, date, epiweek]

            new_taxon = classes.taxon(tax_name, lin_string, metadata)
            taxa.append(new_taxon)
            if new_taxon.date_dt != "NA":
                tax_with_dates.append(new_taxon)

            tax_dict[tax_name] = new_taxon

            lineages_to_taxa[lin_string].append(new_taxon)

    current_date = sorted(tax_with_dates, key=sortkey2,
                          reverse=True)[0].date_dt
    current_week = Week.fromdate(current_date)

    for lin, lin_specific_taxa in lineages_to_taxa.items():
        l_o = classes.lineage(lin, lin_specific_taxa, current_date,
                              current_week)

        lin_obj_dict[lin] = l_o

    lin_obj_dict = parse_travel_history(lin_obj_dict, tax_dict, metadata_file)

    return lin_obj_dict, taxa, current_date
Example #14
def cumulative_seqs_over_time(figdir, locations_to_dates, lineage):

    dates = []
    epiweek_lst = []

    for k, v in locations_to_dates.items():
        dates.extend(v)

    date_counts = Counter(dates)

    seq_number = 0
    cum_counts = {}
    for date, value in sorted(date_counts.items()):
        seq_number = seq_number + value
        cum_counts[date] = seq_number

    for i in dates:
        epiweek_lst.append(Week.fromdate(i).startdate())

    epiweek_counts = Counter(epiweek_lst)
    sorted_epiweeks = OrderedDict(sorted(epiweek_counts.items()))

    fig, ax1 = plt.subplots(1, 1, figsize=(12, 4))

    ax1.bar(list(sorted_epiweeks.keys()),
            list(sorted_epiweeks.values()),
            color="#86b0a6",
            width=5)
    ax2 = ax1.twinx()
    ax2.plot(list(cum_counts.keys()),
             list(cum_counts.values()),
             linewidth=3,
             color="dimgrey")
    # ylims = (0,4000)
    ax1.spines['top'].set_visible(False)
    ax2.spines['top'].set_visible(False)

    ax1.xaxis.set_tick_params(rotation=90)
    ax1.set_xlabel("Date")
    ax2.set_ylabel("Total")
    ax1.set_ylabel("Sequence count")
    # ax2.set_ylim(ylims)

    plt.savefig(os.path.join(
        figdir, f"Cumulative_sequence_count_over_time_{lineage}.svg"),
                format='svg',
                bbox_inches='tight')
Example #15
def map_to_week(df, date_column='date_today', groupby_target=None):
    """
    map date_today to week_id
    :param df: dataframe
    :type df: pandas.DataFrame
    :param date_column: column name related to date_today
    :type date_column: str
    :param groupby_target: group by date_today and sum over the groupby_target
    :type groupby_target: None or str or list
    :return: dataframe with week_id
    :rtype: pandas.DataFrame
    """
    df[date_column] = df[date_column].apply(lambda x: Week.fromdate(x).enddate() if pd.notna(x) else x)
    df[date_column] = pd.to_datetime(df[date_column])
    if groupby_target is not None:
        # group by the (now week-aligned) date column, not a hard-coded name
        df = df.groupby(date_column, as_index=False)[groupby_target].sum()
    return df
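
A minimal usage sketch (hypothetical data; dates in the same CDC epi week map to the same week-ending Saturday and are summed):

import pandas as pd
from epiweeks import Week

df = pd.DataFrame({
    'date_today': pd.to_datetime(['2021-01-04', '2021-01-05', '2021-01-12']),
    'count': [1, 2, 3],
})
weekly = map_to_week(df, date_column='date_today', groupby_target='count')
# two rows remain: 2021-01-09 with count 3, and 2021-01-16 with count 3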
Example #16
def date_string_to_epi_day(date_string):
    """
    parse a date string in YYYY-MM-DD format and return
    the cumulative epi day, i.e. the total number of days since 2019-12-22
    """
    try:
        date = datetime.strptime(date_string, '%Y-%m-%d').date()
    except ValueError:
        return ""
    # this is the epi week:
    week = Week.fromdate(date)
    # this is day 1 of epi-week 0:
    day_one = datetime.strptime("2019-12-22", '%Y-%m-%d').date()
    if week.year < 2019 or (week.year == 2019 and week.week < 52):
        return ""
    else:
        cum_epi_day = (date - day_one).days + 1
        return str(cum_epi_day)
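
A few hedged spot checks of the boundary behavior (assumes datetime and Week are imported as in the function above):

print(date_string_to_epi_day("2019-12-22"))   # "1"  (day 1 of epi-week 0)
print(date_string_to_epi_day("2019-12-01"))   # ""   (before epi-week 52 of 2019)
print(date_string_to_epi_day("not-a-date"))   # ""   (unparseable input)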
Example #17
def collate_diffs(encoder, regions, typos, mask):
    """
    Stream output from encode_diffs to collate the incidence of each
    genetic difference by location and date, and return as a tabular
    data set.
    :param encoder:  generator, returned by encode_diffs()
    :param regions:  dict, counts keyed by region, country and collection date
    :param typos:  dict, passed to parse_header() along with regions
    :param mask:  dict keyed by genome position, with 'alt' listing substitutions to exclude
    :return:  nested dict of difference counts keyed by region, country and "year|week"
    """
    res = {}
    for qname, diffs, missing in filter_outliers(encoder):
        region, country, coldate = parse_header(qname, regions, typos)
        if coldate is None:
            continue

        coldate = parse_date(coldate)
        epiweek = Week.fromdate(coldate).week
        year = coldate.year
        yeek = '{}|{:02d}'.format(year, epiweek)

        # update nested dictionaries
        if region not in res:
            res.update({region: {}})
        if country not in res[region]:
            res[region].update({country: {}})
        if yeek not in res[region][country]:
            res[region][country].update({yeek: {}})

        # iterate through genetic differences in this genome
        branch = res[region][country][yeek]  # shorthand
        for diff in diffs:
            typ, pos, alt = diff
            if typ == '~' and int(pos) in mask and alt in mask[pos]['alt']:
                # masked substitution
                continue
            if typ != '-' and 'N' in alt:
                # ignore substitutions and insertions with uncalled bases
                continue
            key = '|'.join(map(str, diff))
            if key not in branch:
                branch.update({key: 0})
            branch[key] += 1

    return res
Example #18
def date_string_to_epi_week(date_string):
    """
    parse a date string in YYYY-MM-DD format and return
    cumulative epi week which is cumulative total epidemiological
    weeks since 2019-12-22. Week beginning 2019-12-22 is week 0
    """
    try:
        date = datetime.strptime(date_string, '%Y-%m-%d').date()
    except ValueError:
        return ""
    # this is the epi week:
    week = Week.fromdate(date)
    if week.year < 2019 or (week.year == 2019 and week.week < 52):
        return ""
    elif week.year == 2019:
        return "0"
    else:
        cum_epi_week = week.week + len(
            list(
                chain(*[[x for x in Year(y).iterweeks()]
                        for y in range(2020, week.year)])))
        return str(cum_epi_week)
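
A couple of hedged spot checks against the CDC week boundaries:

print(date_string_to_epi_week("2019-12-22"))   # "0"  (the week beginning 2019-12-22)
print(date_string_to_epi_week("2020-01-05"))   # "2"  (second epi week of 2020)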
Example #19
def generate_week_periods(open_future_periods, page_limit, begin_period, direction, direction_change):
    weeks_to_display = {}

    # When the user first visits the period screen the begin_period variable is empty.
    # Therefore, use the current week as default.
    week = Week.thisweek("iso") + open_future_periods

    # If begin_period variable has a date, use it to calculate the weeks to display.
    if begin_period != '':
        week = Week.fromdate(datetime.datetime.strptime(begin_period, '%Y-%m-%d'), 'iso')
        # This logic fixes the week discrepancy when a user clicks + then changes direction and presses -, or vice versa
        if direction_change:
            if direction == '+':
                week += page_limit - 1
            if direction == '-':
                week -= page_limit - 1

    # We should not open future dates for data entry. The -1 is to prevent from opening this week.
    if direction == '+' and week + page_limit > Week.thisweek("iso") + open_future_periods:
        week = Week.thisweek("iso") + open_future_periods - page_limit - 1

    rng = range(page_limit, 0, -1) if direction == '+' else range(page_limit)

    for key, i in enumerate(rng):
        w = week + i if direction == '+' else week - (i + 1)
        weeks_to_display[str(key + 1)] = {
            "period": w.isoformat(),
            "display": "W{} - {} - {}".format(w.weektuple()[1], w.startdate(), w.enddate())
        }

        # Take the first week to calculate the beginning period in the next screen.
        if direction == '+' and i == page_limit:
            begin_period = str(w.enddate())
        # Take the final week to calculate the beginning week in the next screen.
        if direction == '-' and i == page_limit - 1:
            begin_period = str(w.startdate())

    return begin_period, weeks_to_display
Example #20
    def get_period_from_date(self, year, date):
        leap = self.get_leap(year)
        epi_date = date + timedelta(days=leap.days)
        epi_week = Week.fromdate(epi_date)
        week = epi_week.week
        return math.ceil(week / 3)
Example #21
def storage_targets(state: State, grid: Grid, config: Benedict,
                    parameters: Parameters, current_time: datetime) -> None:
    """Define the necessary drop in storage based on the reservoir storage targets at the start of the month.

    Args:
        state (State): the model state
        grid (Grid): the model grid
        config (Config): the model configuration
        parameters (Parameters): the model parameters
        current_time (datetime): the current simulation time
    """

    # TODO the logic here is really hard to follow... can it be simplified or made more readable?

    # TODO this is still written assuming monthly, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month
    streamflow_time_name = config.get(
        'water_management.reservoirs.streamflow_time_resolution')

    # if flood control active and has a flood control start
    flood_control_condition = (grid.reservoir_use_flood_control > 0) & (
        state.reservoir_month_flood_control_start > 0)
    # modify release in order to maintain a certain storage level
    month_condition = state.reservoir_month_flood_control_start <= state.reservoir_month_flood_control_end
    total_condition = flood_control_condition & (
        (month_condition &
         (month >= state.reservoir_month_flood_control_start) &
         (month < state.reservoir_month_flood_control_end)) |
        (np.logical_not(month_condition) &
         (month >= state.reservoir_month_flood_control_start) |
         (month < state.reservoir_month_flood_control_end)))
    drop = 0 * state.reservoir_month_flood_control_start
    n_month = 0 * drop
    for m in np.arange(1, 13):  # TODO assumes monthly
        m_and_condition = (m >= state.reservoir_month_flood_control_start) & (
            m < state.reservoir_month_flood_control_end)
        m_or_condition = (m >= state.reservoir_month_flood_control_start) | (
            m < state.reservoir_month_flood_control_end)
        drop = np.where(
            (month_condition & m_and_condition) |
            (np.logical_not(month_condition) & m_or_condition),
            np.where(
                grid.reservoir_streamflow_schedule.sel({
                    streamflow_time_name: m
                }).values >= grid.reservoir_streamflow_schedule.mean(
                    dim=streamflow_time_name).values, drop + 0, drop + np.abs(
                        grid.reservoir_streamflow_schedule.mean(
                            dim=streamflow_time_name).values -
                        grid.reservoir_streamflow_schedule.sel({
                            streamflow_time_name:
                            m
                        }).values)), drop)
        n_month = np.where((month_condition & m_and_condition) |
                           (np.logical_not(month_condition) & m_or_condition),
                           n_month + 1, n_month)
    state.reservoir_release = np.where(
        total_condition & (n_month > 0),
        state.reservoir_release + drop / n_month, state.reservoir_release)
    # now make sure the reservoir will fill back up, though spilling remains an issue in certain hydro-climate conditions
    month_condition = state.reservoir_month_flood_control_end <= state.reservoir_month_start_operations
    first_condition = flood_control_condition & month_condition & (
        (month >= state.reservoir_month_flood_control_end) &
        (month < state.reservoir_month_start_operations))
    second_condition = flood_control_condition & np.logical_not(
        month_condition) & (
            (month >= state.reservoir_month_flood_control_end) |
            (month < state.reservoir_month_start_operations))
    # TODO this logic exists in fortran mosart but isn't used...
    # fill = 0 * drop
    # n_month = 0 * drop
    # for m in np.arange(1,13): # TODO assumes monthly
    #     m_condition = (m >= self.state.reservoir_month_flood_control_end.values) &
    #         (self.reservoir_streamflow_schedule.sel({streamflow_time_name: m}).values > self.reservoir_streamflow_schedule.mean(dim=streamflow_time_name).values) & (
    #             (first_condition & (m <= self.state.reservoir_month_start_operations)) |
    #             (second_condition & (m <= 12))
    #         )
    #     fill = np.where(
    #         m_condition,
    #         fill + np.abs(self.reservoir_streamflow_schedule.mean(dim=streamflow_time_name).values - self.reservoir_streamflow_schedule.sel({streamflow_time_name: m}).values),
    #         fill
    #     )
    #     n_month = np.where(
    #         m_condition,
    #         n_month + 1,
    #         n_month
    #     )
    state.reservoir_release = np.where(
        (state.reservoir_release > grid.reservoir_streamflow_schedule.mean(
            dim=streamflow_time_name).values) &
        (first_condition | second_condition),
        grid.reservoir_streamflow_schedule.mean(
            dim=streamflow_time_name).values, state.reservoir_release)
Example #22
def get_epiweeks(date):
    date = pd.to_datetime(date)
    epiweek = str(Week.fromdate(date, system="cdc"))  # get epiweeks
    epiweek = epiweek[:4] + '_' + 'EW' + epiweek[-2:]
    return epiweek
Example #23
import pandas as pd
from epiweeks import Week, Year

df1 = pd.read_csv('data-truth/JHU/truth_JHU-Incident Deaths.csv')
df1.rename(columns={'value': 'inc_death'}, inplace=True)

df2 = pd.read_csv('data-truth/JHU/truth_JHU-Incident Cases.csv')
df2.rename(columns={'value': 'inc_case'}, inplace=True)

# merge cases and deaths into one dataframe
df = df1.merge(df2, on=['date', 'location', 'location_name'])

# add epi weeks for aggregation
df.date = pd.to_datetime(df.date)
df['epi_week'] = df.date.apply(lambda x: Week.fromdate(x).week)
df['epi_year'] = df.date.apply(lambda x: Week.fromdate(x).year)

# aggregate to weekly incidence
df = df.groupby(['location', 'location_name', 'epi_year', 'epi_week']).aggregate(
    {'date': max, 'inc_death': sum, 'inc_case':sum}).reset_index()

# only keep Saturdays
df = df[df.date.dt.day_name() == 'Saturday']

# reformat
df = df[['date', 'location', 'location_name', 'inc_case', 'inc_death']].sort_values(['date', 'location'])

# export
df.to_csv('viz/truth_to_plot.csv', index=False)
Example #24
def fromDateTime2EW(dt):
    # Week.fromdate() takes a date/datetime object, not (year, month, day);
    # zero-pad so single-digit weeks render as two characters
    w = Week.fromdate(dt)
    return "{:04d}{:02d}".format(w.year, w.week)
Example #25
def parse_background_metadata(query_dict, label_fields, tree_fields, table_fields, background_metadata, present_in_tree, closest_sequences, node_summary_option, tip_to_tree, database_name_column, database_sample_date_column, protected_sequences, context_table_summary_field, date_fields, virus):

    full_tax_dict = query_dict.copy()

    with open(background_metadata, 'r') as f:
        reader = csv.DictReader(f)
        col_name_prep = next(reader)
        col_names = list(col_name_prep.keys())

    old_data = False
    with open(background_metadata, 'r') as f:
        in_data = csv.DictReader(f)
        for sequence in in_data:
            
            seq_name = sequence[database_name_column]
            date = sequence[database_sample_date_column] 
            country = sequence["country"]

            if "adm2_raw" not in col_names: ##for civet
                old_data = True

            if "adm2" in col_names:
                adm2 = sequence['adm2']
                if "|" in adm2:
                    adm2 = "|".join(sorted(adm2.split("|")))

                if "location" in col_names:
                    location_label = sequence["location"]
                else:
                    location_label = adm2

                adm2_present_in_background = True
            else:
                adm2 = ""
                location_label = ""
                adm2_present_in_background = False

            # if virus == "sars-cov-2":	
            #     uk_lineage = sequence["uk_lineage"]	
            #     global_lineage = sequence["lineage"]	
            #     phylotype = sequence["phylotype"]	

            if node_summary_option == "adm2":
                if country != "UK":
                    node_summary_trait = country 
                else:
                    node_summary_trait = sequence["adm2"] 
            else:
                node_summary_trait = sequence[node_summary_option]

            if (seq_name in present_in_tree or seq_name in closest_sequences) and seq_name not in query_dict.keys():
                
                # if virus == "sars-cov-2":	
                #     new_taxon = taxon(seq_name, country, label_fields, tree_fields, table_fields, global_lineage=global_lineage, uk_lineage=uk_lineage, phylotype=phylotype)	
                # else:	
                new_taxon = taxon(seq_name, country, label_fields, tree_fields, table_fields)

                if date == "":
                    date = "NA"
                
                new_taxon.sample_date = date
                new_taxon.node_summary = node_summary_trait
                new_taxon.epiweek = Week.fromdate(convert_date(date))

                if new_taxon.name in protected_sequences:
                    new_taxon.protected = True

                if seq_name in tip_to_tree.keys():
                    new_taxon.tree = tip_to_tree[seq_name]

                new_taxon.attribute_dict["adm2"] = adm2
                new_taxon.attribute_dict["location_label"] = location_label

                new_taxon.input_display_name = seq_name

                for field in label_fields:
                    if field in col_names:
                        if sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            new_taxon.attribute_dict[field] = sequence[field]

                if context_table_summary_field and context_table_summary_field in col_names:
                    if sequence[context_table_summary_field] != "":
                        new_taxon.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]

                for field in table_fields:
                    if field in col_names:
                        if sequence[field] != "NA" and sequence[field] != "":
                            new_taxon.table_dict[field] = sequence[field]

                full_tax_dict[seq_name] = new_taxon

            

            #There may be sequences not in COG tree but that are in the full metadata, so we want to pull out the additional information if it's not in the input csv
            if seq_name in query_dict.keys(): 
                tax_object = query_dict[seq_name]
                if tax_object.sample_date == "NA" and date != "" and date != "NA":
                    tax_object.sample_date = date
                    converted = convert_date(date)
                    tax_object.all_dates.append(converted)
                    tax_object.epiweek = Week.fromdate(converted)

                
                if "adm2" not in tax_object.attribute_dict.keys() and adm2 != "":
                    tax_object.attribute_dict["adm2"] = adm2
                if "location_label" not in tax_object.attribute_dict.keys() and location_label != "":
                    tax_object.attribute_dict["location_label"] = location_label

                if context_table_summary_field and context_table_summary_field in col_names:
                    if sequence[context_table_summary_field] != "" and tax_object.attribute_dict["context_table_summary_field"] == "NA":
                        tax_object.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]

                for field in date_fields:
                    if field in reader.fieldnames:
                        if sequence[field] != "" and sequence[field] != "NA" and field not in tax_object.date_dict.keys():
                            date_dt = convert_date(sequence[field])
                            tax_object.date_dict[field] = date_dt 
                    

                for field in tree_fields:
                    if field in col_names:
                        if tax_object.attribute_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            if field != "adm1":
                                tax_object.attribute_dict[field] = sequence[field]
                            else:
                                if country == "UK":
                                    adm1 = UK_adm1(tax_object.name,sequence[field])
                                else:
                                    adm1 = "Other"
                                tax_object.attribute_dict[field] = adm1

                for field in label_fields:
                    if field in col_names:
                        if tax_object.attribute_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            tax_object.attribute_dict[field] = sequence[field]

                for field in table_fields:
                    if field in col_names:
                        if tax_object.table_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            tax_object.table_dict[field] = sequence[field]


                # if virus == "sars-cov-2":
                #     tax_object.global_lineage = global_lineage
                #     tax_object.uk_lineage = uk_lineage
                #     tax_object.phylotype = phylotype


                full_tax_dict[seq_name] = tax_object
                    
    return full_tax_dict, adm2_present_in_background, old_data
Example #26
def parse_input_csv(input_csv, query_id_dict, input_column, display_name, sample_date_column, tree_fields, label_fields, table_fields, context_table_summary_field, date_fields=None, UK_adm2_dict=None, patient_id_col=None, reinfection=False):
    
    full_query_count = 0
    new_query_dict = {}
    
    with open(input_csv, 'r') as f:
        reader = csv.DictReader(f)
        col_name_prep = next(reader)
        col_names = list(col_name_prep.keys())
    
    with open(input_csv, 'r') as f:
        in_data = csv.DictReader(f)
        #in_data = [r for r in reader]
        for sequence in in_data:
            full_query_count += 1
            name = sequence[input_column]

            if name in query_id_dict.keys():
                taxon = query_id_dict[name]

                if reinfection:
                    taxon.attribute_dict["patient"] = sequence[patient_id_col]

                taxon.input_display_name = sequence[display_name]

                for field in (date_fields or []):
                    if field in reader.fieldnames:
                        if sequence[field] != "" and sequence[field] != "NA":
                            date_dt = convert_date(sequence[field])
                            taxon.date_dict[field] = date_dt 

                if sample_date_column in col_names: #if it's not in the background database or there is no date in the background database but date is provided in the input query
                    if sequence[sample_date_column] != "":
                        taxon.sample_date = sequence[sample_date_column]
                        taxon.epiweek = Week.fromdate(convert_date(sequence[sample_date_column]))

                if context_table_summary_field and context_table_summary_field in col_names:
                    if sequence[context_table_summary_field] != "":
                        taxon.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]
                     
                for col in col_names: #Add other metadata fields provided
                    if col in table_fields:
                        if sequence[col] != "":
                            taxon.table_dict[col] = sequence[col]
                    
                    if col in label_fields:
                        if sequence[col] != "":
                            taxon.attribute_dict[col] = sequence[col]
                    
                    if col in tree_fields and col != input_column and col != "adm1":
                        if sequence[col] != "":
                            taxon.attribute_dict[col] = sequence[col]
                    
                    if taxon.country == "UK": 
                        if col == "adm1":
                            adm1 = UK_adm1(name, sequence[col])
                            taxon.attribute_dict["adm1"] = adm1

                        if col == "adm2":

                            adm2 = sequence["adm2"] 
                            if "|" in adm2:
                                adm2 = "|".join(sorted(adm2.split("|")))
                            
                            taxon.attribute_dict["adm2"] = adm2 

                            if "location" in col_names:
                               location_label = sequence["location"]
                            else:
                                location_label = adm2

                            taxon.attribute_dict["location_label"] = location_label
                            
                            if "adm1" not in col_names and "adm1" in tree_fields:
                                if sequence[col] in UK_adm2_dict.keys():
                                    adm1 = UK_adm2_dict[sequence[col]]
                                    taxon.attribute_dict["adm1"] = adm1               

                new_query_dict[taxon.name] = taxon

      
    return new_query_dict, full_query_count 
Example #27
    # drop columns where 'country' and 'country_exposure' disagree
    dfN['same_country'] = np.where(dfN['country'] == dfN['country_exposure'],
                                   'yes', 'no')  # compare values
    dfN.loc[dfN['country_exposure'] == '', 'same_country'] = 'yes'
    dfN = dfN[dfN['same_country'].apply(
        lambda x: 'yes' in x)]  # exclude rows with conflicting place of origin
    # print(dfN[['same_country', 'country', 'country_exposure']])
    dfN = dfN.drop(columns=['same_country'])  # assign back: DataFrame.drop() is not in-place by default

    # print(dfN[['strain', 'date']].iloc[[0, -1]])

    # get epiweek end date, create column
    dfN['date'] = pd.to_datetime(dfN['date'], errors='coerce')
    dfN['epiweek'] = dfN['date'].apply(
        lambda x: Week.fromdate(x, system="cdc").enddate())

    ## SAMPLE FOCAL AND CONTEXTUAL SEQUENCES
    purposes = ['focus', 'context']
    subsamplers = []  # list of focal and contextual categories
    for category in purposes:
        query = {}
        for idx, val in dfS.loc[dfS['purpose'] == category,
                                'name'].to_dict().items():
            key = dfS.iloc[idx]['level']
            if key not in query.keys():
                query[key] = [val]
            else:
                query[key].append(val)
        # print(query)
        subsamplers.append(query)
Example #28
import sys

forecast_start = sys.argv[2]
samples_directory = sys.argv[3]

import numpy as np
import pandas as pd
from epiweeks import Week, Year
import util  # project-local helper module, assumed available alongside this script

num_weeks = 8
data = util.load_state_data()
places = sorted(list(data.keys()))
#places = ['AK', 'AL']

allQuantiles = [0.01, 0.025] + list(np.arange(0.05, 0.95 + 0.05,
                                              0.05)) + [0.975, 0.99]

forecast_date = pd.to_datetime(forecast_start)
currentEpiWeek = Week.fromdate(forecast_date)

forecast = {
    'quantile': [],
    'target_end_date': [],
    'value': [],
    'type': [],
    'location': [],
    'target': []
}

for place in places:
    prior_samples, mcmc_samples, post_pred_samples, forecast_samples = util.load_samples(
        place, path=samples_directory)
    forecast_samples = forecast_samples['mean_z_future']
    t = pd.date_range(start=forecast_start,
Example #29
    def get_week_from_date(self, year, date):
        leap = self.get_leap(year)
        epi_date = date + timedelta(days=leap.days)
        epi_week = Week.fromdate(epi_date)
        return epi_week.week
Example #30
def plotTop40(config, catalog):
    from datetime import datetime
    from epiweeks import Week
    
    pivot = 'artist'
    # we rank artists by most plays, take the top 25 and 
    # add their played tracks to the playlist.
    #pivot = config['pivot'] if 'pivot' in config else 'artist'

    result = {}
    print(f'Grouping by {pivot}')
    for r in catalog:
        if r['track'] is None:
            continue
        artistid = r[pivot]['name']
        if pivot == 'track':
            artistid += ';;'+r['artist']['name']
        week = Week.fromdate(datetime.strptime(r['airdate'], '%Y-%m-%dT%H:%M:%SZ'))
        if week not in result:
            result[week]={}
        if artistid not in result[week]:
            result[week][artistid]={'track':r, 'plays':set(), 'songs':set()}
        
        result[week][artistid]['plays'].add(r['airdate']) # count by unique timestamps. Sometimes the playlist has duplicates.
        result[week][artistid]['songs'].add((r['artist']['name'],r['track']['name']))
    print([w for w in result])
    
    all_results = result
    if 0:
        all_artists = []
        plots = {}
        N=10
        W=20
        weeks = list(sorted(all_results))[-W:]
        for w in weeks:
            result = all_results[w]
            topN = list(sorted(result, key=lambda x: len(result[x]['plays']), reverse=True))[:N]
            print(topN)
            for i,a in enumerate(topN):
                if a not in all_artists:
                    all_artists.append(a)
                if a not in plots:
                    plots[a]=[]
                plots[a].append((w,i))
            #ranks.append(topN)
        y0_values = dict((k,N-v+1) for v,k in enumerate(all_artists))
        x_values = dict([(k,v+1) for (v,k) in enumerate(weeks)])
        import matplotlib.pyplot as plt
        from math import isnan
        fig, ax = plt.subplots(figsize=(12,8)) #subplot_kw=dict(axisbg='#EEEEEE'))
        for a in plots:
            lookup = dict(plots[a])
            X = [x_values[w] for w in weeks] #[x_values[w] for (w,_) in plots[a]]
            Y = [N-lookup[w] if w in lookup else float('NaN') for w in weeks] #N-v for (_,v) in plots[a]]
            ax.plot(X, Y, 'o-')
            for i in range(len(X)):
                if i==0 or (not isnan(Y[i]) and isnan(Y[i-1])):
                    ax.text(X[i],Y[i]+0.15, a, ha='center', fontsize=8)
            ax.set_yticks(range(N+1))
            ax.set_yticklabels(['']+[str(N-y) for y in range(N)])
        plt.show()

    threshold = Week.fromdate(datetime(2020,5,26))
    summary = {}
    from collections import Counter
    denom = Counter()
    for w in all_results:
        isprior = w<threshold
        denom[isprior]+=1
        current = all_results[w]
        for a in current:
            if a not in summary:
                summary[a]=Counter()
            summary[a][isprior]+=len(current[a]['plays'])
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(12,8)) 
    labels = []
    N=20
    for i,a in enumerate(list(sorted(summary, key = lambda x: summary[x][True]+summary[x][False]))[-N:]):
        ax.plot([2*i-0.5,2*i+0.5], [summary[a][True]/denom[True], summary[a][False]/denom[False]],'o-')
        labels.append(a)
        print('{}\t{}\t{}'.format(a, summary[a][True]/denom[True], summary[a][False]/denom[False]))
    ax.set_xticks([2*i for i in range(N)])
    ax.set_xticklabels(labels, rotation=90)
    ax.set_ylabel('Plays per Week')
    plt.tight_layout()
    plt.savefig('playsPerWeek.png')
    plt.show()

    if 0:
        import mpld3
        from math import log, ceil

        data = [(x,y,ctr[(x,y)]) for (x,y) in ctr]
        x_track=[d[0] for d in data]
        y_artist=[d[1] for d in data]
        count = [len(d[2]) for d in data]
        labels = [','.join(d[2]) for d in data]
        
        fig, ax = plt.subplots(figsize=(4,2)) #subplot_kw=dict(axisbg='#EEEEEE'))
        scatter = ax.scatter(x_track, y_artist, s=[3*log(c+1) for c in count])
        xint = range(min(x_track), ceil(max(x_track))+1)
        #ax.set_xticks(xint)
        ax.set_xlabel('Track Plays')
        ax.set_ylabel('Artist Plays')
        tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
        mpld3.plugins.connect(fig, tooltip)
        mpld3.show()