# If the dataframe has enough rows and spans enough regions, we found a good one
    if len(data) > 10 and len(data['RegionName'].unique()) > 3:
        break
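
# Note: these excerpts omit their imports; they rely on `import re`, pandas
# helpers such as DataFrame, isna and isnull, and the project's own
# safe_int_cast / safe_datetime_parse utilities defined elsewhere.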

# Convert all dates to ISO format
data['Date'] = data['Date'].apply(lambda date: date.date().isoformat())
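# e.g. a parsed value of 2020-03-01 00:00:00 becomes the string '2020-03-01'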


def parenthesis(x):
    # Split a cell like "1234 (56)" into the text outside the parentheses and
    # the digits inside them (None when there is no parenthesized number)
    regexp = r'\((\d+)\)'
    return re.sub(regexp, '', x), (re.search(regexp, x) or [None, None])[1]
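
# Illustrative behaviour of the helper above (not part of the original
# snippet), verifiable from the regular expression:
#   parenthesis('1234 (56)')  ->  ('1234 ', '56')
#   parenthesis('1234')       ->  ('1234', None)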


# Get the confirmed and deaths data from the table
data['Confirmed'] = data['Value'].apply(
    lambda x: safe_int_cast(parenthesis(x)[0]))
data['Deaths'] = data['Value'].apply(
    lambda x: safe_int_cast(parenthesis(x)[1]))
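
# safe_int_cast is one of the project helpers mentioned above; a minimal
# sketch, assuming it simply returns None for values that cannot be parsed:
#
#   def safe_int_cast(value):
#       try:
#           return int(value)
#       except (TypeError, ValueError):
#           return None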


def aggregate_region_values(group: DataFrame):
    # Sum the non-null values of the group, or return None if every value is null
    non_null = [value for value in group if not (isna(value) or isnull(value))]
    return None if not non_null else sum(non_null)
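
# Illustrative behaviour (not in the original snippet):
#   a group of [10, None, 5]  ->  15
#   a group of [None, None]   ->  None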


# Add up all the rows with same Date and RegionName
data = data.sort_values(['Date', 'RegionName'])
data = data.drop(columns=['Value']).groupby(
    ['RegionName', 'Date']).agg(aggregate_region_values)
data = data.reset_index().sort_values(['Date', 'RegionName'])
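
# Worked toy example of the extraction + aggregation steps above (illustrative,
# not part of the original snippet; 'Alsace' is a made-up region name):
#   Value = ['10 (1)', '5'] for ('Alsace', '2020-03-01')
#   -> Confirmed = [10, 5], Deaths = [1, None]
#   -> after groupby/agg: one row with Confirmed = 15, Deaths = 1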
Example #2
def fix_temp(value: int):
    # The raw value appears to be in tenths; render it with one decimal place
    value = safe_int_cast(value)
    return None if value is None else '%.1f' % (value / 10.)
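
# Illustrative behaviour (not in the original snippet):
#   fix_temp(123)    ->  '12.3'
#   fix_temp('bad')  ->  None, assuming safe_int_cast returns None on failure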
Example #3
# Parse into datetime object, drop if not possible
df['Date'] = df['Date'].apply(lambda date: safe_datetime_parse(date, date_format))
df = df[~df['Date'].isna()]
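
# safe_datetime_parse is a project helper not shown in this excerpt; a minimal
# sketch, assuming it wraps datetime.strptime and returns None on failure
# (which is what makes the isna() filter above drop unparseable rows):
#
#   from datetime import datetime
#
#   def safe_datetime_parse(value, date_format):
#       try:
#           return datetime.strptime(str(value), date_format)
#       except ValueError:
#           return None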

# Convert all dates to ISO format
df['Date'] = df['Date'].apply(lambda date: date.date().isoformat())


def parenthesis(x):
    regexp = r'\((\d+)\)'
    return re.sub(regexp, '', x), (re.search(regexp, x) or [None, None])[1]


# Get the confirmed and deaths data from the table
df['Confirmed'] = df['Value'].apply(lambda x: safe_int_cast(parenthesis(x)[0]))
df['Deaths'] = df['Value'].apply(lambda x: safe_int_cast(parenthesis(x)[1]))


def aggregate_region_values(group: DataFrame):
    non_null = [value for value in group if not (isna(value) or isnull(value))]
    return None if not non_null else sum(non_null)


# Add up all the rows with same Date and RegionName
df = df.sort_values(['Date', 'RegionName'])
df = df.drop(columns=['Value']).groupby(['RegionName', 'Date']).agg(aggregate_region_values)
df = df.reset_index().sort_values(['Date', 'RegionName'])

# Compute cumsum of the values region by region
value_columns = ['Confirmed', 'Deaths']
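# A typical way to finish this step (sketch, not necessarily the original
# code): accumulate each value column within each region, e.g.
#   for column in value_columns:
#       df[column] = df.groupby('RegionName')[column].cumsum()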