def test_order_by():
    """Ordering by ``clinic_id`` descending must reverse rows built in ascending order.

    The rows are created with first elements 0..4, so after the descending
    sort the first elements are expected to read 4, 3, 2, 1, 0.
    """
    rows = [(idx, 1, 2) for idx in range(5)]
    dataset = DataSet(rows)

    dataset.order_by('clinic_id', descending=True)

    for position, expected_first in enumerate(range(4, -1, -1)):
        assert dataset[position][0] == expected_first
def test_aggregate_on():
    """Aggregating rows that all share the same first element yields a single group.

    Every row starts with 1 and the key function is the identity, so the
    aggregation is expected to map 1 to the dataset's full ``data``.
    """
    rows = [(1, 2, n) for n in range(5)]
    dataset = DataSet(rows)
    # Capture the stored rows before aggregating, as the expectation is
    # expressed in terms of whatever DataSet keeps internally.
    stored_rows = dataset.data

    grouped = dataset.aggregate_on('clinic_id', key=lambda value: value)

    assert grouped == {1: stored_rows}
def test_aggregate_on_dates(test_data):
    """Aggregating on ``date_received`` by calendar day yields the expected group sizes.

    :param test_data: rows used to build the DataSet (supplied by the caller,
        presumably a pytest fixture).
    """
    dataset = DataSet(test_data)

    by_day = dataset.aggregate_on('date_received', key=lambda d: str(d.date()))

    # Expected number of rows per day, checked in chronological order.
    expected_sizes = (
        ('2020-01-01', 1),
        ('2020-01-03', 2),
        ('2020-01-04', 1),
        ('2020-01-05', 1),
        ('2020-01-07', 4),
        ('2020-01-11', 2),
    )
    for day, size in expected_sizes:
        assert len(by_day[day]) == size
Ejemplo n.º 4
0
def run(years=1, year_target=5, epochs=100, epochs_target=500):
    """
    Time a small training run and extrapolate it to a larger workload.

    Generates ``years`` year(s) of data ending on 2020-01-01, trains a
    RadiusVariance model on it for ``epochs`` epochs, and prints an estimate
    of how long training ``year_target`` years for ``epochs_target`` epochs
    would take, assuming training time scales linearly with both quantities.

    :param years: Number of years of data to generate and train on (> 0).
    :param year_target: Number of years of data to extrapolate to.
    :param epochs: Number of epochs to actually train for (> 0).
    :param epochs_target: Number of epochs to extrapolate to.
    :raises ValueError: If ``years`` or ``epochs`` is not positive.
    """
    # Both values are divisors in the extrapolation; fail before doing any
    # expensive work instead of with a ZeroDivisionError after training.
    if years <= 0 or epochs <= 0:
        raise ValueError('years and epochs must both be positive')

    print(f'Generating {years} year(s) of data...')
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments.
    data_start = time.perf_counter()

    data = create_data(datetime(2020 - years, 1, 1), datetime(2020, 1, 1))
    dataset = DataSet(data)

    print('Data generation done')
    print(f'Took {time.perf_counter() - data_start}s\n')

    model = RadiusVariance(seq_size=30, radius=15)
    ml_dataset = model.create_ml_dataset(dataset)

    print(f'Training for {epochs} epochs...')
    train_start = time.perf_counter()
    # Fetch the model once so compile() and fit() are guaranteed to operate
    # on the same object.
    keras_model = model.get_model()
    # `learning_rate` is the supported argument name; `lr` is deprecated.
    keras_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    keras_model.fit(x=ml_dataset.inputs,
                    y=ml_dataset.outputs,
                    epochs=epochs)
    print('Training done')
    train_elapsed = time.perf_counter() - train_start
    print(f'Took {int(train_elapsed)}s\n')

    # Extrapolate from the un-truncated duration so that a sub-second
    # training run does not collapse the estimate to zero.
    total_time = train_elapsed * (year_target / years) * (epochs_target / epochs)

    print(
        f'It would take this machine an estimated {int(total_time // 60)}m{int(total_time % 60)}s '
        + f'to train {year_target} years of data for {epochs_target} epochs.')
def test_create_ml_dataset_empty():
    """An empty DataSet must produce an MLDataSet whose arrays are all empty.

    The second input is expected to keep its 43 columns and the output its
    2 columns even though there are no rows.
    """
    empty_ml = RadiusVariance(seq_size=3, radius=2).create_ml_dataset(DataSet([]))

    first_input, second_input = empty_ml.inputs[0], empty_ml.inputs[1]
    first_output = empty_ml.outputs[0]

    assert np.array_equal(first_input, np.array([]))
    assert np.array_equal(second_input, np.empty((0, 43)))
    assert np.array_equal(first_output, np.empty((0, 2)))
Ejemplo n.º 6
0
def visualize_dataset_arrivals(dataset: DataSet, output_file: str):
    """
    Plot the number of arrivals per day and save the figure to a file.

    The dataset is ordered by ``date_received`` (via ``order_by``), its rows
    are grouped by calendar day, and one point is plotted for every day
    yielded by ``_date_range`` between the first and last row; days without
    any rows are plotted as 0.

    NOTE(review): the first and last rows are read unconditionally, so this
    presumably requires a non-empty dataset - confirm against callers.

    :param dataset: The DataSet whose arrivals are plotted.
    :param output_file: Path passed to ``plt.savefig``.
    """
    dataset.order_by('date_received')
    first = dataset[0].date_received
    last = dataset[-1].date_received
    agg = dataset.aggregate_on('date_received', lambda dr: str(dr.date()))

    days = list(_date_range(first, last))
    # Build each day's aggregation key once instead of twice per day.
    day_keys = [str(day.date()) for day in days]
    counts = [len(agg[key]) if key in agg else 0 for key in day_keys]

    plt.plot_date(days, counts, markersize=2)
    plt.gcf().autofmt_xdate()
    plt.savefig(output_file)
    def create_ml_dataset(self, dataset: DataSet) -> MLDataSet:
        """
        Build a MLDataSet compatible with RadiusVariance using the provided DataSet.

        For every date yielded by ``self._date_range`` between
        ``first + radius`` and ``last - radius`` the following is produced:

        * input 0: the number of rows received on that date,
        * input 1: a 43-wide one-hot vector (indices 0-11 encode the month,
          indices 12-42 encode the day of the month),
        * output: ``[count, variance]`` where the variance is taken over the
          counts of the ``2 * radius + 1`` dates centred on that date.

        An empty DataSet yields arrays with zero rows.

        :param dataset: The DataSet to construct the MLDataSet from.
        :return: The MLDataSet
        """
        # Defaults for the empty-dataset path, so nothing below ever touches
        # an unbound name.
        date_aggregation = {}
        date_range = []

        if len(dataset):
            dataset.order_by('date_received')
            # Trim `radius` steps from both ends so that every remaining date
            # has a full window inside the observed range.
            min_date = dataset[0].date_received + self._timedelta(self.radius)
            max_date = dataset[-1].date_received - self._timedelta(self.radius)

            date_aggregation = dataset.aggregate_on('date_received', self._datestr)
            date_range = list(self._date_range(min_date, max_date))

        def count_on(day):
            """Return how many rows were aggregated under the given date."""
            key = self._datestr(day)
            return len(date_aggregation[key]) if key in date_aggregation else 0

        n_samples = len(date_range)
        x = [np.zeros((n_samples, 1)), np.zeros((n_samples, 12 + 31))]
        y = [np.zeros((n_samples, 2))]

        for i, date in enumerate(date_range):
            val = count_on(date)

            radius_vals = [
                count_on(date + self._timedelta(j))
                for j in range(-self.radius, self.radius + 1)
            ]
            variance = np.var(radius_vals)

            calendar_day = date.date()
            one_hot_date = np.zeros(12 + 31)
            # Months are 1-based: shift to 0..11 so December does not land on
            # index 12, which is the slot used by day 1 (11 + 1).
            one_hot_date[calendar_day.month - 1] = 1
            one_hot_date[11 + calendar_day.day] = 1

            x[0][i] = val
            x[1][i] = one_hot_date
            y[0][i] = [val, variance]

        return MLDataSet(x[0:1], x[1:], y, self.seq_size)
def test_dataset():
    """Load ``tests/test_data.txt`` into a DataSet.

    Each line is split on commas; the first two fields are parsed as
    integers and the third as a ``%Y-%m-%d`` date.

    :return: A DataSet built from the parsed rows.
    """
    records = []
    with open('tests/test_data.txt') as source:
        for raw in source:
            fields = raw.rstrip().split(',')
            received = datetime.strptime(fields[2], '%Y-%m-%d')
            records.append((int(fields[0]), int(fields[1]), received))

    return DataSet(records)
Ejemplo n.º 9
0
def _load_dataset_from_file(file_name: Text) -> DataSet:
    """
    Load a DataSet from a comma-separated text file.

    Each non-blank line must hold at least three fields: two integers
    followed by a date in ``%Y-%m-%d`` format. Blank lines (for example a
    trailing newline at the end of the file) are skipped, and whitespace
    around individual fields is ignored.

    :param file_name: Path of the file to read.
    :return: A DataSet built from the parsed rows.
    :raises ValueError: If a field cannot be parsed as an integer or a date.
    :raises IndexError: If a non-blank line has fewer than three fields.
    """
    data = []
    with open(file_name) as f:
        # Iterate the file lazily instead of materialising every line.
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line carries no record; do not crash on int('').
                continue
            fields = [field.strip() for field in line.split(',')]
            data.append((
                int(fields[0]),
                int(fields[1]),
                datetime.strptime(fields[2], '%Y-%m-%d'),
            ))

    return DataSet(data)
Ejemplo n.º 10
0
def cyclic(start_date: datetime, end_date: datetime, random_multiple=0) -> DataSet:
    """
    Generate a DataSet whose daily arrival count follows a rising yearly cycle.

    For the ``d``-th date (0-based) yielded by ``date_range(start_date,
    end_date)`` the number of arrivals is::

        max(round(20 + d/300 + 2*sin(2*pi*d/365) + random_multiple*U), 0)

    where ``U`` is a fresh ``random.random()`` draw. Every arrival becomes a
    row ``(1, get_random_severity(), date)``.

    :param start_date: First argument passed to ``date_range``.
    :param end_date: Second argument passed to ``date_range``.
    :param random_multiple: Scale of the random noise added to each day's
        arrival count; 0 disables the noise.
    :return: A DataSet holding one row per generated arrival.
    """
    def arrivals_on(d):
        """Return the non-negative, rounded arrival count for day offset d."""
        expected = (
            20 + d / 300 + 2 * math.sin(2 * math.pi * d / 365)
            + random_multiple * random.random()
        )
        return max(round(expected), 0)

    data = []
    for d, date in enumerate(date_range(start_date, end_date)):
        data.extend(
            (1, get_random_severity(), date) for _ in range(arrivals_on(d))
        )

    return DataSet(data)
def test_create_ml_dataset_radius_affects_length(test_dataset):
    """With radius 3 the first input of the resulting MLDataSet must have 2 rows.

    :param test_dataset: data used to build the DataSet (supplied by the
        caller, presumably a pytest fixture).
    """
    radius_model = RadiusVariance(seq_size=3, radius=3)
    wrapped = DataSet(test_dataset)

    result = radius_model.create_ml_dataset(wrapped)
    row_count = result.inputs[0].shape[0]

    assert row_count == 2
def test_create_ml_dataset_correct_length(test_dataset):
    """With radius 1 the first input of the resulting MLDataSet must have 8 rows.

    :param test_dataset: data used to build the DataSet (supplied by the
        caller, presumably a pytest fixture).
    """
    radius_model = RadiusVariance(seq_size=1, radius=1)
    wrapped = DataSet(test_dataset)

    result = radius_model.create_ml_dataset(wrapped)
    row_count = result.inputs[0].shape[0]

    assert row_count == 8
def test_dataset_index():
    """Indexing a DataSet returns its rows in construction order."""
    rows = [('a', 'b', 'c'), (1, 2, 3)]
    dataset = DataSet(rows)

    first_row, second_row = dataset[0], dataset[1]

    assert first_row[0] == 'a'
    assert second_row[0] == 1
def test_dataset_len():
    """``len()`` of a DataSet equals the number of rows it was built from."""
    rows = [('a', 'b', 'c') for _ in range(5)]
    dataset = DataSet(rows)

    size = len(dataset)

    assert size == 5