Example 1
def scan_for_properties(pathname):

    properties = []

    if os.path.isdir(pathname):
        pathnames = scan_for_regular_files(pathname)

        for child_pathname in pathnames:
            properties += scan_for_properties(child_pathname)
    else:

        # See if we can open the file as a LUE dataset. If not, issue a
        # warning. If so, obtain the internal paths of properties.
        try:

            dataset = lue.open_dataset(pathname, lue.access_flag.ro)
            properties += [
                Property(pathname, property_pathname) for property_pathname in
                scan_phenomena_for_properties(dataset.phenomena)
            ]
            properties += [
                Property(pathname, property_pathname) for property_pathname in
                scan_universes_for_properties(dataset.universes)
            ]

        except RuntimeError:
            print("Skipping non-LUE file {}".format(pathname))

    return properties
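
A minimal usage sketch; the directory path is illustrative and not part of the original snippet:

    # Hypothetical call: collect properties from all LUE files below a directory
    for prop in scan_for_properties("/data/lue"):
        print(prop)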
Example 2
def describe_datasets(
        pathnames,
        indent=0):

    print_message(indent, "datasets")
    indent += 1

    for pathname in pathnames:
        dataset = lue.open_dataset(pathname)
        describe_dataset(dataset, indent)
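
A possible invocation, assuming the pathnames exist and print_message and describe_dataset are defined in the same module:

    # Hypothetical usage: describe two datasets, starting at indent level 0
    describe_datasets(["outlets2.lue", "exposure_1.lue"])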
Example 3
def get_nrrow_nrcol_west_south_north_east(hdf5file, phenomena_name):
    dataset = lue.open_dataset(hdf5file, "r")
    phenomenon = dataset.phenomena[phenomena_name]
    pset = phenomenon.property_sets["area"]

    nr_rows = pset["band_1"].space_discretization.values[:][0][0]
    nr_cols = pset["band_1"].space_discretization.values[:][0][1]

    # Indices 0..3 appear to hold (west, south, east, north)
    nl_west = pset.domain.space.items[:][0][0]
    nl_south = pset.domain.space.items[:][0][1]
    nl_east = pset.domain.space.items[:][0][2]
    nl_north = pset.domain.space.items[:][0][3]
    return nr_rows, nr_cols, nl_west, nl_south, nl_north, nl_east
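
The return value bundles the raster shape and the bounding box; example 6 below consumes it the same way. A usage sketch with illustrative file and phenomenon names:

    nr_rows, nr_cols, west, south, north, east = \
        get_nrrow_nrcol_west_south_north_east("exposure.lue", "exposure")
    cellsize = (east - west) / nr_cols  # as computed in example 6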
Example 4
    def assertDatasetIsValid(self, dataset):
        """
        Validate *dataset*
        """

        if isinstance(dataset, str):
            dataset_pathname = dataset
            self.assertTrue(os.path.exists(dataset_pathname))
            dataset = lue.open_dataset(dataset_pathname)

        try:
            lue.assert_is_valid(dataset, fail_on_warning=True)
        except RuntimeError as exception:
            self.fail("dataset {} is not valid\n{}".format(
                dataset.pathname, exception))
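
Usage within a test class that defines the helper above; the dataset name is illustrative:

    # Inside a unittest.TestCase that defines assertDatasetIsValid:
    self.assertDatasetIsValid("outlets2.lue")  # pathname variant
    self.assertDatasetIsValid(dataset)         # already opened dataset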
Example 5
def post_process_benchmarks(lue_pathname):

    lue_dataset = lue.open_dataset(lue_pathname)
    lue_benchmark = lue_dataset.phenomena["benchmark"]

    lue_meta_information = \
        lue_benchmark.collection_property_sets["meta_information"]
    lue_name = lue_meta_information.properties["name"]
    lue_system_name = lue_meta_information.properties["system_name"]

    benchmark_name = lue_name.value[:]
    assert len(benchmark_name) == 1
    benchmark_name = benchmark_name[0]

    time_point = "todo"

    system_name = lue_system_name.value[:]
    assert len(system_name) == 1
    system_name = system_name[0]

    lue_measurement = lue_benchmark.property_sets["measurement"]
    lue_nr_localities = lue_measurement.properties["nr_localities"]
    lue_nr_threads = lue_measurement.properties["nr_threads"]
    lue_work_size = lue_measurement.properties["work_size"]
    lue_duration = lue_measurement.properties["duration"]

    nr_localities = lue_nr_localities.value[:]
    nr_measurements = len(nr_localities)
    nr_threads = lue_nr_threads.value[:]
    assert len(nr_threads) == nr_measurements
    work_size = lue_work_size.value[:]
    assert len(work_size) == nr_measurements

    duration = lue_duration.value[:]
    assert len(duration) == nr_measurements
    nr_durations = len(duration[0])

    # Set up data frames
    # The (default) index is the index of the benchmark
    environment = pd.DataFrame({
        "nr_localities": nr_localities,
        "nr_threads": nr_threads,
        "work_size": work_size,
    })

    # Per benchmark a series. Each series contains all duration measurements.
    # These series are concatenated in one long series containing the
    # durations for all benchmarks. The index contains the index of
    # the benchmark.
    durations = [
        pd.Series(duration[b], index=nr_durations * [b])
        for b in range(nr_measurements)
    ]
    durations = pd.DataFrame({"duration": pd.concat(durations)})


    nr_equal_work_sizes = \
        (environment["work_size"] == environment["work_size"][0]).sum()
    constant_work_size = nr_equal_work_sizes == nr_measurements

    if constant_work_size:
        post_process_strong_scaling_benchmarks(benchmark_name, time_point,
                                               system_name, environment,
                                               durations)
    else:
        post_process_weak_scaling_benchmarks(benchmark_name, time_point,
                                             system_name, environment,
                                             durations)
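
The dispatch between strong and weak scaling hinges on whether work_size is constant across all measurements. A tiny, self-contained illustration of that check (the values are made up):

    import pandas as pd

    env = pd.DataFrame({"work_size": [1000, 1000, 1000]})
    nr_equal = (env["work_size"] == env["work_size"][0]).sum()
    print(nr_equal == len(env))  # True -> constant work size -> strong scaling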
Example 6
    def initial(self):
        self.startmap = scalar(0)
        self.startmap_home = scalar(0)

        self.lue_name = os.path.join(
            "LUE", "exposure_{0}.lue".format(str(self.currentSampleNumber())))
        self.currentDate = self.startDate
        #pcraster.setrandomseed(self.currentSampleNumber()*1000)

        #random.seed(self.currentSampleNumber()*1000)
        # Randomly sample working locations
        self.workdf = self.workdf.sample(
            frac=1,
            replace=True,
            random_state=self.currentSampleNumber() * 1000)

        self.work_realisation = os.path.join(str(self.currentSampleNumber()),
                                             "work_realisation.csv")
        self.workloc = os.path.join(str(self.currentSampleNumber()),
                                    "work_loc.map")  # for testing
        # Save each realisation
        self.workdf.to_csv(self.work_realisation, header=False)
        cmd = "col2map -S -s, --clone {0} -x 2 -y 3 -v 1 {1} {2} ".format(
            self.road_length_5000_file, self.work_realisation, self.workloc)
        subprocess.check_call(cmd, shell=True)
        # Get the extent from the HDF5 file

        nr_rows, nr_cols, nl_west, nl_south, nl_north, nl_east = get_nrrow_nrcol_west_south_north_east(
            self.hdf5file, self.phenomena_name)
        cellsize = (nl_east - nl_west) / nr_cols

        self.window_size_x = int(nr_rows)
        self.window_size_y = int(nr_cols)

        print "input dataset:", nl_west, nl_south, nl_north, nl_east, nr_rows, nr_cols, cellsize, self.window_size_x, self.window_size_y

        # Create the dataset if the LUE file does not exist yet
        if not os.path.isfile(self.lue_name):

            dataset = lue.create_dataset(self.lue_name)
            # add phenomenon
            phenomenon_exposure = dataset.add_phenomenon(self.lue_phenomena)
            # add property sets
            ps_points = create_propertyset(phenomenon_exposure,
                                           self.lue_ps_points)
            ps_areas = create_propertyset(phenomenon_exposure,
                                          self.lue_ps_area)

            # Load properties; ids for the properties are necessary for now
            ids_front = ps_points.reserve(self.nr_locations)
            ids_area = ps_areas.reserve(self.nr_locations)
            # assign a unique id
            ids_front[:] = range(0, self.nr_locations)
            ids_area[:] = range(0, self.nr_locations)

            load_route_LUE(ps_areas, self.nr_locations, self.homedf,
                           self.workdf, self.window_size_x, self.window_size_y,
                           self.lue_p_area_route)
            # Load work and home locations into LUE
            load_home_work_LUE(ps_points, self.nr_locations, self.homedf,
                               self.workdf, self.lue_p_points_home,
                               self.lue_p_points_home_rowcol,
                               self.lue_p_points_work,
                               self.lue_p_points_work_rowcol, nl_west,
                               nl_north, cellsize)

            lue.assert_is_valid(dataset)

        # Open the LUE dataset for use
        dataset = lue.open_dataset(self.lue_name, "w")
        phenomenon = dataset.phenomena[self.lue_phenomena]
        self.route_set = phenomenon.property_sets[self.lue_ps_area]
        self.pslocations = phenomenon.property_sets[self.lue_ps_points]

        self.timestep = 1
        #self.exposure_Map = scalar(1)

        # self.array0= numpy.zeros((self.window_size_x * self.window_size_y,), dtype=numpy.float32)
        # self.clone_array_home = self.array0.reshape(self.window_size_x,self.window_size_y)
        # for i in range(1, self.nr_locations):
        #                 home_loc_row = int(self.pslocations[self.lue_p_points_home_rowcol].values[i][:][0])
        #                 home_loc_col = int(self.pslocations[self.lue_p_points_home_rowcol].values[i][:][1])
        #                # w_loc_row = self.pslocations[self.lue_p_points_work_rowcol].values[i][:][0]
        #                # w_loc_col = self.pslocations[self.lue_p_points_work_rowcol].values[i][:][1]
        #                # home_loc_row1 = self.pslocations[self.lue_p_points_home].values[i][:][0])
        #                # home_loc_col2 =  self.pslocations[self.lue_p_points_home].values[i][:][1])
        #                # print home_loc_row, home_loc_col, home_loc_row1, home_loc_col2, w_loc_col,w_loc_row
        #                 self.clone_array_home[ home_loc_row, home_loc_col ]=1
        # self.startcell_home = numpy2pcr(Boolean,  self.clone_array_home, 0.0) # for homemakers
        self.test_dest = scalar(0)
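
Stripped of the model specifics, the create-then-open pattern in initial() looks like this (the file name is illustrative; lue is assumed importable as in the other examples):

    import os
    import lue

    lue_name = "exposure_1.lue"  # illustrative
    if not os.path.isfile(lue_name):
        dataset = lue.create_dataset(lue_name)
        # ... add phenomenon, property sets and properties ...
        lue.assert_is_valid(dataset)
    dataset = lue.open_dataset(lue_name, "w")  # reopen for updating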
Example 7
    def test_case_study(self):

        # Time series as implemented here:
        # - Discharge at catchment outlets
        #     - Located at fixed points in space
        #     - Variable number of outlets per time cell
        #     - Presence of outlets is discretized within multiple time boxes

        # - Time domain contains time cells
        # - Space domain contains space points
        # - Property values are same_shape::constant_shape (shape of value is
        #     related to what is stored per cell)
        # - Property values are not discretized
        # - Per time cell the set of active objects is tracked
        # - Use this approach if the active set is variable within a
        #     time box
        #     - Con: additional storage required for tracking active sets,
        #         compared to Time series I
        #     - Pro: possible to let objects be 'born' and 'die' during
        #         iterative simulation

        dataset = lue.create_dataset("outlets2.lue")
        phenomenon = dataset.add_phenomenon("areas")

        # Assume we are simulating some temporal variable (discharge at
        # catchment outlets).
        # The existence of the objects is modelled using time cells,
        # which are discretized time boxes (daily time steps). Per cell we
        # can store which objects are active.
        # Property values are located in time at time cells.
        # Property values are located in space at stationary space points.

        # Time domain
        time_configuration = lue.TimeConfiguration(lue.TimeDomainItemType.cell)
        epoch = lue.Epoch(lue.Epoch.Kind.common_era, "2019-01-01",
                          lue.Calendar.gregorian)
        clock = lue.Clock(epoch, lue.Unit.day, 1)
        time_coordinate_datatype = lue.dtype.TickPeriodCount

        # Space domain
        space_configuration = lue.SpaceConfiguration(
            lue.Mobility.stationary, lue.SpaceDomainItemType.point)
        space_coordinate_datatype = numpy.dtype(numpy.float32)
        rank = 2

        # Property set
        outlet_points = phenomenon.add_property_set("outlets",
                                                    time_configuration, clock,
                                                    space_configuration,
                                                    space_coordinate_datatype,
                                                    rank)
        time_domain = outlet_points.time_domain
        space_domain = outlet_points.space_domain
        active_set_index = outlet_points.object_tracker.active_set_index
        active_object_id = outlet_points.object_tracker.active_object_id

        # Property
        discharge_datatype = numpy.dtype(numpy.float32)
        discharge = outlet_points.add_property(
            "discharge",
            dtype=discharge_datatype,
            shape=(1, ),
            value_variability=lue.ValueVariability.variable)

        nr_time_boxes = 5
        max_nr_objects = 100

        # Iterate over the time boxes
        for t in range(nr_time_boxes):

            # Store additional time box and count
            time_box = numpy.array([t, t + 1], dtype=time_coordinate_datatype)
            time_domain.value.expand(1)[-1] = time_box
            count = int(10 * random.random())
            time_domain.value.count.expand(1)[-1] = count

            # Iterate over the time cells within each time box
            for c in range(count):

                # Store IDs of objects in the active set
                object_index = active_object_id.nr_ids
                active_set_index.expand(1)[-1] = object_index
                nr_objects = int(random.random() * max_nr_objects)

                object_id = numpy.empty(nr_objects, dtype=lue.dtype.ID)
                lue.test.select_random_ids(object_id, max_nr_objects)
                active_object_id.expand(nr_objects)[object_index:] = object_id

                # Store property values of active objects
                discharge_values = \
                    numpy.arange(nr_objects, dtype=discharge_datatype)
                discharge.value.expand(nr_objects)[object_index:] = \
                    discharge_values

        lue.assert_is_valid(dataset)

        del dataset

        dataset = lue.open_dataset("outlets2.lue")
        phenomenon = dataset.phenomena["areas"]
        outlet_points = phenomenon.property_sets["outlets"]
        time_domain = outlet_points.time_domain
        clock = time_domain.clock

        self.assertEqual(clock.epoch.kind, lue.Epoch.Kind.common_era)
        self.assertEqual(clock.epoch.origin, "2019-01-01")
        self.assertEqual(clock.epoch.calendar, lue.Calendar.gregorian)
        self.assertEqual(clock.unit, lue.Unit.day)
        self.assertEqual(clock.nr_units, 1)
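
Once the test has written outlets2.lue, the stored object tracker can be inspected by reopening the dataset. A minimal read-back sketch, using only calls that appear above:

    dataset = lue.open_dataset("outlets2.lue")
    outlets = dataset.phenomena["areas"].property_sets["outlets"]
    # Total number of object IDs written across all time cells
    print(outlets.object_tracker.active_object_id.nr_ids)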
Example 8
def post_process_raw_results(
        lue_dataset_pathname,
        plot_pathname):
    """
    Create plots and tables from raw benchmark results
    """
    lue_dataset = lue.open_dataset(lue_dataset_pathname)
    lue_benchmark = lue_dataset.phenomena["benchmark"]
    lue_meta_information = \
        lue_benchmark.collection_property_sets["meta_information"]
    lue_measurement = lue_benchmark.property_sets["measurement"]

    meta_information = meta_information_dataframe(lue_meta_information)
    name = meta_information.name[0]
    system_name = meta_information.system_name[0]
    worker_type = meta_information.worker_type[0]
    nr_time_steps = meta_information.nr_time_steps[0]

    nr_arrays, rank = \
        lue_meta_information.properties["array_shape"].value.shape
    assert nr_arrays == 1
    nr_benchmarks, count = lue_measurement.properties["duration"].value.shape

    measurement = measurement_dataframe(lue_measurement)


    # The time point at which the experiment was performed is the epoch
    # of the time domain used to store the durations
    lue_clock = lue_measurement.time_domain.clock
    assert lue_clock.nr_units == 1
    time_point_units = lue_clock.unit

    lue_epoch = lue_clock.epoch
    assert lue_epoch.kind == lue.Epoch.Kind.common_era
    assert lue_epoch.calendar == lue.Calendar.gregorian
    time_point = dateutil.parser.isoparse(lue_epoch.origin)

    # String containing time point in local time zone and conventions
    # time_point = time_point.astimezone(tzlocal.get_localzone()).strftime("%c")
    time_point = time_point.strftime("%c")

    nr_workers = measurement["nr_workers"]
    duration_labels = ["duration_{}".format(i) for i in range(count)]

    # t1 = duration using one worker
    t1 = measurement.loc[nr_workers == 1].filter(items=duration_labels)
    # t1 = [t1["duration_{}".format(i)][0] for i in range(count)]
    t1 = [t1.iat[0, i] for i in range(count)]

    for i in range(count):
        # Best case: duration stays constant with increasing the number of
        # workers and amount of work (and keeping the amount of work /
        # worker constant)
        # 100% parallel code, but without parallelization overhead
        measurement["linear_duration_{}".format(i)] = \
            [t1[i] for b in range(nr_benchmarks)]

        # Worst case: duration scales with number of workers
        # 100% serial code, but without parallelization overhead
        measurement["serial_duration_{}".format(i)] = t1[i] * nr_workers

        ### # slow_down = tn / linear_duration
        ### measurement["relative_slow_down_{}".format(i)] = \
        ###     (measurement["duration_{}".format(i)] / \
        ###     measurement["linear_duration_{}".format(i)]) - 1
        ### measurement["linear_relative_slow_down_{}".format(i)] = \
        ###     (measurement["linear_duration_{}".format(i)] / \
        ###     measurement["linear_duration_{}".format(i)]) - 1
        ### measurement["serial_relative_slow_down_{}".format(i)] = \
        ###     (measurement["serial_duration_{}".format(i)] / \
        ###     measurement["linear_duration_{}".format(i)]) - 1

        # efficiency = 100% * t1 / tn
        measurement["efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["duration_{}".format(i)]
        measurement["linear_efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["linear_duration_{}".format(i)]
        measurement["serial_efficiency_{}".format(i)] = \
            100 * t1[i] / measurement["serial_duration_{}".format(i)]

        # lups = nr_time_steps * nr_elements / duration
        # In the case of weak scaling, the nr_elements increases with the
        # nr_workers. Ideally, LUPS increases linearly with the nr_workers.
        measurement["lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["duration_{}".format(i)]
        measurement["linear_lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["linear_duration_{}".format(i)]
        measurement["serial_lups_{}".format(i)] = \
            nr_time_steps * measurement["nr_elements"] / \
            measurement["serial_duration_{}".format(i)]


    # https://xkcd.com/color/rgb/
    serial_color = sns.xkcd_rgb["pale red"]
    linear_color = sns.xkcd_rgb["medium green"]
    actual_color = sns.xkcd_rgb["denim blue"]

    nr_plot_rows = 2
    nr_plot_cols = 2
    plot_width = 8  # Inches...
    plot_height = 6  # Inches...
    figure, axes = plt.subplots(
            nrows=nr_plot_rows, ncols=nr_plot_cols,
            figsize=(nr_plot_cols * plot_width, nr_plot_rows * plot_height),
            squeeze=False, sharex=False,
        )

    plot_row, plot_col = 0, 0

    # duration by nr_workers
    linear_duration = select_data_for_plot(
        measurement, "linear_duration", count)
    serial_duration = select_data_for_plot(
        measurement, "serial_duration", count)
    duration = select_data_for_plot(
        measurement, "duration", count)

    sns.lineplot(
        data=linear_duration, x="nr_workers", y="linear_duration",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_duration, x="nr_workers", y="serial_duration",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=duration, x="nr_workers", y="duration",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylabel(
        u"duration ({}) ± 95% ci (count={})".format(
            time_point_units, count))
    axes[plot_row, plot_col].yaxis.set_major_formatter(
        ticker.FuncFormatter(
            lambda y, pos: format_duration(y)))

    plot_row, plot_col = 0, 1

    linear_efficiency = select_data_for_plot(
        measurement, "linear_efficiency", count)
    serial_efficiency = select_data_for_plot(
        measurement, "serial_efficiency", count)
    efficiency = select_data_for_plot(
        measurement, "efficiency", count)

    sns.lineplot(
        data=linear_efficiency, x="nr_workers", y="linear_efficiency",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_efficiency, x="nr_workers", y="serial_efficiency",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=efficiency, x="nr_workers", y="efficiency",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylim(0, 110)
    axes[plot_row, plot_col].set_ylabel("efficiency (%)")

    plot_row, plot_col = 1, 0

    # lups by nr_workers
    linear_lups = select_data_for_plot(
        measurement, "linear_lups", count)
    serial_lups = select_data_for_plot(
        measurement, "serial_lups", count)
    lups = select_data_for_plot(
        measurement, "lups", count)

    sns.lineplot(
        data=linear_lups, x="nr_workers", y="linear_lups",
        ax=axes[plot_row, plot_col], color=linear_color)
    sns.lineplot(
        data=serial_lups, x="nr_workers", y="serial_lups",
        ax=axes[plot_row, plot_col], color=serial_color)
    sns.lineplot(
        data=lups, x="nr_workers", y="lups",
        ax=axes[plot_row, plot_col], color=actual_color)
    axes[plot_row, plot_col].set_ylabel("LUPS")

    plot_row, plot_col = 1, 1
    axes[plot_row, plot_col].axis("off")

    for plot_row in range(nr_plot_rows):
        for plot_col in range(nr_plot_cols):
            axes[plot_row, plot_col].xaxis.set_major_formatter(
                ticker.FuncFormatter(
                    lambda x, pos: format_nr_workers(x)))
            axes[plot_row, plot_col].set_xlabel(
                "workers ({})".format(worker_type))
            axes[plot_row, plot_col].grid()

    figure.legend(labels=["linear", "serial", "actual"])

    array_shape_per_worker = \
        lue_meta_information.properties["array_shape"].value[0]
    partition_shape = \
        lue_meta_information.properties["partition_shape"].value[0]

    figure.suptitle(
        "{}, {}, {}\n"
        "Weak scaling experiment on {} array per worker and {} partitions"
            .format(
                name,
                system_name,
                time_point,
                "x".join([str(extent) for extent in array_shape_per_worker]),
                "x".join([str(extent) for extent in partition_shape]),
            )
        )

    # plt.tight_layout()
    plt.savefig(plot_pathname)
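
A hypothetical driver call; both pathnames are illustrative, and the plot format follows from the file extension passed on to matplotlib's savefig:

    post_process_raw_results("benchmark.lue", "benchmark.pdf")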