def test_run_order(self):
        """
        Check the public ``run_order()`` method returns a well formed structure.

        Run order is either specified when a list is passed or resolved by examining dataset
        provenance but that is tested elsewhere. Here, only the shape is checked: the return
        value is a list and the elements within this list are a tree of lists and sets.
        """
        def is_run_item(item):
            """
            Recursively validate one node of the run order tree.

            Returns True when `item` is well formed. Raises AssertionError for a badly typed
            child and ValueError when `item` itself is neither a list nor a set.
            """
            if isinstance(item, list):
                # all items in a list must be a set. A set could hold just one model
                for child in item:
                    assert isinstance(child, set)
                    is_run_item(child)
            elif isinstance(item, set):
                for child in item:
                    # Look for a list *before* calling issubclass(): issubclass() raises
                    # TypeError when its first argument isn't a class, which would replace
                    # the intended assertion failure with an unrelated error.
                    if isinstance(child, list):
                        is_run_item(child)
                    else:
                        assert isinstance(child, type) and issubclass(child, ayeaye.Model)
            else:
                raise ValueError("Non list and not set item found")
            return True

        # the same models passed as a set (order to be resolved) and as a list (order given)
        for connect in [
                ayeaye.Connect(models={One, Two, Five, Six}),
                ayeaye.Connect(models=[One, Two, Five, Six])
        ]:
            run_order = connect.run_order()
            self.assertIsInstance(run_order, list)
            self.assertTrue(is_run_item(run_order))
Esempio n. 2
0
class FindLongestAnimalName(ayeaye.PartitionedModel):
    """
    Find the longest common name in a collection of CSV/TSV files. Model suggests to executor how
    to break the task into parallel sub-tasks.
    """

    animals = ayeaye.Connect(engine_url=[
        f"csv://{EXAMPLE_CSV_PATH}",
        f"tsv://{EXAMPLE_TSV_PATH}",
    ])
    animals_output = ayeaye.Connect(access=ayeaye.AccessMode.WRITE)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one entry is appended for each completed `find_longest_name` subtask
        self.common_names_max = []

    def build(self):
        """Intentionally empty."""
        pass

    def partition_plea(self):
        """
        Describe how finely this model could be split: a minimum of one partition, with both
        the maximum and the optimal set to the number of items in the `animals` dataset.
        """
        dataset_files_count = len(self.animals)
        return ayeaye.PartitionedModel.PartitionOption(
            minimum=1,
            maximum=dataset_files_count,
            optimal=dataset_files_count,
        )

    def partition_slice(self, slice_count):
        """
        Deal the datasets in `animals` round-robin into at most `slice_count` groups.

        Returns a list of `(method_name, kwargs)` tuples, one per non-empty group. Each targets
        `find_longest_name` and passes that group's engine_urls as `engine_set`.
        """
        target_method = "find_longest_name"  # this is the method subtasks should be running
        task_sets = defaultdict(list)
        for idx, dataset in enumerate(self.animals):
            task_sets[idx % slice_count].append(dataset.engine_url)

        return [(target_method, {"engine_set": engine_set}) for engine_set in task_sets.values()]

    def partition_subtask_complete(self, subtask_method_name, subtask_kwargs,
                                   subtask_return_value):
        """Collect the return value of each completed `find_longest_name` subtask."""
        if subtask_method_name == "find_longest_name":
            self.common_names_max.append(subtask_return_value)

    def partition_complete(self):
        """
        Write the longest of the collected subtask results to `animals_output`.

        When no subtask results were collected the empty string is written; this matches what
        `find_longest_name` returns when it has no rows and avoids max() raising ValueError
        on an empty sequence.
        """
        longest_animal_name = max(self.common_names_max, key=len, default="")
        self.animals_output.data = longest_animal_name

    def find_longest_name(self, engine_set):
        """
        Find the longest common name across the self.animals[engine_url] datasets listed in
        `engine_set` and return that string.

        Returns the empty string when there are no rows. On a tie the first name met wins.
        """
        common_names = (
            row.common_name
            for engine_url in engine_set
            for row in self.animals[engine_url]
        )
        return max(common_names, key=len, default="")
Esempio n. 3
0
class AustralianAnimals(ayeaye.Model):
    """
    Read each of the json:// datasets listed in a manifest and, for every input file, add just
    the Australian animals to a corresponding output file.

    This is a demonstration of :class:`AbstractManifestMapper`.
    """

    animals_manifest = ayeaye.Connect(engine_url="json://{input_path}/animals_manifest.json")
    animals_mapper = FileMapper(animals_manifest, "animal_files")

    menagerie = ayeaye.Connect(engine_url=animals_mapper.menagerie)

    australian_animals = ayeaye.Connect(
        engine_url=animals_mapper.oz_animals,
        access=ayeaye.AccessMode.WRITE,
    )

    def build(self):
        for file_mapping in self.animals_mapper:
            source = self.menagerie[file_mapping.menagerie]
            for creature in source.data.animals:
                if creature.where != "Australia":
                    continue

                # multiple output files with 'australian_' prefixed to input file name
                self.australian_animals[file_mapping.oz_animals].add(creature)

        self.log("All done!")
Esempio n. 4
0
class Film2Kafka(ayeaye.Model):
    """
    Pull a handful of fields out of an IMDB data file, JSON encode each record and send it to
    Kafka.
    """
    imdb_films = ayeaye.Connect(engine_url="tsv://title.basics.tsv")
    output_stream = ayeaye.Connect(
        engine_url="kafka://localhost/topic=imdb-films",
        access=ayeaye.AccessMode.WRITE,
    )

    def build(self):
        self.log("Adding films to Kafka")
        wanted_fields = ['tconst', 'primaryTitle', 'startYear', 'genres']
        for record in self.imdb_films:

            # Filter out anything that isn't a film: only records whose endYear is the
            # literal \N marker are kept
            if record.endYear != r'\N':
                continue

            self.output_stream.add(record.as_json(select_fields=wanted_fields))

            # occasionally tell the user how complete the processing is
            progress_msg = f"{self.output_stream.stats.added} films added."
            self.log_progress(self.imdb_films.progress, msg=progress_msg)

            if DEBUG and self.output_stream.stats.added >= 100000:
                self.log("Debug mode, finishing early.")
                break

        self.log(f"Complete! Added {self.output_stream.stats.added} films.")
    def test_model_iterator(self):
        """Iterating a models Connect yields the models; a list of models keeps its order."""
        # not ordered when a set of models is passed, so only the count is checked
        from_set = ayeaye.Connect(models={One, Two, Five, Six})
        self.assertEqual(4, len([model for model in from_set]))

        # ordered when a list
        from_list = ayeaye.Connect(models=[One, Two, Five, Six])
        self.assertEqual([One, Two, Five, Six], [model for model in from_list])
Esempio n. 6
0
        class AnimalsModel(ayeaye.Model):
            """
            Pair every common name from `animals_a` with every common name from its clone,
            `animals_b`, and write the joined names to `animals_output`.
            """
            animals_a = ayeaye.Connect(engine_url="csv://" + EXAMPLE_CSV_PATH)
            animals_b = animals_a.clone()
            animals_output = ayeaye.Connect(access=ayeaye.AccessMode.WRITE)

            def build(self):
                # animals_b is iterated afresh for each row of animals_a
                self.animals_output.data = [
                    f"{left.common_name}_{right.common_name}"
                    for left in self.animals_a
                    for right in self.animals_b
                ]
Esempio n. 7
0
class DistributedFakeWork(ayeaye.PartitionedModel):
    """
    Fan a pretend calculation out as subtasks and gather what they return.
    """

    # uses connector_resolver
    non_existant_data = ayeaye.Connect(engine_url="file://{hello_partitioned_context}")

    def build(self):
        pass

    def some_work(self, some_number):
        "append a test number to the file path of the fully resolved engine_url"
        return self.non_existant_data.file_path + str(some_number)

    def partition_slice(self, _):
        # ten subtasks, each calling `some_work` with a different number
        return [("some_work", {"some_number": number}) for number in range(10)]

    def partition_subtask_complete(self, subtask_method_name, subtask_kwargs,
                                   subtask_return_value):

        # the result list is created lazily, the first time any subtask completes
        if not hasattr(self, "resultset"):
            self.resultset = []

        if subtask_method_name != "some_work":
            return

        self.resultset.append(subtask_return_value)
Esempio n. 8
0
class FakeModel(ayeaye.Model):
    """
    Log the common name of every row in the `animals` dataset.

    The `{animal_type}` part of the engine_url is deliberately left as a literal placeholder
    (presumably filled in from context at runtime - confirm against the caller).
    """
    animals = ayeaye.Connect(engine_url=f"csv://{TEST_DATA_PATH}/{{animal_type}}.csv")

    def build(self):
        for animal in self.animals:
            self.log(animal.common_name)
 def test_single_standalone_model(self):
     """A lone model, not wrapped in a list or set, can be given to Connect."""
     connect = ayeaye.Connect(models=One)
     self.assertEqual(
         connect,
         connect.connect_instance,
         msg=(
             "Attribute access should be proxied through Connect to an instance of "
             "ModelsConnector which should refer back to the Connect instance that created it."
         ),
     )
     self.assertEqual(
         connect.models,
         [One],
         "Single model should be proxied through Connect.",
     )
Esempio n. 10
0
 def test_resolve_run_order_readwrite(self):
     """Models with WRITE and READWRITE access to one dataset: write-only is ordered first."""
     resolved = ayeaye.Connect(models={One, Two, Five, Six})._resolve_run_order()
     expected_order = [{'One'}, {'Two', 'Six'}, {'Five'}]
     ambiguity_note = (
         "There is an ambiguity because Six is WRITE and Five is READWRITE to the same "
         "dataset (f). The write only is happening first. Feels correct but might need "
         "more thought.")
     self.assertEqual(expected_order, self.repr_run_order(resolved.run_order), ambiguity_note)
Esempio n. 11
0
    def test_favourite_colours_pre_post_checks(self):
        """
        Exercise .pre_build_check() and .post_build_check(), first with the stock input data
        and then with a deliberately bad input file.
        """
        summary_url = f"json://{self.working_directory()}/favourite_colours_summary.json"

        model = FavouriteColours()
        model.log_to_stdout = False

        # give the connector a new output file
        model.favourites_summary.update(engine_url=summary_url)
        self.assertTrue(model.go(), "Pre, post and build should work for favourite_colours.csv")

        # now run it with bad data, capturing the log so the messages can be inspected
        model = FavouriteColours()
        model.log_to_stdout = False
        captured_log = StringIO()
        model.set_logger(captured_log)

        model.favourite_colours = ayeaye.Connect(
            engine_url='csv://data/favourite_colours_bad_data.csv',
        )
        model.favourites_summary.update(engine_url=summary_url)

        # There are two issues.
        # (i) multiple years aren't supported, this should be caught by the pre_build_check
        self.assertFalse(model.pre_build_check(), "favourite_colours_bad_data.csv should fail.")
        model.close_datasets()  # reset file pointers
        captured_log.seek(0)
        self.assertIn(
            'This model is only designed to work with data from a single year.',
            captured_log.read(),
        )

        # (ii) conservation of value - total number of days in the output should match the number of
        # days in the input. This fails because the algorithm in build assumes the data is good and
        # start date is before end date. The bad data file has one pair swapped.
        model.build()  # run it anyway
        model.close_datasets()  # reset file pointers
        self.assertFalse(model.post_build_check(), "favourite_colours_bad_data.csv should fail.")
        captured_log.seek(0)
        self.assertIn(
            "Total days in input doesn't match total days in output.",
            captured_log.read(),
        )
Esempio n. 12
0
class PoisonousAnimals(ayeaye.Model):
    """
    Super simple ETL.

    Using nothing but ordinary python data structures, group the names of the poisonous animals
    by the country where they are found and log one line per country.
    """
    poisonous_animals = ayeaye.Connect(engine_url='json://data/poisonous_animals.json')

    def build(self):
        names_by_country = {}
        for creature in self.poisonous_animals.data.animals:
            names_by_country.setdefault(creature.where, []).append(creature.name)

        # log the grouping so it can be seen
        for country, names in names_by_country.items():
            listing = ",".join(names)
            self.log(f"In {country} you could find {listing}")
Esempio n. 13
0
class FilmGenresSummary(ayeaye.Model):
    """
    Consume the extract of IMDB film data from Kafka, count the number of films within each
    genre and output that summary to a JSON document.
    """

    input_stream = Film2Kafka.output_stream.clone(access=ayeaye.AccessMode.READ)
    genre_summary = ayeaye.Connect(
        engine_url="json://films_summary.json",
        access=ayeaye.AccessMode.WRITE,
    )

    def build(self):
        self.log("Building a summary of films in the Kafka store")
        films_per_genre = defaultdict(int)

        films_processed = 0  # stays at zero when the stream is empty
        for films_processed, film in enumerate(self.input_stream, start=1):

            # the 'genres' field wasn't broken down into a list. Extract that here.
            for raw_genre in film.genres.split(','):
                # little bit of mapping - Null to word
                genre = 'Unknown' if raw_genre == r'\N' else raw_genre
                films_per_genre[genre] += 1

            # occasionally tell the user how complete the processing is
            # NOTE(review): "processes" looks like a typo for "processed"; left unchanged here
            # because it is a runtime log message.
            progress_msg = f"{films_processed} films processes."
            self.log_progress(self.input_stream.progress, msg=progress_msg)

        # Output log of the summary ...
        for genre_name, film_count in films_per_genre.items():
            self.log(f"{genre_name} : {film_count} films")

        # ... and output the summary as a dataset
        self.genre_summary.data = films_per_genre
        self.log(f"Summary written to {self.genre_summary.engine_url}")
        self.log(f"Complete! Processed {films_processed} films.")
Esempio n. 14
0
    def manifest_data(self):
        """
        Load the manifest dataset and return the content of its data attribute, or just the
        `field_name` entry of it when a field name has been set. Similar to
        :method:`manifest_items`.

        This is called on demand, after the global connector_resolver has been set up with any
        context needed to read the manifest dataset. That context wouldn't have been available
        during construction.
        """
        manifest_dataset = self.manifest_dataset_unresolved
        if isinstance(manifest_dataset, ayeaye.Connect):
            # .clone() is to prevent the .Connect being bound to the parent if it's connected
            manifest_dataset = manifest_dataset.clone()

        # create ephemeral dataset not tied to an ayeaye.Model
        resolved_url = ayeaye.connector_resolver.resolve(manifest_dataset.engine_url)
        self._manifest_dataset = ayeaye.Connect(engine_url=resolved_url)

        if self.field_name is not None:
            return self._manifest_dataset.data[self.field_name]
        return self._manifest_dataset.data
Esempio n. 15
0
    def test_resolve_run_order_linear(self):
        """
        The dependencies between datasets determine the order models are run in.
        """
        resolved = ayeaye.Connect(models={One, Two, Three})._resolve_run_order()

        source_urls = {conn.relayed_kwargs['engine_url'] for conn in resolved.leaf_sources}
        self.assertEqual({"csv://a"}, source_urls)

        target_urls = {conn.relayed_kwargs['engine_url'] for conn in resolved.leaf_targets}
        self.assertEqual({"csv://d"}, target_urls)

        msg = "Should be a single linear execution"
        self.assertIsInstance(resolved.run_order, list, msg)

        expected_order = [{'One'}, {'Two'}, {'Three'}]
        self.assertEqual(expected_order, self.repr_run_order(resolved.run_order), msg)
Esempio n. 16
0
    def __call__(self):
        """
        Read `field_name` from the manifest dataset and return it as engine_url(s): a list of
        urls when the manifest entry is a list, otherwise a single url string. Each url is
        prefixed with `engine_type`.

        When used according to the pattern above this will be called on demand by
        :class:`ayeaye.Connect` when a dataset is accessed so the global connector_resolver will
        have been set up with any context that is needed before reading the manifest_dataset.
        That context wouldn't have been available during construction, hence a callable.
        """
        manifest_dataset = self.manifest_dataset
        if isinstance(manifest_dataset, ayeaye.Connect):
            # .clone() is to prevent the .Connect being bound to the parent if it's connected
            manifest_dataset = manifest_dataset.clone()

        # create ephemeral dataset not tied to an ayeaye.Model
        resolved_url = ayeaye.connector_resolver.resolve(manifest_dataset.engine_url)
        ephemeral = ayeaye.Connect(engine_url=resolved_url)
        section = ephemeral.data[self.field_name]

        # single engine_urls connectors vs. :class:`connectors.multi_connector.MultiConnector`
        if not isinstance(section, list):
            return f"{self.engine_type}://{section}"

        return [f"{self.engine_type}://{entry}" for entry in section]
Esempio n. 17
0
    def manifest_items(self):
        """
        Generator yielding each item in the manifest dataset's target field, i.e. the items in
        manifest.data[self.field_name], or in manifest.data itself when no field name is set.

        This is called on demand, after the global connector_resolver has been set up with any
        context needed to read the manifest dataset. That context wouldn't have been available
        during construction.
        """
        manifest_dataset = self.manifest_dataset_unresolved
        if isinstance(manifest_dataset, ayeaye.Connect):
            # .clone() is to prevent the .Connect being bound to the parent if it's connected
            manifest_dataset = manifest_dataset.clone()

        # create ephemeral dataset not tied to an ayeaye.Model
        resolved_url = ayeaye.connector_resolver.resolve(manifest_dataset.engine_url)
        self._manifest_dataset = ayeaye.Connect(engine_url=resolved_url)

        if self.field_name is None:
            items = self._manifest_dataset.data
        else:
            items = self._manifest_dataset.data[self.field_name]

        yield from items
Esempio n. 18
0
class FavouriteColours(ayeaye.Model):
    """
    Each person has one favourite colour at a time.

    Aggregate these into a summary of number of colour days in each month. This is the number of
    days, in each month, that each colour in the input dataset was someone's favourite colour.

    So if one person liked the colour Blue from 2020-01-01 until 2020-02-15 I'd expect to see this
    in the output-

        "Blue": {
            "January": 31,
            "February": 14
        },

    This example model is to demonstrate a data validation within `post_build_check` that reveals
    a coding mistake. I wouldn't write code like this unless I'm demonstrating a coding mistake
    (honest). There is a unit test for this model but it misses the two mistakes as well.

    Data validation tests should complement rather than substitute unit tests. They are another way
    to spot mistakes that can be more intuitive.
    """
    favourite_colours = ayeaye.Connect(
        engine_url='csv://data/favourite_colours.csv')

    # output is in readwrite mode because post_build_check() reads it back in
    favourites_summary = ayeaye.Connect(
        engine_url='json://data/favourite_colours_summary.json;indent=4',
        access=ayeaye.AccessMode.READWRITE)

    # format of the 'start' and 'end' fields in the input, e.g. 2020-01-31
    date_format = "%Y-%m-%d"

    def pre_build_check(self):
        """
        Data validation example on input data.

        Checks that the 'start' and 'end' dates of every input record fall in the same year.
        Returns False, after logging an "ERROR" message naming both years, as soon as a second
        year is seen. Returns True otherwise.
        """
        error_message = (
            "This model is only designed to work with data from a single year. "
            "Both {} and {} have been found in the input dataset.")
        target_year = None
        for survey_record in self.favourite_colours:
            for check_field in ['start', 'end']:
                record_year = datetime.strptime(survey_record[check_field],
                                                self.date_format).year

                # the first year seen becomes the year all other dates must match
                if target_year is None:
                    target_year = record_year

                if target_year != record_year:
                    self.log(error_message.format(target_year, record_year),
                             "ERROR")
                    return False

        return True

    def build(self):
        """
        Add each survey record's days to per-colour, per-month totals and write the result to
        `favourites_summary`.

        NOTE: as the class docstring explains, the algorithm below contains deliberate mistakes
        that `post_build_check` is meant to expose. Don't "fix" it without also revisiting that
        demonstration.
        """

        # colour -> month name -> number of days
        by_colour = defaultdict(lambda: defaultdict(int))
        for survey_record in self.favourite_colours:

            start = datetime.strptime(survey_record.start, self.date_format)
            end = datetime.strptime(survey_record.end, self.date_format)
            date_delta = end - start

            # When end is before start this is negative, so the while loop below is never
            # entered and the record contributes nothing to the output.
            unaccounted_days = date_delta.days
            while unaccounted_days > 0:

                # Hand out days a month at a time, from the start month through the end month.
                # Each month is offered its full length (using the start date's year); the day
                # of the month that `start` falls on isn't taken into account.
                for month in range(start.month, end.month + 1):

                    month_name = calendar.month_name[month]
                    days_in_month = calendar.monthrange(start.year, month)[1]

                    if days_in_month < unaccounted_days:
                        # more days remain than fit in this month; fill it and carry on
                        unaccounted_days -= days_in_month
                        by_colour[
                            survey_record.colour][month_name] += days_in_month
                    else:
                        # everything that remains goes into this month
                        by_colour[survey_record.
                                  colour][month_name] += unaccounted_days
                        unaccounted_days = 0
                        break

        # write the results to a JSON file
        self.favourites_summary.data = by_colour

        self.log("Done!")

    def post_build_check(self):
        """
        Data validation example on output data: conservation of value.

        Compares the total number of days in the input records with the total number of days
        read back from `favourites_summary`. Returns False, after logging an "ERROR" message
        with both totals, when they differ. Returns True otherwise.
        """

        error_message = (
            "Total days in input doesn't match total days in output. Input has {} "
            "days and output has {} days.")

        input_days = 0
        for survey_record in self.favourite_colours:

            start = datetime.strptime(survey_record.start, self.date_format)
            end = datetime.strptime(survey_record.end, self.date_format)
            date_delta = end - start
            # abs() means a record with start and end swapped still counts its days here
            input_days += abs(date_delta.days)

        # the summary maps each colour to a mapping of month name -> days
        output_days = 0
        for month_days in self.favourites_summary.data.values():
            output_days += sum([days for days in month_days.values()])

        if input_days != output_days:
            self.log(error_message.format(input_days, output_days), "ERROR")
            return False

        return True
Esempio n. 19
0
class Two(ayeaye.Model):
    """Fixture model: a READ clone of One's `b` connection plus a WRITE connection to csv://c."""
    b = One.b.clone(access=ayeaye.AccessMode.READ)
    c = ayeaye.Connect(
        engine_url="csv://c",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 20
0
class Nine(ayeaye.Model):
    """
    Fixture model with two WRITE connections: `i`, whose engine_url is given by
    `another_find_destination` (defined outside this block), and `h` to csv://h.
    """
    i = ayeaye.Connect(
        engine_url=another_find_destination,
        access=ayeaye.AccessMode.WRITE,
    )
    h = ayeaye.Connect(
        engine_url="csv://h",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 21
0
class One(ayeaye.Model):
    """
    Fixture model: a connection to csv://a (no access mode given) plus a WRITE connection to
    csv://b.
    """
    a = ayeaye.Connect(engine_url="csv://a")
    b = ayeaye.Connect(
        engine_url="csv://b",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 22
0
class Eight(ayeaye.Model):
    """Fixture model: a READ clone of Seven's `g` connection plus a WRITE connection to csv://h."""
    g = Seven.g.clone(access=ayeaye.AccessMode.READ)
    h = ayeaye.Connect(
        engine_url="csv://h",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 23
0
class Seven(ayeaye.Model):
    """
    Fixture model: a READ clone of One's `b` connection plus a WRITE connection `g` whose
    engine_url is given by `find_destination` (defined outside this block).
    """
    b = One.b.clone(access=ayeaye.AccessMode.READ)
    g = ayeaye.Connect(
        engine_url=find_destination,
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 24
0
class Five(ayeaye.Model):
    """
    Fixture model: a READ clone of One's `b` connection plus a READWRITE connection to the
    sqlite database at /data/f.db.
    """
    b = One.b.clone(access=ayeaye.AccessMode.READ)
    f = ayeaye.Connect(
        engine_url="sqlite:////data/f.db",
        access=ayeaye.AccessMode.READWRITE,
    )
Esempio n. 25
0
 def test_resolve_run_order_one_branch(self):
     """One is expected to run first, followed by Two and Four together."""
     resolved = ayeaye.Connect(models={One, Two, Four})._resolve_run_order()
     expected_order = [{'One'}, {'Two', 'Four'}]
     self.assertEqual(expected_order, self.repr_run_order(resolved.run_order))
Esempio n. 26
0
 def test_resolve_with_two_different_callables(self):
     """One and Nine are expected to run together first, followed by Seven."""
     resolved = ayeaye.Connect(models={One, Nine, Seven})._resolve_run_order()
     expected_order = [{'One', 'Nine'}, {'Seven'}]
     self.assertEqual(expected_order, self.repr_run_order(resolved.run_order))
Esempio n. 27
0
class Three(ayeaye.Model):
    """Fixture model: a READ clone of Two's `c` connection plus a WRITE connection to csv://d."""
    c = Two.c.clone(access=ayeaye.AccessMode.READ)
    d = ayeaye.Connect(
        engine_url="csv://d",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 28
0
 def test_resolve_with_callable(self):
     "Seven uses a callable to build its engine_url at build time"
     resolved = ayeaye.Connect(models={One, Eight, Seven})._resolve_run_order()
     expected_order = [{'One'}, {'Seven'}, {'Eight'}]
     self.assertEqual(expected_order, self.repr_run_order(resolved.run_order))
Esempio n. 29
0
class Four(ayeaye.Model):
    """
    Fixture model: a READ connection to csv://b, declared directly rather than cloned from
    another model, plus a WRITE connection to csv://e.
    """
    b_copy_paste = ayeaye.Connect(
        engine_url="csv://b",
        access=ayeaye.AccessMode.READ,
    )
    e = ayeaye.Connect(
        engine_url="csv://e",
        access=ayeaye.AccessMode.WRITE,
    )
Esempio n. 30
0
class FakeModel(ayeaye.Model):
    """Log the common name of every row in the example CSV dataset."""
    animals = ayeaye.Connect(engine_url=f"csv://{EXAMPLE_CSV_PATH}")

    def build(self):
        for animal in self.animals:
            self.log(animal.common_name)