Example #1
def import_dataset(_file, dataset):
    """
    Read a CSV from *_file* (a path or URL), build the dataset schema from
    its column dtypes, and save the rows as observations for *dataset*.
    """
    dframe = read_csv(_file)
    Dataset.build_schema(dataset, dframe.dtypes)
    Observation.save(dframe, dataset)
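The Dataset and Observation classes above are project-specific, but the import step itself is plain pandas. A minimal sketch, assuming only pandas and a hypothetical local CSV file, of what feeds the two calls above: the frame's inferred dtypes go to the schema, and the frame itself is what gets saved as observations.

from pandas import read_csv

# 'good_eats.csv' is a hypothetical file standing in for the _file argument
dframe = read_csv('good_eats.csv')
print(dframe.dtypes)   # what Dataset.build_schema receives as the schema source
print('%d rows would be saved as observations' % len(dframe))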
Example #2
 def test_delete(self):
     self._save_observations()
     records = [x for x in Observation.find(self.dataset)]
     self.assertNotEqual(records, [])
     Observation.delete(self.dataset)
     records = [x for x in Observation.find(self.dataset)]
     self.assertEqual(records, [])
Example #3
 def _save_records(self):
     records = Observation.save(self.test_data['good_eats.csv'],
             self.dataset)
     cursor = Observation.find(self.dataset)
     records = [x for x in cursor]
     self.assertTrue(isinstance(records, list))
     self.assertTrue(isinstance(records[0], dict))
     self.assertTrue('_id' in records[0].keys())
     return records
Example #4
 def test_find_as_df(self):
     self._save_observations()
     records = [x for x in Observation.find(self.dataset)]
     dframe = Observation.find(self.dataset, as_df=True)
     self.assertTrue(isinstance(dframe, DataFrame))
     self.assertEqual(self.test_data['good_eats.csv'].reindex(
                 columns=dframe.columns), dframe)
     columns = dframe.columns
     for key in MONGO_RESERVED_KEYS:
         self.assertFalse(prefix_reserved_key(key) in columns)
Example #5
    def DELETE(self, dataset_id):
        """
        Delete the observations (i.e. the dataset) with hash *dataset_id* from mongo.
        """
        dataset = Dataset.find_one(dataset_id)
        result = None

        if dataset:
            Dataset.delete(dataset_id)
            Observation.delete(dataset)
            result = {SUCCESS: 'deleted dataset: %s' % dataset_id}
        return dump_or_error(result, 'id not found')
Example #6
 def test_find_with_select_and_query(self):
     self._save_observations()
     cursor = Observation.find(self.dataset, '{"rating": "delectible"}',
             '{"rating": 1}')
     self.assertTrue(isinstance(cursor, Cursor))
     results = [row for row in cursor]
     self.assertEqual(sorted(results[0].keys()), ['_id', 'rating'])
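The *query* and *select* arguments here are JSON strings, which presumably map to a MongoDB filter and projection. A rough pymongo sketch of the equivalent lookup; the database and collection names are invented for illustration.

import json
from pymongo import MongoClient

query = json.loads('{"rating": "delectible"}')   # filter document
select = json.loads('{"rating": 1}')             # projection document

# 'bamboo' and 'observations' are hypothetical names for this sketch
collection = MongoClient()['bamboo']['observations']
for row in collection.find(query, select):
    print(row)   # only '_id' and 'rating' come back, as the test asserts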
Example #7
    def observe(self, *, processes=1, period=1):
        log.info("Computing observations for campaign")
        self.observations = [
            Observation(self.dataset, observer, self.population, self.config)
            for observer in self.observers
        ]

        for observation in self.observations:
            observation.observe(processes=processes, period=period)
Example #8
    def save(cls, dataset, formula, name, **kwargs):
        """
        Attempt to parse formula, then save formula, and add a task to calculate
        formula.
        """

        dframe = Observation.find(dataset, as_df=True)

        # attempt to get a row from the dataframe
        try:
            row = dframe.irow(0)
        except IndexError as err:
            row = {}
Example #9
def calculate_column(dataset, dframe, formula, name):
    """
    Calculate a new column: parse *formula* into a function, apply it to
    each row of *dframe*, and store the result as column *name* in the
    dataset the calculation refers to.
    """
    # parse formula into function and variables
    parser = Parser()
    func = parser.parse_formula(formula)

    new_column = dframe.apply(func, axis=1, args=(parser, ))
    new_column.name = name
    return Observation.update(dframe.join(new_column), dataset)
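Parser.parse_formula is specific to this project, but the apply-then-join pattern is standard pandas. A self-contained sketch with a hard-coded row function standing in for the parsed formula:

import pandas as pd

dframe = pd.DataFrame({'amount': [9.0, 2.0, 20.0], 'gps_alt': [1.0, 2.0, 3.0]})

def func(row):
    # stands in for the callable returned by parser.parse_formula(formula)
    return row['amount'] + row['gps_alt']

new_column = dframe.apply(func, axis=1)   # one computed value per row
new_column.name = 'amount_plus_gps_alt'
print(dframe.join(new_column))            # original columns plus the new one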
Example #10
 def setUp(self):
     TestBase.setUp(self)
     self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
     dframe = self.test_data['good_eats.csv']
     Dataset.build_schema(self.dataset, dframe.dtypes)
     Observation.save(dframe, self.dataset)
     self.calculations = [
         'rating',
         'gps',
         'amount + gps_alt',
         'amount - gps_alt',
         'amount + 5',
         'amount - gps_alt + 2.5',
         'amount * gps_alt',
         'amount / gps_alt',
         'amount * gps_alt / 2.5',
         'amount + gps_alt * gps_precision',
         '(amount + gps_alt) * gps_precision',
         'amount = 2',
         '10 < amount',
         '10 < amount + gps_alt',
         'not amount = 2',
         'not(amount = 2)',
         'amount = 2 and 10 < amount',
         'amount = 2 or 10 < amount',
         'not not amount = 2 or 10 < amount',
         'not amount = 2 or 10 < amount',
         '(not amount = 2) or 10 < amount',
         'not(amount = 2 or 10 < amount)',
         'amount ^ 3',
         '(amount + gps_alt) ^ 2 + 100',
         '-amount',
         '-amount < gps_alt - 100',
         'rating in ["delectible"]',
         'risk_factor in ["low_risk"]',
         'amount in ["9.0", "2.0", "20.0"]',
         '(risk_factor in ["low_risk"]) and (amount in ["9.0", "20.0"])',
     ]
     self.places = 5
Example #11
    def monitor_observations(self):

        last_run = None

        while True:

            # If there is no previous run, just run through anyway
            curtime = datetime.datetime.now()
            if last_run is not None:
                # throttle to one pass per minute (note: this spins rather
                # than sleeping between checks)
                if (curtime - last_run).total_seconds() < 60:
                    continue

            last_run = curtime

            # Prepare observation data for the previous minute
            minute_obs = (curtime - datetime.timedelta(minutes=1)).replace(second=0)
            minute_store = minute_obs + datetime.timedelta(minutes=1)

            wind_df = self.get_wind_df(minute=minute_obs)
            wind_direction = self.average_wind_direction(wind_df=wind_df)
            wind_speed = self.average_wind_speed(wind_df=wind_df)
            wind_gust = self.max_wind_gust(wind_df=wind_df)
            rain = self.current_rain(minute=minute_obs)

            obs = Observation(dt=minute_store,
                              wind_direction=wind_direction,
                              wind_speed=wind_speed,
                              wind_gust=wind_gust,
                              rain=rain)

            try:

                session = db.get_session()
                session.add(obs)
                session.commit()

            except Exception as e:

                print("Unable to add observation due to exception %s" % e)
Example #12
    def GET(self, dataset_id, mode=False, query='{}', select=None,
            group=ALL):
        """
        Return the data set with hash *dataset_id*.
        Execute *query* against mongo if passed.
        If summary mode is requested, return summary statistics for the data
        set; if *group* is passed, group the summary. *group* is ignored
        unless summary mode is requested.
        """
        dataset = Dataset.find_one(dataset_id)
        result = None

        try:
            if dataset:
                if mode == MODE_INFO:
                    result = Dataset.schema(dataset)
                elif mode == MODE_SUMMARY:
                    result = summarize(dataset, query, select, group)
                else:
                    return mongo_to_json(Observation.find(dataset, query,
                                select))
        except JSONError as e:
            result = {ERROR: e.__str__()}
Example #13
def summarize(dataset, query, select, group):
    """
    Return a summary for the rows/values filtered by *query* and *select*
    and grouped by *group* or the overall summary if no group is specified.
    """
    # narrow list of observations via query/select
    dframe = Observation.find(dataset, query, select, as_df=True)

    # do not allow group by numeric types
    # TODO check schema for valid groupby columns once included
    _type = dframe.dtypes.get(group)
    if group != ALL and (_type is None or _type.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}

    # check cached stats for group and update as necessary
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        stats = {ALL: summarize_df(dframe)} if group == ALL \
                else summarize_with_groups(dframe, stats, group)
        Dataset.update(dataset, {STATS: stats})
    stats_to_return = stats.get(group)

    return dict_from_mongo(stats_to_return if group == ALL else {group:
            stats_to_return})
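summarize_df and summarize_with_groups are project helpers, but the overall-versus-grouped branching is easy to mirror in plain pandas. In this sketch describe() stands in for those helpers; the column names follow the other examples and the values are toy data.

import pandas as pd

dframe = pd.DataFrame({'rating': ['delectible', 'epic_eat', 'delectible'],
                       'amount': [9.0, 2.0, 20.0]})

group = 'rating'   # set to None for the overall (ALL) summary
if group is None:
    stats = dframe.describe()                 # overall summary
else:
    stats = dframe.groupby(group).describe()  # one summary per group value
print(stats)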
Example #14
    def _test_calculator(self, delay=True):
        dframe = Observation.find(self.dataset, as_df=True)

        columns = dframe.columns.tolist()
        start_num_cols = len(columns)
        added_num_cols = 0

        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        label_list, slugified_key_list = [list(ary) for ary in
                zip(*column_labels_to_slugs.items())]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            if delay:
                task = calculate_column.delay(self.dataset, dframe,
                        formula, name)
                # test that task has completed
                self.assertTrue(task.ready())
                self.assertTrue(task.successful())
            else:
                task = calculate_column(self.dataset, dframe,
                        formula, name)

            column_labels_to_slugs = build_labels_to_slugs(self.dataset)

            unslug_name = name
            name = column_labels_to_slugs[unslug_name]

            # test that updated dataframe persisted
            dframe = Observation.find(self.dataset, as_df=True)
            self.assertTrue(name in dframe.columns)

            # test new number of columns
            added_num_cols += 1
            self.assertEqual(start_num_cols + added_num_cols,
                    len(dframe.columns.tolist()))

            # test that the schema is up to date
            dataset = Dataset.find_one(self.dataset[DATASET_ID])
            self.assertTrue(SCHEMA in dataset.keys())
            self.assertTrue(isinstance(dataset[SCHEMA], dict))
            schema = dataset[SCHEMA]

            # test slugified column names
            slugified_key_list.append(name)
            self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))

            # test column labels
            label_list.append(unslug_name)
            labels = [schema[col][LABEL] for col in schema.keys()]
            self.assertEqual(sorted(labels), sorted(label_list))

            # test result of calculation
            formula = column_labels_to_slugs[formula]

            for idx, row in dframe.iterrows():
                try:
                    result = np.float64(row[name])
                    stored = np.float64(row[formula])
                    # np.nan != np.nan, continue if we have two nan values
                    if np.isnan(result) and np.isnan(stored):
                        continue
                    msg = self._equal_msg(result, stored, formula)
                    self.assertAlmostEqual(result, stored, self.places, msg)
                except ValueError:
                    msg = self._equal_msg(row[name], row[formula], formula)
                    self.assertEqual(row[name], row[formula], msg)
Example #15
 def test_save_over_bulk(self):
     Observation.save(self.test_data['good_eats_large.csv'],
             self.dataset)
     cursor = Observation.find(self.dataset)
     records = [x for x in cursor]
     self.assertEqual(len(records), 1001)
Example #16
 def test_find(self):
     self._save_observations()
     cursor = Observation.find(self.dataset)
     self.assertTrue(isinstance(cursor, Cursor))
Example #17
 def test_find_with_query(self):
     self._save_observations()
     cursor = Observation.find(self.dataset, '{"rating": "delectible"}')
     self.assertTrue(isinstance(cursor, Cursor))
Example #18
 def _save_observations(self):
     return Observation.save(self.test_data['good_eats.csv'], self.dataset)
Example #19
 def _save_observations_and_calculation(self, formula=None):
     if not formula:
         formula = self.formula
     Observation.save(self.test_data['good_eats.csv'], self.dataset)
     return Calculation.save(self.dataset, formula, self.name)