Ejemplo n.º 1
0
    def generate_plans(self, matrix_set_definitions, feature_dictionaries):
        """Create build tasks and update the matrix definitions with UUIDs

        For each matrix set, every combination of label name, label type,
        state and feature dictionary yields one deep copy of the matrix set,
        annotated with a ``train_uuid`` and a list of ``test_uuids``. Each
        distinct UUID gets exactly one entry in the build tasks.

        :param matrix_set_definitions: the temporal information needed to generate each matrix;
            every entry is read through its ``train_matrix`` and ``test_matrices`` keys
        :param feature_dictionaries: combinations of features to include in matrices
        :type matrix_set_definitions: list
        :type feature_dictionaries: list

        :return: matrix set definitions (updated with matrix uuids) and build tasks
            keyed by matrix uuid
        :rtype: tuple (list, dict)
        """
        # Imported locally so this method does not depend on a module-level import.
        import logging

        updated_definitions = []
        build_tasks = dict()
        for matrix_set in matrix_set_definitions:
            train_matrix = matrix_set['train_matrix']
            for label_name, label_type, state, feature_dictionary in itertools.product(
                    self.label_names, self.label_types, self.states,
                    feature_dictionaries):
                # Work on a copy so the caller's definition is never mutated.
                matrix_set_clone = copy.deepcopy(matrix_set)
                # The uuid is derived from the metadata of the train matrix.
                train_metadata = self._make_metadata(
                    train_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    'train',
                )
                train_uuid = metta.generate_uuid(train_metadata)
                # Report through logging rather than printing to stdout.
                logging.info('Matrix UUID %s found for train metadata %s',
                             train_uuid, train_metadata)
                # Only the first occurrence of a uuid creates a build task.
                if train_uuid not in build_tasks:
                    build_tasks[train_uuid] = self._generate_build_task(
                        train_metadata, train_uuid, train_matrix,
                        feature_dictionary)
                matrix_set_clone['train_uuid'] = train_uuid

                test_uuids = []
                for test_matrix in matrix_set_clone['test_matrices']:
                    test_metadata = self._make_metadata(
                        test_matrix,
                        feature_dictionary,
                        label_name,
                        label_type,
                        state,
                        'test',
                    )
                    test_uuid = metta.generate_uuid(test_metadata)
                    if test_uuid not in build_tasks:
                        build_tasks[test_uuid] = self._generate_build_task(
                            test_metadata, test_uuid, test_matrix,
                            feature_dictionary)

                    # Recorded even when the build task already existed.
                    test_uuids.append(test_uuid)
                matrix_set_clone['test_uuids'] = test_uuids
                updated_definitions.append(matrix_set_clone)

        return updated_definitions, build_tasks
Ejemplo n.º 2
0
    def test_test_matrix(self):
        """Build a 'test' matrix and check the row count of the resulting CSV.

        Runs against a temporary Postgres instance and a temporary matrix
        directory, then expects ``<matrix uuid>.csv`` in that directory to
        contain exactly 6 CSV rows.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(
                    feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                    label_names=['booking'],
                    label_types=['binary'],
                    states=['state_one AND state_two'],
                    db_config=db_config,
                    matrix_directory=temp_dir,
                    user_metadata={},
                    engine=engine)

                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month'
                }
                # Named matrix_uuid to avoid shadowing the stdlib ``uuid`` module name.
                matrix_uuid = metta.generate_uuid(matrix_metadata)
                planner.build_matrix(as_of_times=dates,
                                     label_name='booking',
                                     label_type='binary',
                                     feature_dictionary=feature_dictionary,
                                     matrix_directory=temp_dir,
                                     matrix_metadata=matrix_metadata,
                                     matrix_uuid=matrix_uuid,
                                     matrix_type='test')
                matrix_filename = os.path.join(
                    temp_dir, '{}.csv'.format(matrix_uuid))

                with open(matrix_filename, 'r') as f:
                    reader = csv.reader(f)
                    # NOTE(review): 6 is presumably a header row plus data
                    # rows -- confirm against the fixtures.
                    assert len(list(reader)) == 6
Ejemplo n.º 3
0
    def test_train_matrix(self):
        """Build a 'train' matrix and check the row count of the resulting CSV.

        Uses a temporary Postgres instance and a temporary matrix directory;
        ``<matrix uuid>.csv`` in that directory must hold exactly 6 CSV rows.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # Engine bound to the throwaway database, then the schema fixtures.
            engine = create_engine(postgresql.url())
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            # First day of January, February and March 2016.
            as_of_times = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(beginning_of_time=datetime.datetime(2010, 1, 1, 0, 0),
                                  label_names=['booking'],
                                  label_types=['binary'],
                                  states=['state_one AND state_two'],
                                  db_config=db_config,
                                  matrix_directory=temp_dir,
                                  user_metadata={},
                                  engine=engine)

                features_by_table = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'beginning_of_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_window': '1 month'
                }
                matrix_uuid = metta.generate_uuid(metadata)

                planner.build_matrix(as_of_times=as_of_times,
                                     label_name='booking',
                                     label_type='binary',
                                     feature_dictionary=features_by_table,
                                     matrix_directory=temp_dir,
                                     matrix_metadata=metadata,
                                     matrix_uuid=matrix_uuid,
                                     matrix_type='train')

                csv_path = os.path.join(temp_dir, '{}.csv'.format(matrix_uuid))
                with open(csv_path, 'r') as handle:
                    row_count = sum(1 for _ in csv.reader(handle))
                assert row_count == 6
Ejemplo n.º 4
0
    def generate_plans(self, matrix_set_definitions, feature_dictionaries):
        """Create build tasks and update the matrix definitions with UUIDs

        One deep copy of each matrix set is produced per combination of label
        name, label type, state and feature dictionary. The copy receives a
        ``train_uuid`` and a ``test_uuids`` list, and a build task is created
        the first time each UUID is seen.

        :param matrix_set_definitions: the temporal information needed to generate each matrix;
            every entry is read through its ``train_matrix`` and ``test_matrices`` keys
        :param feature_dictionaries: combinations of features to include in matrices
        :type matrix_set_definitions: list
        :type feature_dictionaries: list

        :return: matrix set definitions (updated with matrix uuids) and build tasks
            keyed by matrix uuid
        :rtype: tuple (list, dict)
        """
        # Imported locally so this method does not depend on a module-level import.
        import logging

        updated_definitions = []
        build_tasks = dict()
        for matrix_set in matrix_set_definitions:
            train_matrix = matrix_set['train_matrix']
            for label_name, label_type, state, feature_dictionary in itertools.product(
                self.label_names,
                self.label_types,
                self.states,
                feature_dictionaries
            ):
                # Annotate a copy; the input definition stays untouched.
                matrix_set_clone = copy.deepcopy(matrix_set)
                # The uuid is derived from the metadata of the train matrix.
                train_metadata = self._make_metadata(
                    train_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    'train',
                )
                train_uuid = metta.generate_uuid(train_metadata)
                # Report through logging rather than printing to stdout.
                logging.info(
                    'Matrix UUID %s found for train metadata %s',
                    train_uuid,
                    train_metadata
                )
                # Identical uuids share a single build task.
                if train_uuid not in build_tasks:
                    build_tasks[train_uuid] = self._generate_build_task(
                        train_metadata,
                        train_uuid,
                        train_matrix,
                        feature_dictionary
                    )
                matrix_set_clone['train_uuid'] = train_uuid

                test_uuids = []
                for test_matrix in matrix_set_clone['test_matrices']:
                    test_metadata = self._make_metadata(
                        test_matrix,
                        feature_dictionary,
                        label_name,
                        label_type,
                        state,
                        'test',
                    )
                    test_uuid = metta.generate_uuid(test_metadata)
                    if test_uuid not in build_tasks:
                        build_tasks[test_uuid] = self._generate_build_task(
                            test_metadata,
                            test_uuid,
                            test_matrix,
                            feature_dictionary
                        )

                    # Recorded even when the build task already existed.
                    test_uuids.append(test_uuid)
                matrix_set_clone['test_uuids'] = test_uuids
                updated_definitions.append(matrix_set_clone)

        return updated_definitions, build_tasks
Ejemplo n.º 5
0
    def generate_plans(self, matrix_set_definitions, feature_dictionaries):
        """Plan every matrix build and tag the matrix definitions with UUIDs

        Each matrix set is expanded over all combinations of label name,
        label type, state and feature dictionary. Every combination yields a
        deep copy of the matrix set carrying ``train_uuid`` and ``test_uuids``;
        a build task is registered the first time a UUID is encountered.
        Progress is reported through ``logging.info``.

        :param matrix_set_definitions: the temporal information needed to generate each matrix
        :param feature_dictionaries: combinations of features to include in matrices
        :type matrix_set_definitions: list
        :type feature_dictionaries: list

        :return: matrix set definitions (updated with matrix uuids) and build tasks
        :rtype: tuple (list, dict)
        """
        planned_definitions = []
        tasks_by_uuid = {}
        for matrix_set in matrix_set_definitions:
            logging.info('Making plans for matrix set %s', matrix_set)
            logging.info(
                'Iterating over %s label names, %s label_types, %s states, %s feature dictionaries',
                len(self.label_names),
                len(self.label_types),
                len(self.states),
                len(feature_dictionaries),
            )
            train_matrix = matrix_set['train_matrix']
            combinations = itertools.product(
                self.label_names,
                self.label_types,
                self.states,
                feature_dictionaries,
            )
            for label_name, label_type, state, feature_dictionary in combinations:
                planned_set = copy.deepcopy(matrix_set)

                # Train matrix: derive its uuid and register it if unseen.
                train_metadata = self._make_metadata(
                    train_matrix, feature_dictionary, label_name, label_type,
                    state, 'train')
                train_uuid = metta.generate_uuid(train_metadata)
                logging.info(
                    'Matrix UUID %s found for train metadata %s',
                    train_uuid,
                    train_metadata,
                )
                if train_uuid in tasks_by_uuid:
                    logging.info(
                        'Train uuid %s already found in build tasks',
                        train_uuid,
                    )
                else:
                    tasks_by_uuid[train_uuid] = self._generate_build_task(
                        train_metadata,
                        train_uuid,
                        train_matrix,
                        feature_dictionary,
                    )
                    logging.info(
                        'Train uuid %s not found in build tasks yet, so added',
                        train_uuid,
                    )
                planned_set['train_uuid'] = train_uuid

                # Test matrices: same treatment, collecting uuids in order.
                collected_test_uuids = []
                for test_matrix in planned_set['test_matrices']:
                    test_metadata = self._make_metadata(
                        test_matrix, feature_dictionary, label_name,
                        label_type, state, 'test')
                    test_uuid = metta.generate_uuid(test_metadata)
                    logging.info(
                        'Matrix UUID %s found for test metadata %s',
                        test_uuid,
                        test_metadata,
                    )
                    if test_uuid in tasks_by_uuid:
                        logging.info(
                            'Test uuid %s already found in build tasks',
                            test_uuid,
                        )
                    else:
                        tasks_by_uuid[test_uuid] = self._generate_build_task(
                            test_metadata,
                            test_uuid,
                            test_matrix,
                            feature_dictionary,
                        )
                        logging.info(
                            'Test uuid %s not found in build tasks yet, so added',
                            test_uuid,
                        )
                    collected_test_uuids.append(test_uuid)

                planned_set['test_uuids'] = collected_test_uuids
                planned_definitions.append(planned_set)

        logging.info(
            'Planner is finished generating matrix plans. %s matrix definitions and %s unique build tasks found',
            len(planned_definitions),
            len(tasks_by_uuid),
        )
        return planned_definitions, tasks_by_uuid
Ejemplo n.º 6
0
    def test_nullcheck(self):
        """Expect ``build_matrix`` to raise ``ValueError`` for the ``*_pre`` feature fixtures.

        The feature tables are rebuilt from ``features0_pre`` and
        ``features1_pre``, de-duplicated on their first two columns and sorted
        by (second column, first column) before the schemas are created.
        """
        # Keying on the first two columns keeps only the last row per pair.
        f0_dict = {(r[0], r[1]): r for r in features0_pre}
        f1_dict = {(r[0], r[1]): r for r in features1_pre}

        features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
        features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))

        features_tables = [features0, features1]

        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(
                    feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                    label_names=['booking'],
                    label_types=['binary'],
                    states=['state_one AND state_two'],
                    db_config=db_config,
                    matrix_directory=temp_dir,
                    user_metadata={},
                    engine=engine)

                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month'
                }
                # Named matrix_uuid to avoid shadowing the stdlib ``uuid`` module name.
                matrix_uuid = metta.generate_uuid(matrix_metadata)
                # NOTE(review): presumably the *_pre fixtures contain null
                # feature values that trigger the error -- confirm.
                with self.assertRaises(ValueError):
                    planner.build_matrix(as_of_times=dates,
                                         label_name='booking',
                                         label_type='binary',
                                         feature_dictionary=feature_dictionary,
                                         matrix_directory=temp_dir,
                                         matrix_metadata=matrix_metadata,
                                         matrix_uuid=matrix_uuid,
                                         matrix_type='test')