Ejemplo n.º 1
0
def load_data(
    input_dir=None,
    split='train',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the entities and targets tables for one data split.

    If ``input_dir`` is not provided, the tables are read from the default
    location ``s3://<bucket>/<split path>``, where the bucket and the
    per-split path are taken from the project config. If ``input_dir`` is
    provided, the tables are read from that directory and ``split`` is
    ignored.

    For feature development, only the train split should be used.

    Args:
        input_dir: location to load the tables from; ``None`` selects the
            configured S3 location for ``split``.
        split: name of the split to load; must be a key of
            ``config.data.splits`` when ``input_dir`` is None.

    Returns:
        The ``(X, y)`` pair loaded from the entities and targets table
        configs, in that order.

    Raises:
        ValueError: if ``input_dir`` is None and ``split`` has no configured
            path.
    """
    config = load_config()
    tables = config.data.tables
    entities_config = some(
        where(tables, name=config.data.entities_table_name))
    targets_config = some(
        where(tables, name=config.data.targets_table_name))

    if input_dir is None:
        split_path = config.data.splits.get(split)
        if split_path is None:
            # Without this check the location would silently become
            # 's3://<bucket>/None' and fail later with an unrelated error.
            raise ValueError(f'Unknown data split: {split!r}')
        input_dir = f's3://{config.data.s3_bucket}/{split_path}'

    X = load_table_from_config(input_dir, entities_config)
    y = load_table_from_config(input_dir, targets_config)

    return X, y
Ejemplo n.º 2
0
def buffer_logic(state):
    '''
    Buffer create/replace/reuse logic. The function name is not very good :(

       new_state    |   old_state   | same sources |     action
    ----------------|---------------|--------------|-----------------
        replace     |    replace    |    True      |  reuse buffer
        replace     |    replace    |    False     |  replace buffer
        replace     |   no-replace  |    True      |  create buffer (copy candidates)
        replace     |   no-replace  |    False     |  create buffer
       no-replace   |    replace    |    True      |  create buffer (copy candidates)
       no-replace   |    replace    |    False     |  create buffer
       no-replace   |   no-replace  |    True      |  reuse buffer
       no-replace   |   no-replace  |    False     |  create buffer

    A reusable buffer will be looked for, then a replacement buffer and as a
    last resort a new one will be created.

    Args:
        state (dict): The new state. Its 'container' and 'replace' keys are
        read here and the dict is updated in place: 'uid', 'buffer' and
        'sources' when an old state is reused/replaced, 'sources' and
        'buffer' when a new buffer is created.

    Returns:
        old_state (dict): In case a state was reused/replaced it is returned
        because it will be needed later on to compare it with the current
        state and determine whether the window should be resized/moved, etc...
        None is returned when a new buffer was created. A reused/replaced
        state is also removed from variables.states.
    '''
    # We are only interested in buffers which are in the same container.
    # That's where the interesting reuse/replace logic is at.
    # NOTE(review): `states` is iterated up to three times below; this
    # presumably relies on fn.where returning a re-iterable collection rather
    # than a one-shot iterator - confirm against the `fn` version in use.
    states = fn.where(variables.states, container=state['container'])

    # Predicate comparing a candidate old state against the new state.
    with_same_sources = partial(same_sources, state)

    # Reuse candidate: same sources and the same 'replace' flag as the new state.
    reusable_state = fn.first(fn.where(
        ifilter(with_same_sources, states),
        replace = state['replace']
    ))

    # Replace candidate: different sources and flagged with replace=True.
    replaceable_state = fn.first(fn.where(
        ifilter(lambda x: not with_same_sources(x), states),
        replace = True
    ))

    old_state = None

    if reusable_state:
        # Take over identity, buffer and sources of the old state unchanged.
        state.update(fn.project(reusable_state, ['uid', 'buffer', 'sources']))
        old_state = reusable_state
        variables.states.remove(reusable_state)

    elif replaceable_state:
        # Take over identity and buffer, but recompute the sources and
        # overwrite the buffer contents for the new state.
        state.update(fn.project(replaceable_state, ['uid', 'buffer']))
        state['sources'] = populated_candidates(state)
        set_buffer_contents(state['buffer'], aggregate_candidates(state))
        old_state = replaceable_state
        variables.states.remove(replaceable_state)

    else:
        # New buffer: copy the sources of a state with the same sources when
        # one is found, otherwise populate them from scratch.
        same = find(with_same_sources, states)
        state['sources'] = (same and same['sources']) or populated_candidates(state)
        state['buffer'] = make_pyunite_buffer(state)

    return old_state
Ejemplo n.º 3
0
 def chose_ware_ids_with_requirement(wares, categories, moderation, stock):
     """Intersect the filter conditions over the existing wares.

     :param wares: list of wares
     :param categories: list of categories
     :param moderation: moderation states
     :param stock: stock states
     :return: result of intersection_lists over the per-condition ware_id selections
     """
     def ids_for(**condition):
         # ware_id values of the wares matching a single field=value condition
         return funky.pluck(funcy.where(wares, **condition), 'ware_id')

     by_category = [ids_for(managed_category=value) for value in categories]
     by_moderation = [ids_for(moderation_state=value) for value in moderation]
     by_stock = [ids_for(stock_state=value) for value in stock]
     return intersection_lists(by_category + by_moderation + by_stock)
Ejemplo n.º 4
0
 def get_static_user(user_id):
     """Get the data of a static production user by identifier.

     :param user_id: user identifier
     :return: entries of MainClass.STATIC_USERS whose user_id matches
     """
     from funcy import where
     matching_users = where(MainClass.STATIC_USERS, user_id=user_id)
     return matching_users
Ejemplo n.º 5
0
def validate_semantics_of_table(table_design):
    """Check for semantics that apply to tables in source schemas.

    Runs the checks shared with CTAS designs first, then rejects designs
    that list dependencies, that carry a natural_key or surrogate_key
    constraint, or whose split-by column is missing, skipped, nullable or
    of an unsupported type.

    :param table_design: table design dict; reads "name", "columns" and the
        optional "depends_on", "constraints" and "extract_settings" keys
    :raises TableDesignSemanticError: if any of the checks fails
    """
    validate_semantics_of_table_or_ctas(table_design)

    if "depends_on" in table_design:
        raise TableDesignSemanticError(
            "upstream table '%s' has dependencies listed" %
            table_design["name"])

    # Each constraint is a mapping keyed by its constraint type.
    for constraint in table_design.get("constraints", []):
        for constraint_type in constraint:
            if constraint_type in ("natural_key", "surrogate_key"):
                raise TableDesignSemanticError(
                    "upstream table '{}' has unexpected {} constraint".format(
                        table_design["name"], constraint_type))

    # The unpacking requires "split_by", when present, to hold exactly one name.
    [split_by_name] = table_design.get("extract_settings",
                                       {}).get("split_by", [None])
    if split_by_name:
        split_by_column = fy.first(
            fy.where(table_design["columns"], name=split_by_name))
        if split_by_column is None:
            # Report a design error instead of failing with AttributeError on
            # None when the named column is not part of the design.
            raise TableDesignSemanticError(
                "split-by column '{}' not found in columns of '{}'".format(
                    split_by_name, table_design["name"]))
        if split_by_column.get("skipped", False):
            raise TableDesignSemanticError(
                "split-by column must not be skipped")
        if not split_by_column.get("not_null", False):
            raise TableDesignSemanticError(
                "split-by column must have not-null constraint")
        if split_by_column["type"] not in ("int", "long", "date", "timestamp"):
            raise TableDesignSemanticError(
                "type of split-by column must be int, long, date or timestamp, not '{}'"
                .format(split_by_column["type"]))
Ejemplo n.º 6
0
    def get_tests(self, file_name, source_data):
        """Group the tests of one file by class.

        :param file_name: value matched against each entry's 'code_name_file'
        :param source_data: collection of test dicts
        :return: dict mapping file_name to a list of class descriptions, each
                 holding 'name', 'code_name' and 'tests'
        """
        file_tests = funcy.where(source_data, code_name_file=file_name)
        # Distinct (class code name, story) pairs found in this file.
        class_keys = set((test['code_name_class'], str(test['story'])) for test in file_tests)

        def describe(test):
            # Subset of the test fields that is exposed per test.
            return {"name": test['name'],
                    "code_name": test["code_name_class"],
                    "description": test["description"],
                    "skip": test["skip"],
                    "priority": test["priority"]}

        classes = []
        for class_code_name, story in class_keys:
            class_tests = [describe(test) for test in file_tests
                           if test['code_name_class'] == class_code_name]
            classes.append({"name": story.decode('utf-8'),
                            "code_name": class_code_name,
                            "tests": class_tests})
        return {file_name: classes}
Ejemplo n.º 7
0
 def get_static_user_by_role(role):
     """Get the data of a static production user by role.

     :param role: user role
     :return: entries of MainClass.STATIC_USERS whose role matches
     """
     from funcy import where
     matching_users = where(MainClass.STATIC_USERS, role=role)
     return matching_users
Ejemplo n.º 8
0
    def build_sqoop_partition_options(self, relation: RelationDescription,
                                      partition_key: Optional[str],
                                      table_size: int) -> List[str]:
        """
        Build the partitioning-related arguments for Sqoop.

        Returns "--split-by"/"--num-mappers" arguments when a partition key is
        given and more than one mapper is called for; otherwise the arguments
        for a single mapper.
        """
        # Used when there is no partition key, or when only one partition results.
        single_mapper = ["--num-mappers", "1"]
        if not partition_key:
            return single_mapper

        matching_columns = fy.where(relation.table_design["columns"], name=partition_key)
        column = fy.first(matching_columns)
        is_timestamp = column["sql_type"] in ("timestamp", "timestamp without time zone")
        if is_timestamp:
            # Timestamps are cast into a number for splitting.
            template = """CAST(TO_CHAR("{}", 'YYYYMMDDHH24MISS') AS BIGINT)"""
        else:
            template = '"{}"'
        quoted_key_arg = template.format(partition_key)

        # num_partitions explicitly set in the design file overrides the dynamic determination.
        num_mappers = (min(relation.num_partitions, self.max_partitions)
                       if relation.num_partitions
                       else self.maximize_partitions(table_size))

        if num_mappers > 1:
            return ["--split-by", quoted_key_arg, "--num-mappers", str(num_mappers)]
        return single_mapper
Ejemplo n.º 9
0
def load_data(input_dir=None):
    """Load the entities and targets tables from ``input_dir``.

    Loading without an explicit directory is not implemented and raises
    NotImplementedError.
    """
    if input_dir is None:
        raise NotImplementedError

    tables = config.get('data.tables')

    def load(table_name_key):
        # Resolve the table name from config, find its table config, load it.
        table_name = config.get(table_name_key)
        table_config = some(where(tables, name=table_name))
        return load_table_from_config(input_dir, table_config)

    X = load('data.entities_table_name')
    y = load('data.targets_table_name')
    return X, y
Ejemplo n.º 10
0
def load_data(input_dir=None):
    """Load the entities and targets tables.

    Without ``input_dir`` the train CSV files are read from the hard-coded
    census bucket URL; otherwise the configured tables are loaded from
    ``input_dir``.
    """
    if input_dir is None:
        root = 'https://mit-dai-ballet.s3.amazonaws.com/census'
        X = pd.read_csv(root + '/train/entities.csv.gz')
        y = pd.read_csv(root + '/train/targets.csv.gz')
        return X, y

    tables = config.get('data.tables')

    def load(table_name_key):
        # Resolve the table name from config, find its table config, load it.
        table_name = config.get(table_name_key)
        table_config = some(where(tables, name=table_name))
        return load_table_from_config(input_dir, table_config)

    X = load('data.entities_table_name')
    y = load('data.targets_table_name')
    return X, y
Ejemplo n.º 11
0
def load_data(input_dir=None):
    """Load the feature and target data.

    Without ``input_dir`` the tab-separated Ames housing file is read from
    its hard-coded URL and split into everything except "SalePrice" and the
    "SalePrice" column; otherwise the configured tables are loaded from
    ``input_dir``.
    """
    if input_dir is None:
        source = "https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt"
        df = pd.read_csv(source, sep="\t")
        X_df = df.drop("SalePrice", axis=1)
        y_df = df["SalePrice"]
        return X_df, y_df

    tables = conf.get("tables")

    def load(table_name_key):
        # Resolve the table name from the "data" config, find its table config, load it.
        table_name = conf.get("data", table_name_key)
        table_config = some(where(tables, name=table_name))
        return load_table_from_config(input_dir, table_config)

    X_df = load("entities_table_name")
    y_df = load("targets_table_name")
    return X_df, y_df
Ejemplo n.º 12
0
def load_data(split='train', input_dir=None):
    """Load the entities and targets tables.

    When ``input_dir`` is given, whatever dataset appears in that directory
    is loaded. Loading by ``split`` alone is not implemented and raises
    NotImplementedError.
    """
    if input_dir is None:
        raise NotImplementedError

    config = load_config()
    tables = config.data.tables

    entities_config = some(where(tables, name=config.data.entities_table_name))
    X = load_table_from_config(input_dir, entities_config)

    targets_config = some(where(tables, name=config.data.targets_table_name))
    y = load_table_from_config(input_dir, targets_config)

    return X, y
Ejemplo n.º 13
0
    def find_partition_key(self) -> Union[str, None]:
        """
        Return a valid partition key for this relation, or None.

        An explicit split_by column from the design's extract_settings takes
        precedence. Otherwise the key is the primary key, provided that:
        (1) the column is marked as a primary key
        (2) the table's primary key is a single column

        In both cases the column must have one of the generic types int, long,
        date or timestamp to be usable; otherwise a warning is logged and None
        is returned.
        """
        constraints = self.table_design.get("constraints", [])
        extract_settings = self.table_design.get("extract_settings", {})
        [partition_key] = extract_settings.get("split_by", [None])

        if not partition_key:
            primary_key_columns = [
                col for constraint in constraints
                for col in constraint.get("primary_key", [])
            ]
            # Only a primary key made of exactly one column qualifies.
            if len(primary_key_columns) == 1:
                partition_key = primary_key_columns[0]
            else:
                logger.debug(
                    "Found no single-column primary key for table '%s'",
                    self.identifier)

        if not partition_key:
            logger.debug("Found no partition key for table '%s'",
                         self.identifier)
            return None

        matching_columns = fy.where(self.table_design["columns"], name=partition_key)
        column = fy.first(matching_columns)

        # We check here the "generic" type which abstracts the SQL types like smallint, int4, etc.
        if column["type"] not in ("int", "long", "date", "timestamp"):
            logger.warning(
                "Column '%s' is not int, long, date or timestamp so is not usable as a partition key for '%s'",
                partition_key,
                self.identifier,
            )
            return None

        logger.debug("Partition key for table '%s' is '%s'",
                     self.identifier, partition_key)
        return partition_key
Ejemplo n.º 14
0
 def check_fav_ware_from_db(self, before_fav_wares, after_fav_wares):
     """Check that the favorite wares are identical before and after some action.

     :param before_fav_wares: list of dicts before the action
     :param after_fav_wares: list of dicts after the action
     """
     self.assertEqual(len(before_fav_wares), len(after_fav_wares), "Changed the number of favorite user.")
     for fav_before in before_fav_wares:
         # list(): where() may hand back a lazy iterator, which supports
         # neither len() nor indexing.
         fav_data = list(funcy.where(after_fav_wares, fav_ware_id=fav_before["fav_ware_id"]))
         # Check the count before indexing, so a missing record is reported
         # as an assertion failure rather than an IndexError.
         self.assertEqual(len(fav_data), 1)
         fav_elem = fav_data[0]
         self.assertEqual(fav_elem["fav_ware_id"], fav_before["fav_ware_id"])
         self.assertEqual(fav_elem["user_id"], fav_before["user_id"])
         self.assertEqual(fav_elem["creation_timestamp"], fav_before["creation_timestamp"])
Ejemplo n.º 15
0
 def check_wares(self, wares_cassandra, wares_worker):
     """Compare the ware lists from Cassandra and from Warehouse.

     Compares the total number of wares, then every ware individually.

     :param wares_cassandra: list of ware dicts (looked up by 'ware_id'); presumably the Cassandra data
     :param wares_worker: list of ware objects (read via .wareId); presumably the Warehouse worker data
     """
     service_log.put("Check lists from BD and Warehouse.")
     self.assertEqual(len(wares_worker), len(wares_cassandra), "The quantity of the wares does not match.")
     for ware_worker in wares_worker:
         service_log.put("Get ware in list: %s" % str(ware_worker))
         # list(): where() may hand back a lazy iterator, which supports
         # neither len() nor indexing.
         ware_cassandra = list(funcy.where(wares_cassandra, ware_id=ware_worker.wareId))
         # The "not found" check must come first: after an exact-count check
         # it could never fire and a missing ware would be misreported as a duplicate.
         self.assertNotEqual(len(ware_cassandra), 0, "Not found ware in data from worker Warehouse.")
         self.assertEqual(len(ware_cassandra), 1, "Found several ware with one id.")
         self.check_ware(ware_worker, ware_cassandra[0])
Ejemplo n.º 16
0
 def test_findUserDetails_for_exist_part_phone_for_all_users(self, iteration=None):
     """Test the findUserDetails method on an existing user.

     Takes an existing user and a part of their phone number, then selects
     all users whose phone number matches that part. The part is chosen at
     random, so the test is meant to be run for several iterations.

     :param iteration: not used by the test body
     """
     service_log.run(self)
     phone = self.user["phone"]
     # Check the precondition before slicing: with an empty phone,
     # random.randint(1, 0) would raise ValueError and hide this message.
     self.assertNotEqual(len(phone), 0, "Find user without phone!!!")
     part_phone = phone[:random.randint(1, len(phone))]
     service_log.put("Get part phone user's: %s" % part_phone)
     users_with_part_phone = databases.db1.accounting.get_users_by_part_phone(part_phone)
     result = services.accounting.root.tframed.findUserDetails(self.get_FindUserRequestDto(part_phone))
     service_log.put("Method findUserDetails returned result: %s" % result)
     self.assertEqual(len(result), len(users_with_part_phone), "Does not match number of detected users.")
     for index in result:
         # list(): where() may hand back a lazy iterator that cannot be indexed.
         matched = list(funcy.where(users_with_part_phone, id=index.userId))
         self.assertTrue(matched, "Not found user with id %s in data from DB." % index.userId)
         self.check_user(index, matched[0])
Ejemplo n.º 17
0
    def test_updateWares(self):
        """ Import of a pre-moderated ware.
        The ware passes basic validation, is moved to the specified state
        and is marked as moderated.
        """
        service_log.run(self)

        # save the initial data of the wares
        self.save_wares_data(self.list_wares)
        service_log.put("Save the data in a several product.")

        # pick a second set of wares (wares No. 2) at random

        self.list_wares2 = list()
        for index in range(self.count_wares):
            self.list_wares2.append(self.get_random_ware(self.wares))
        service_log.put("Get list2 with Ware: %s" % self.list_wares2)

        # Build a request that updates ware No. 1 with the values of ware No. 2 and send it to the service.
        # Wares are paired by position: list_wares[num] receives the category and content of list_wares2[num].
        wares_req = list()
        for num, index in enumerate(self.list_wares2):
            ware1 = self.list_wares[num]
            wares_req.append(self.req_update_ware(ware1["ware_id"], index['managed_category_id'], index["content"]))
        wares_warehouse = services.warehouse.root.tframed.updateWares(wares_req)
        service_log.put("Updated wares: %s" % str(wares_warehouse))

        for ware in wares_warehouse:
            # Fetch from the DB the just-updated ware No. 1 by its identifier
            ware_cassandra = databases.db1.warehouse.get_wares_by_ware_id(ware.wareId)
            service_log.put("Ware from BD: %s" % ware_cassandra)

            # check that exactly one ware was returned
            self.assertEqual(len(ware_cassandra), 1, "Found more than one item.")
            ware_cassandra = ware_cassandra[0]
            # deserialize and update the content
            self.update_data_content(ware_cassandra, self.deserialize_content(ware_cassandra['content']))

            # check that the ware identifiers stayed the same
            # NOTE(review): indexing the where() result assumes it returns a list - confirm for the funcy version in use
            ware_in_list_wares = funcy.where(self.list_wares, ware_id=ware.wareId)[0]
            self.assertEqual(ware.wareId, ware_in_list_wares["ware_id"], "Do not match the identifiers of the ware.")

            # compare the value returned by the service with the values from the DB
            self.check_ware(ware_worker=ware, ware_dbase=ware_cassandra)
Ejemplo n.º 18
0
    def get_suits(source_data):
        """ Group the tests by suite.
        :param source_data: list of dicts describing all tests.
        :return: list of folders, each with its title, code name and pages (files).
        """
        # distinct folder code names
        folder_codes = set(entry['code_name_folder'] for entry in source_data)
        suites = []
        for folder_code in folder_codes:
            folder_entries = funcy.where(source_data, code_name_folder=folder_code)
            # distinct (file code name, feature) pairs of this folder
            file_keys = set((entry['code_name_file'], entry['feature']) for entry in folder_entries)
            pages = [
                {"code_name": file_code.decode('utf-8'), "name": str(feature).decode('utf-8')}
                for file_code, feature in file_keys
            ]
            suites.append({"name": folder_entries[0]['folder_title'], "code_name": folder_code, "pages": pages})
        return suites
Ejemplo n.º 19
0
    def test_add_fav_users(self, limit_users=5):
        """ Check adding users to Favorites via the addFavorites method.
        :param limit_users: number of users to add to favorites
        """
        service_log.run(self)

        # The precondition snapshot must come from the same favorite-users
        # query that is checked after the call (it previously read the
        # favorite wares, which says nothing about favorite users).
        before_fav_users = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

        users = databases.db3.accounting.get_users(limit=limit_users)
        user_ids = [index["id"] for index in users]
        dto_list = self.generate_dto_list_equal_fav_user(self.user_id, user_ids)
        param = self.get_FavoritesAddRequest(dto_list)
        result = services.favorites.root.tframed.addFavorites(param)

        after_fav_users = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

        # The user is expected to have no favorite users before the call.
        self.assertIsNone(before_fav_users)
        self.assertEqual(len(result.dtoList), limit_users, "Does not match the number of elements.")
        for index in result.dtoList:
            fav_data = funcy.where(after_fav_users, fav_usr_id=index.content.favUserId)
            self.check_fav_user(user_id=self.user_id, data=index, fav_data=fav_data, fav_type=self.fav_type_user)
Ejemplo n.º 20
0
    def test_find_fav_users_for_user(self, limit_users=5):
        """ Check the selection of a user's favorite users.
        :param limit_users: number of users added to favorites for the check
        """
        service_log.run(self)

        # add some users to the favorites of the user under test
        candidates = databases.db3.accounting.get_users(limit=limit_users)
        candidate_ids = [candidate["id"] for candidate in candidates]
        add_request = self.get_FavoritesAddRequest(
            self.generate_dto_list_equal_fav_user(self.user_id, candidate_ids))
        services.favorites.root.tframed.addFavorites(add_request)

        # query the favorites through the service and read them from the DB
        find_request = self.get_UsersFavoritesRequest(user_id=self.user_id, fav_type=self.fav_type_user)
        found = services.favorites.root.tframed.findUsersFavoritesByParams(find_request)
        stored_favs = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

        stored_count = len(stored_favs)
        self.assertEqual(stored_count, len(found.dtoList), "Different length of lists.")
        self.assertEqual(stored_count, found.totalCount, "Wrong value totalCount.")

        for fav_dto in found.dtoList:
            stored_match = funcy.where(stored_favs, fav_usr_id=fav_dto.content.favUserId)
            self.check_fav_user(user_id=self.user_id, data=fav_dto, fav_data=stored_match,
                                fav_type=self.fav_type_user)
Ejemplo n.º 21
0
    def test_add_fav_wares(self, limit_wares=5):
        """ Check adding wares to Favorites via the addFavorites method.
        :param limit_wares: number of wares to add to favorites
        """
        service_log.run(self)

        favs_before = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

        # add some wares to the favorites of the user under test
        candidates = databases.db2.warehouse.get_wares_with_limit(limit=limit_wares)
        candidate_ids = [candidate["ware_id"] for candidate in candidates]
        add_request = self.get_FavoritesAddRequest(
            self.generate_dto_list_equal_fav_ware(self.user_id, candidate_ids))
        added = services.favorites.root.tframed.addFavorites(add_request)

        favs_after = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

        self.assertIsNone(favs_before)
        self.assertEqual(len(added.dtoList), limit_wares, "Does not match the number of elements.")
        for fav_dto in added.dtoList:
            stored_match = funcy.where(favs_after, fav_ware_id=fav_dto.content.favWareId)
            self.check_fav_ware(user_id=self.user_id, data=fav_dto, fav_data=stored_match,
                                fav_type=self.fav_type_ware)
            # TODO: 0 in one case, None in the other - bug: https://jira.oorraa.net/browse/RT-786
            self.assertEqual(fav_dto.content.favUserId, 0, "Is not None value favorite users.")
Ejemplo n.º 22
0
    def test_find_fav_wares_for_user(self, limit_wares=5):
        """ Check the selection of a user's favorite wares.
        :param limit_wares: number of wares added to favorites for the check
        """
        service_log.run(self)

        # add some wares to the favorites of the user under test
        candidates = databases.db2.warehouse.get_wares_with_limit(limit=limit_wares)
        candidate_ids = [candidate["ware_id"] for candidate in candidates]
        add_request = self.get_FavoritesAddRequest(
            self.generate_dto_list_equal_fav_ware(self.user_id, candidate_ids))
        services.favorites.root.tframed.addFavorites(add_request)

        # query the favorites through the service and read them from the DB
        find_request = self.get_UsersFavoritesRequest(user_id=self.user_id, fav_type=self.fav_type_ware)
        found = services.favorites.root.tframed.findUsersFavoritesByParams(find_request)
        stored_favs = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

        stored_count = len(stored_favs)
        self.assertEqual(stored_count, len(found.dtoList), "Different length of lists.")
        self.assertEqual(stored_count, found.totalCount, "Wrong value totalCount.")

        for fav_dto in found.dtoList:
            stored_match = funcy.where(stored_favs, fav_ware_id=fav_dto.content.favWareId)
            self.check_fav_ware(user_id=self.user_id, data=fav_dto, fav_data=stored_match,
                                fav_type=self.fav_type_ware)
            self.assertIsNone(fav_dto.content.favUserId, "Is not None value favorite users.")
Ejemplo n.º 23
0
def extract_errors(transcoder_job: dict):
    """Collect the 'StatusDetail' of every failed output and playlist of a job.

    Reads ``transcoder_job['Job']['Outputs']`` and ``['Playlists']`` and
    returns the 'StatusDetail' values of the entries whose 'Status' is
    'Error'.
    """
    from funcy import merge, where, lpluck
    job = transcoder_job['Job']
    outputs_and_playlists = merge(job['Outputs'], job['Playlists'])
    failed = where(outputs_and_playlists, Status='Error')
    return lpluck('StatusDetail', failed)
Ejemplo n.º 24
0
def snapshot_file(request, snap_id, format):
    """Redirect to the URL of the snapshot's file in the requested format.

    Responds with 404 when the snapshot does not exist or when it has no
    file in the requested format.
    """
    from django.http import Http404

    snap = get_object_or_404(Snapshot, pk=snap_id)
    f = first(where(snap.files, format=format))
    if f is None:
        # No file in this format: answer 404 instead of failing on None.url.
        raise Http404("Snapshot %s has no file in format %r" % (snap_id, format))
    return redirect(f.url)