Beispiel #1
0
    def __parse_html(self, table):
        """
        Build a ``TableData`` from the rows of a parsed HTML table element.

        The first row that has ``th`` cells but no ``td`` cell supplies the
        headers. Every row that has at least one ``td`` cell becomes a data
        row made of the stripped text of all its ``td``/``th`` cells.

        :param table: Table element; it must provide ``find_all``.
        :return: Table data built from the extracted headers and rows.
        :raises ValueError: If no data row was found.
        """

        self.__parse_tag_id(table)

        cell_pattern = re.compile("td|th")
        headers = []
        records = []

        for tr in table.find_all("tr"):
            if not typepy.is_empty_sequence(tr.find_all("td")):
                records.append(
                    [cell.get_text().strip() for cell in tr.find_all(cell_pattern)]
                )
                continue

            # A row without td cells can only serve as the header row, and
            # only the first such row is used.
            if typepy.is_not_empty_sequence(headers):
                continue

            th_cells = tr.find_all("th")
            if typepy.is_empty_sequence(th_cells):
                continue

            headers = [th.text.strip() for th in th_cells]

        if typepy.is_empty_sequence(records):
            raise ValueError("data matrix is empty")

        self._loader.inc_table_count()

        return TableData(
            self._make_table_name(),
            headers,
            records,
            dp_extractor=self._loader.dp_extractor,
        )
Beispiel #2
0
    def asdict(self):
        """
        :return:
            A |dict| that maps the table name to a list of row dicts.
            Each row dict maps a header to the ``data`` of the data property
            extracted from the cell. ``None`` cells are left out, and rows
            that end up without any entry are skipped.
        :rtype: dict
        """

        extractor = dp.DataPropertyExtractor()
        extractor.strip_str = '"'
        extractor.float_type = float

        rows = []
        for cells in self.value_matrix:
            if typepy.is_empty_sequence(cells):
                continue

            record = {}
            for name, cell in zip(self.header_list, cells):
                if cell is None:
                    continue
                record[name] = extractor.to_dataproperty(cell).data

            if record:
                rows.append(record)

        return {self.table_name: rows}
Beispiel #3
0
def url(ctx, url, format_name, encoding, proxy):
    """
    Scrape tabular data from a URL and convert data to a SQLite database file.
    """

    # Without a URL there is nothing to convert.
    if typepy.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    options = ctx.obj
    log = make_logger("{:s} url".format(PROGRAM_NAME), options[Context.LOG_LEVEL])

    # Fall back to the values stored in the application configuration.
    if typepy.is_empty_sequence(encoding):
        encoding = app_config_manager.load().get(ConfigKey.DEFAULT_ENCODING)
        log.debug("use default encoding: {}".format(encoding))

    if typepy.is_null_string(proxy):
        proxy = app_config_manager.load().get(ConfigKey.PROXY_SERVER)

    database, created = create_database(
        options[Context.OUTPUT_PATH], options[Context.DUP_DATABASE]
    )
    converter_kwargs = dict(
        logger=log,
        con=database,
        symbol_replace_value=options[Context.SYMBOL_REPLACE_VALUE],
        index_list=options.get(Context.INDEX_LIST),
        verbosity_level=options.get(Context.VERBOSITY_LEVEL),
        format_name=format_name,
        encoding=encoding,
        proxy=proxy,
    )
    url_converter = UrlConverter(**converter_kwargs)
    url_converter.convert(url)

    exit_code = finalize(database, url_converter, created)
    sys.exit(exit_code)
def url(ctx, url, format_name, encoding, proxy):
    """
    Scrape tabular data from a URL and convert data to a SQLite database file.
    """

    # Without a URL there is nothing to convert.
    if typepy.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    options = ctx.obj
    log_level = options[Context.LOG_LEVEL]
    initialize_log_handler(log_level)
    log = make_logger("{:s} url".format(PROGRAM_NAME), log_level)

    # A configuration that fails to load with ValueError is not fatal:
    # the error is logged and no configured defaults are used.
    try:
        defaults = app_config_mgr.load()
    except ValueError as error:
        log.debug(msgfy.to_debug_message(error))
        defaults = {}

    if typepy.is_empty_sequence(encoding):
        encoding = defaults.get(ConfigKey.DEFAULT_ENCODING)
        log.debug("use default encoding: {}".format(encoding))

    if typepy.is_null_string(proxy):
        proxy = defaults.get(ConfigKey.PROXY_SERVER)

    conversion_rules = load_convert_config(
        log, options[Context.CONVERT_CONFIG], subcommand="url")

    database, created = create_database(
        options[Context.OUTPUT_PATH], options[Context.DUP_DATABASE])
    converter_kwargs = dict(
        logger=log,
        con=database,
        symbol_replace_value=options[Context.SYMBOL_REPLACE_VALUE],
        add_pri_key_name=options[Context.ADD_PRIMARY_KEY_NAME],
        convert_configs=conversion_rules,
        index_list=options.get(Context.INDEX_LIST),
        is_type_inference=options[Context.TYPE_INFERENCE],
        is_type_hint_header=options[Context.TYPE_HINT_HEADER],
        verbosity_level=options.get(Context.VERBOSITY_LEVEL),
        format_name=format_name,
        encoding=encoding,
        proxy=proxy,
    )
    url_converter = UrlConverter(**converter_kwargs)
    url_converter.convert(url)

    exit_code = finalize(database, url_converter, created)
    sys.exit(exit_code)
Beispiel #5
0
    def __create_table_from_tabledata(self, tabledata, index_attr_list=None):
        """
        Create a table from ``tabledata``, insert its rows and commit.

        :param tabledata:
            Object providing ``table_name``, ``header_list`` and
            ``value_matrix``.
        :param index_attr_list:
            Optional attribute names; when not empty, indices are created
            for their sanitized forms.
        :raises ValueError: If the value matrix of ``tabledata`` is empty.
        """

        self.validate_access_permission(["w", "a"])

        table_name = tabledata.table_name
        validate_table_name(table_name)

        debug_message = "__create_table_from_tabledata: table={}, headers={}".format(
            table_name, tabledata.header_list)
        logger.debug(debug_message)

        attr_names = self.__sanitize_attr_name_list(tabledata.header_list)
        try:
            self.__validate_attr_name_list(attr_names)
        except pathvalidate.ReservedNameError:
            # A reserved-name error is ignored here.
            pass

        rows = tabledata.value_matrix
        if typepy.is_empty_sequence(rows):
            raise ValueError("input data is null: '{} ({})'".format(
                table_name, ", ".join(attr_names)))

        self.__verify_value_matrix(attr_names, rows)

        attr_descriptions = self.__get_attr_desc_list(attr_names, rows)
        self.create_table(table_name, attr_descriptions)
        self.insert_many(table_name, rows)

        if typepy.is_not_empty_sequence(index_attr_list):
            index_names = self.__sanitize_attr_name_list(index_attr_list)
            self.create_index_list(table_name, index_names)

        self.commit()
    def _to_dp_list(self,
                    data_list,
                    type_hint=None,
                    strip_str=None,
                    strict_type_mapping=None):
        """
        Convert each item of ``data_list`` with ``__to_dp``.

        When ``type_hint`` is ``None``, the type class seen most often so far
        is offered as the hint for the next item, but only if the item passes
        that type's check at ``StrictLevel.MAX``.

        :param data_list: Values to convert.
        :param type_hint: Type hint applied to every item, if given.
        :param strip_str: Forwarded to ``__to_dp``.
        :param strict_type_mapping: Forwarded to ``__to_dp``.
        :return: Converted items in input order; ``[]`` for empty input.
        """
        from collections import Counter
        from typepy import StrictLevel

        if is_empty_sequence(data_list):
            return []

        seen_types = Counter()
        converted = []

        for item in data_list:
            hint = type_hint
            if hint is None:
                try:
                    # IndexError here means no type has been counted yet.
                    hint = seen_types.most_common(1)[0][0]
                    if not hint(item, strict_level=StrictLevel.MAX).is_type():
                        hint = None
                except IndexError:
                    pass

            prop = self.__to_dp(data=item,
                                type_hint=hint,
                                strip_str=strip_str,
                                strict_type_mapping=strict_type_mapping)
            seen_types[prop.type_class] += 1
            converted.append(prop)

        return converted
Beispiel #7
0
    def __strip_empty_col(self):
        """
        Drop the leading columns of ``self.__all_values`` that hold no value.

        The values are loaded into a temporary in-memory table with
        generated column names (``a0``, ``a1``, ...). Columns are scanned
        from the left until one contains a value that is not a null string;
        that column and everything to its right are kept.

        :raises ValueError: If no column contains a non-null-string value.
        """
        from simplesqlite import connect_memdb
        from simplesqlite.query import Attr, AttrList

        con = connect_memdb()

        tmp_table_name = "tmp"
        headers = ["a{:d}".format(i) for i in range(len(self.__all_values[0]))]
        con.create_table_from_data_matrix(tmp_table_name, headers,
                                          self.__all_values)

        for col_idx, header in enumerate(headers):
            result = con.select(select=Attr(header), table_name=tmp_table_name)
            if any(
                    typepy.is_not_null_string(record[0])
                    for record in result.fetchall()
            ):
                break
        else:
            # The loop finished without finding data. Falling through would
            # leave col_idx on the last column and silently keep it (or leave
            # col_idx unbound when there is no column at all).
            raise ValueError("all columns are empty")

        strip_headers = headers[col_idx:]

        result = con.select(select=AttrList(strip_headers),
                            table_name=tmp_table_name)
        self.__all_values = result.fetchall()
Beispiel #8
0
    def to_dp_list(self, values):
        """
        Return the ``_to_dp_list`` conversion of ``values``.

        The converter is refreshed through ``__update_dp_converter`` first and
        ``self.strip_str_value`` is passed as ``strip_str``. Empty input
        yields ``[]`` without touching the converter.
        """
        if not is_empty_sequence(values):
            self.__update_dp_converter()
            return self._to_dp_list(values, strip_str=self.strip_str_value)

        return []
Beispiel #9
0
    def _write_header(self):
        """
        Write every header into row ``self.first_header_row`` of the stream.

        Nothing is written when header output is disabled or when
        ``self.header_list`` is empty.
        """
        if not self.is_write_header:
            return
        if typepy.is_empty_sequence(self.header_list):
            return

        for col_idx, header in enumerate(self.header_list):
            self.stream.write(self.first_header_row, col_idx, header)
    def _preprocess_table_dp(self):
        """
        Compute the data-property matrix and column data properties once.

        Later calls return immediately. When no header is set and default
        headers are enabled, headers are generated with
        ``convert_idx_to_alphabet`` from the width of the first original row.
        A ``TypeError`` during extraction is logged at debug level and
        results in an empty matrix.
        """
        if self._is_complete_table_dp_preprocess:
            return

        self._logger.logger.debug("_preprocess_table_dp")

        needs_default_header = (
            typepy.is_empty_sequence(self.header_list) and self._use_default_header
        )
        if needs_default_header:
            column_count = len(self.__value_matrix_org[0])
            self.header_list = list(map(convert_idx_to_alphabet, range(column_count)))

        try:
            value_matrix = to_value_matrix(self.header_list, self.__value_matrix_org)
            self._table_value_dp_matrix = self._dp_extractor.to_dp_matrix(value_matrix)
        except TypeError as error:
            self._logger.logger.debug(msgfy.to_error_message(error))
            self._table_value_dp_matrix = []

        self._column_dp_list = self._dp_extractor.to_column_dp_list(
            self._table_value_dp_matrix, self._column_dp_list
        )

        self._is_complete_table_dp_preprocess = True
Beispiel #11
0
    def write_table(self):
        """
        |write_table| with
        `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__ format.
        Labels are passed through ``pathvalidate.sanitize_ltsv_label``.
        Items whose value is a null string are omitted, and rows left
        without any item are not written.

        :raises pytablewriter.EmptyHeaderError: If the |headers| is empty.
        :Example:
            :ref:`example-ltsv-table-writer`
        """

        with self._logger:
            self._verify_property()
            self._preprocess()

            for values in self._table_value_matrix:
                items = []
                for header_name, value in zip(self.headers, values):
                    if not typepy.is_not_null_string(value):
                        continue

                    label = pathvalidate.sanitize_ltsv_label(header_name)
                    items.append("{:s}:{}".format(label, value))

                if items:
                    self._write_line("\t".join(items))
Beispiel #12
0
    def write_table(self):
        """
        |write_table| with
        `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__ format.
        Labels are passed through ``pathvalidate.sanitize_ltsv_label``.
        Items whose value is a null string are omitted, and rows left
        without any item are not written.

        :raises pytablewriter.EmptyHeaderError: If the |header_list| is empty.
        :Example:
            :ref:`example-ltsv-table-writer`
        """

        with self._logger:
            self._verify_property()
            self._preprocess()

            for row_values in self._table_value_matrix:
                items = []
                for header_name, value in zip(self.header_list, row_values):
                    if not typepy.is_not_null_string(value):
                        continue

                    label = pathvalidate.sanitize_ltsv_label(header_name)
                    items.append("{:s}:{}".format(label, value))

                if items:
                    self._write_line("\t".join(items))
Beispiel #13
0
    def _write_row(self, row: int, values: Sequence[str]) -> None:
        """
        Write one table row with a delimiter around and between its cells.

        The delimiters come from ``__to_column_delimiter``: one for the left
        edge, one between each pair of adjacent columns, and one for the
        right edge. They are interleaved with ``values`` and written as a
        single line. Empty ``values`` writes nothing.

        :param row: Row identifier forwarded to ``__to_column_delimiter``.
        :param values: Cell strings of the row.
        """
        if typepy.is_empty_sequence(values):
            return

        column_dps = self._column_dp_list

        delimiters = [
            self.__to_column_delimiter(
                row, None, column_dps[0], self.char_left_side_row
            )
        ]
        for col_idx in range(len(column_dps) - 1):
            delimiters.append(
                self.__to_column_delimiter(
                    row,
                    column_dps[col_idx],
                    column_dps[col_idx + 1],
                    self.column_delimiter,
                )
            )
        delimiters.append(
            self.__to_column_delimiter(
                row, column_dps[-1], None, self.char_right_side_row
            )
        )

        # Even slots take the delimiters, odd slots take the cell values.
        interleaved = [""] * (len(delimiters) + len(values))
        interleaved[::2] = delimiters
        interleaved[1::2] = list(values)

        self._write_line("".join(chain.from_iterable(interleaved)))
Beispiel #14
0
    def is_empty_header(self):
        """
        :return: |True| if :py:attr:`.header_list` is an empty sequence.
        :rtype: bool
        """

        headers = self.header_list

        return typepy.is_empty_sequence(headers)
Beispiel #15
0
    def __create_table_from_tabledata(
            self, tabledata, index_attr_list=None):
        """
        Create a table from ``tabledata``, insert its rows and commit.

        :param tabledata:
            Object providing ``table_name``, ``header_list`` and
            ``value_matrix``.
        :param index_attr_list:
            Optional attribute names; when not empty, indices are created
            for their sanitized forms.
        :raises ValueError: If the value matrix of ``tabledata`` is empty.
        """

        self.validate_access_permission(["w", "a"])

        table_name = tabledata.table_name
        validate_table_name(table_name)

        debug_message = "__create_table_from_tabledata: table={}, headers={}".format(
            table_name, tabledata.header_list)
        logger.debug(debug_message)

        attr_names = self.__sanitize_attr_name_list(tabledata.header_list)
        try:
            self.__validate_attr_name_list(attr_names)
        except pathvalidate.ReservedNameError:
            # A reserved-name error is ignored here.
            pass

        rows = tabledata.value_matrix
        if typepy.is_empty_sequence(rows):
            raise ValueError("input data is null: '{} ({})'".format(
                table_name, ", ".join(attr_names)))

        self.__verify_value_matrix(attr_names, rows)

        attr_descriptions = self.__get_attr_desc_list(attr_names, rows)
        self.create_table(table_name, attr_descriptions)
        self.insert_many(table_name, rows)

        if typepy.is_not_empty_sequence(index_attr_list):
            index_names = self.__sanitize_attr_name_list(index_attr_list)
            self.create_index_list(table_name, index_names)

        self.commit()
    def _validate_empty_header(self):
        """
        Ensure that at least one header name is set.

        :raises pytablewriter.EmptyHeaderError: If the |header_list| is empty.
        """

        if not typepy.is_empty_sequence(self.header_list):
            return

        raise EmptyHeaderError("header_list expected to have one or more header names")
Beispiel #17
0
    def _to_data_matrix(self):
        """
        Parse the LTSV input stream into a list of ordered records.

        Each non-empty line is stripped and split on tabs. Every item is
        split into a label and a value at the first ``:``, so values may
        themselves contain colons (timestamps, URLs, ...). Double quotes
        around a label are removed before the label is validated.

        :return:
            A generator that yields a single list holding one
            ``OrderedDict`` (label -> value) per parsed line.
        :raises DataError: If an item contains no ``:`` separator.
        :raises InvalidHeaderNameError:
            If a label is empty or contains an invalid character.
        """
        from collections import OrderedDict

        data_matrix = []

        for row_idx, row in enumerate(self._ltsv_input_stream):
            if typepy.is_empty_sequence(row):
                continue

            ltsv_record = OrderedDict()
            for col_idx, ltsv_item in enumerate(row.strip().split("\t")):
                # Only the first ":" separates label and value; splitting on
                # every ":" would reject values that contain a colon.
                label, separator, value = ltsv_item.partition(":")
                if not separator:
                    raise DataError(
                        "invalid lstv item found: line={}, col={}, item='{}'".
                        format(row_idx, col_idx, ltsv_item))

                label = label.strip('"')

                try:
                    pv.validate_ltsv_label(label)
                except (pv.NullNameError, pv.InvalidCharError):
                    raise InvalidHeaderNameError(
                        "invalid label found (acceptable chars are [0-9A-Za-z_.-]): "
                        "line={}, col={}, label='{}'".format(
                            row_idx, col_idx, label))

                ltsv_record[label] = value

            data_matrix.append(ltsv_record)

        # using generator to prepare for future enhancement to support
        # iterative load.
        yield data_matrix
Beispiel #18
0
    def to_table_data(self):
        """
        Yield a single ``TableData`` built from the source data.

        When the loader has no header list, the first source row is used as
        the headers and the remaining rows as data; otherwise the loader's
        headers are used and every source row is data.

        :raises InvalidDataError:
            If the first row is used as headers and contains a null string,
            or if there is no data row.
        """
        if typepy.is_empty_sequence(self._loader.header_list):
            header_list = self._source_data[0]

            if any(typepy.is_null_string(header) for header in header_list):
                # The literals are joined implicitly, so each sentence needs
                # its own trailing space.
                raise InvalidDataError(
                    "the first line includes empty string item. "
                    "all of the items should contain header name. "
                    "actual={}".format(header_list))

            data_matrix = self._source_data[1:]
        else:
            header_list = self._loader.header_list
            data_matrix = self._source_data

        if not data_matrix:
            raise InvalidDataError(
                "data row must be greater or equal than one")

        self._loader.inc_table_count()

        yield TableData(self._loader.make_table_name(),
                        header_list,
                        data_matrix,
                        quoting_flags=self._loader.quoting_flags)
Beispiel #19
0
    def to_dp_list(self, values: Sequence) -> List[DataProperty]:
        """
        Return the ``_to_dp_list`` conversion of ``values``.

        The converter is refreshed through ``__update_dp_converter`` first.
        Empty input yields ``[]`` without touching the converter.
        """
        if not is_empty_sequence(values):
            self.__update_dp_converter()
            return self._to_dp_list(values)

        return []
Beispiel #20
0
    def write_table(self, **kwargs) -> None:
        """
        |write_table| with
        `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__ format.
        Labels are passed through ``pathvalidate.sanitize_ltsv_label``.
        Items whose value is a null string are omitted, and rows left
        without any item are not written.

        :Example:
            :ref:`example-ltsv-table-writer`
        """

        with self._logger:
            self._verify_property()
            self._preprocess()

            for values in self._table_value_matrix:
                items = []
                for header_name, value in zip(self.headers, values):
                    if not typepy.is_not_null_string(value):
                        continue

                    label = pathvalidate.sanitize_ltsv_label(header_name)
                    items.append(f"{label:s}:{value}")

                if items:
                    self._write_line("\t".join(items))
Beispiel #21
0
def file(ctx, files, format_name, encoding):
    """
    Convert tabular data within
    CSV/Excel/HTML/JSON/Jupyter Notebook/LDJSON/LTSV/Markdown/Mediawiki/SQLite/SSV/TSV
    file(s) to a SQLite database file.
    """

    # Without any file there is nothing to convert.
    if typepy.is_empty_sequence(files):
        sys.exit(ExitCode.NO_INPUT)

    options = ctx.obj
    log = make_logger("{:s} file".format(PROGRAM_NAME), options[Context.LOG_LEVEL])
    database, created = create_database(
        options[Context.OUTPUT_PATH], options[Context.DUP_DATABASE]
    )
    converter_kwargs = dict(
        logger=log,
        con=database,
        symbol_replace_value=options[Context.SYMBOL_REPLACE_VALUE],
        index_list=options.get(Context.INDEX_LIST),
        verbosity_level=options.get(Context.VERBOSITY_LEVEL),
        format_name=format_name,
        encoding=encoding,
    )
    file_converter = FileConverter(**converter_kwargs)

    for source_path in files:
        file_converter.convert(source_path)

    exit_code = finalize(database, file_converter, created)
    sys.exit(exit_code)
Beispiel #22
0
    def _verify_property(self):
        """
        Run the table name, stream, header and value matrix verifications.

        An ``EmptyValueError`` from the value matrix verification is ignored.

        :raises EmptyTableDataError:
            If the header list, the value matrix and the data-property
            matrix are all empty.
        """
        self._verify_table_name()
        self._verify_stream()

        # All three attributes are read before the decision is made.
        no_headers = typepy.is_empty_sequence(self.header_list)
        no_values = typepy.is_empty_sequence(self.value_matrix)
        no_value_dps = typepy.is_empty_sequence(self._table_value_dp_matrix)
        if no_headers and no_values and no_value_dps:
            raise EmptyTableDataError()

        self._verify_header()
        try:
            self._verify_value_matrix()
        except EmptyValueError:
            pass
Beispiel #23
0
    def _to_dp_list(self, data_list, type_hint=None, strip_str=None, strict_level_map=None):
        """
        Convert each item of ``data_list`` with ``__to_dp``.

        When ``type_hint`` is ``None``, the type class seen most often so far
        is offered as the hint for the next item, but only if the item passes
        that type's check at ``StrictLevel.MAX``.

        :param data_list: Values to convert.
        :param type_hint: Type hint applied to every item, if given.
        :param strip_str: Forwarded to ``__to_dp``.
        :param strict_level_map: Forwarded to ``__to_dp``.
        :return: Converted items in input order; ``[]`` for empty input.
        """
        if is_empty_sequence(data_list):
            return []

        seen_types = Counter()
        converted = []

        for item in data_list:
            hint = type_hint
            if hint is None:
                try:
                    # IndexError here means no type has been counted yet.
                    hint = seen_types.most_common(1)[0][0]
                    if not hint(item, strict_level=StrictLevel.MAX).is_type():
                        hint = None
                except IndexError:
                    pass

            prop = self.__to_dp(
                data=item,
                type_hint=hint,
                strip_str=strip_str,
                strict_level_map=strict_level_map,
            )
            seen_types[prop.type_class] += 1
            converted.append(prop)

        return converted
Beispiel #24
0
    def as_tuple(self):
        """
        Iterate over the rows of the table as ``Row`` named tuples.

        The fields of ``Row`` are the table headers. Empty rows are skipped.

        :return: Rows of the table.
        :rtype: list of |namedtuple|

        :Sample Code:
            .. code:: python

                from tabledata import TableData

                records = TableData(
                    "sample",
                    ["a", "b"],
                    [[1, 2], [3.3, 4.4]]
                ).as_tuple()
                for record in records:
                    print(record)

        :Output:
            .. code-block:: none

                Row(a=1, b=2)
                Row(a=Decimal('3.3'), b=Decimal('4.4'))
        """

        record_type = namedtuple("Row", self.headers)

        for cell_dps in self.value_dp_matrix:
            if not typepy.is_empty_sequence(cell_dps):
                yield record_type(*(cell_dp.data for cell_dp in cell_dps))
Beispiel #25
0
    def __strip_empty_col(self):
        """
        Drop the leading columns of ``self.__all_values`` that hold no value.

        The values are loaded into a temporary in-memory table with
        generated column names (``a0``, ``a1``, ...). Columns are scanned
        from the left until one contains a value that is not a null string;
        that column and everything to its right are kept.

        :raises ValueError: If no column contains a non-null-string value.
        """
        from simplesqlite import connect_sqlite_db_mem
        from simplesqlite.sqlquery import SqlQuery

        con = connect_sqlite_db_mem()

        tmp_table_name = "tmp"
        header_list = [
            "a{:d}".format(i) for i in range(len(self.__all_values[0]))
        ]
        con.create_table_from_data_matrix(table_name=tmp_table_name,
                                          attr_name_list=header_list,
                                          data_matrix=self.__all_values)

        for col_idx, header in enumerate(header_list):
            result = con.select(select=SqlQuery.to_attr_str(header),
                                table_name=tmp_table_name)
            if any(
                    typepy.is_not_null_string(record[0])
                    for record in result.fetchall()
            ):
                break
        else:
            # The loop finished without finding data. Falling through would
            # leave col_idx on the last column and silently keep it (or leave
            # col_idx unbound when there is no column at all).
            raise ValueError("all columns are empty")

        strip_header_list = header_list[col_idx:]

        result = con.select(
            select=",".join(SqlQuery.to_attr_str_list(strip_header_list)),
            table_name=tmp_table_name)
        self.__all_values = result.fetchall()
    def to_dp_list(self, value_list):
        """
        Return the ``_to_dp_list`` conversion of ``value_list``.

        The converter is refreshed through ``__update_dp_converter`` first and
        ``self.strip_str_value`` is passed as ``strip_str``. Empty input
        yields ``[]`` without touching the converter.
        """
        if not is_empty_sequence(value_list):
            self.__update_dp_converter()
            return self._to_dp_list(value_list, strip_str=self.strip_str_value)

        return []
Beispiel #27
0
    def _write_row(self, value_list):
        """
        Write one row: the values joined by the column delimiter, enclosed
        by the left-side and right-side row characters.

        Nothing is written for an empty ``value_list``.
        """
        if typepy.is_empty_sequence(value_list):
            return

        line = self.char_left_side_row + self.column_delimiter.join(value_list)
        line = line + self.char_right_side_row

        self._write_line(line)
Beispiel #28
0
    def is_empty_record(self):
        """
        :return: |True| if :py:attr:`.value_matrix` is an empty sequence.
        :rtype: bool
        """

        records = self.value_matrix

        return typepy.is_empty_sequence(records)
Beispiel #29
0
    def to_table_data(self):
        """
        Yield a single ``TableData`` built from the source data.

        When the loader has no headers, the first source row is used as the
        headers and the remaining rows as data; otherwise the loader's
        headers are used and every source row is data.

        :raises DataError:
            If the first row is used as headers and contains a null string,
            or if there is no data row.
        """
        if typepy.is_empty_sequence(self._loader.headers):
            headers = self._source_data[0]

            if any(typepy.is_null_string(header) for header in headers):
                # The literals are joined implicitly, so each sentence needs
                # its own trailing space.
                raise DataError("the first line includes empty string item. "
                                "all of the items should contain header name. "
                                "actual={}".format(headers))

            data_matrix = self._source_data[1:]
        else:
            headers = self._loader.headers
            data_matrix = self._source_data

        if not data_matrix:
            raise DataError("data row must be greater or equal than one")

        self._loader.inc_table_count()

        yield TableData(
            self._loader.make_table_name(),
            headers,
            data_matrix,
            dp_extractor=self._loader.dp_extractor,
            type_hints=self._extract_type_hints(headers),
        )
Beispiel #30
0
    def _validate_empty_header(self) -> None:
        """
        Ensure that at least one header name is set.

        Raises:
            ValueError: If the |headers| is empty.
        """

        if not typepy.is_empty_sequence(self.headers):
            return

        raise ValueError("headers expected to have one or more header names")
Beispiel #31
0
    def _write_header(self) -> None:
        """
        Write the table headers as the ``HEADER_ROW`` row.

        Does nothing when header output is disabled.

        Raises:
            ValueError: If header output is enabled but the headers are empty.
        """
        if self.is_write_header:
            if typepy.is_empty_sequence(self._table_headers):
                raise ValueError("header is empty")

            self._write_row(HEADER_ROW, self._table_headers)
Beispiel #32
0
    def _write_header(self):
        """
        Write the table header list as one row.

        Does nothing when header output is disabled.

        :raises EmptyHeaderError:
            If header output is enabled but the header list is empty.
        """
        if self.is_write_header:
            if typepy.is_empty_sequence(self._table_header_list):
                raise EmptyHeaderError("header is empty")

            self._write_row(self._table_header_list)
Beispiel #33
0
def url(ctx, url, format_name, encoding, proxy):
    """
    Scrape tabular data from a URL and convert data to a SQLite database file.
    """

    # Without a URL there is nothing to convert.
    if typepy.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    options = ctx.obj
    log_level = options[Context.LOG_LEVEL]
    initialize_log_handler(log_level)
    log = make_logger("{:s} url".format(PROGRAM_NAME), log_level)

    # A configuration that fails to load with ValueError is not fatal:
    # the error is logged and no configured defaults are used.
    try:
        defaults = app_config_mgr.load()
    except ValueError as error:
        log.debug(msgfy.to_debug_message(error))
        defaults = {}

    if typepy.is_empty_sequence(encoding):
        encoding = defaults.get(ConfigKey.DEFAULT_ENCODING)
        log.debug("use default encoding: {}".format(encoding))

    if typepy.is_null_string(proxy):
        proxy = defaults.get(ConfigKey.PROXY_SERVER)

    conversion_rules = load_convert_config(
        log, options[Context.CONVERT_CONFIG], subcommand="url"
    )

    database, created = create_database(
        options[Context.OUTPUT_PATH], options[Context.DUP_DATABASE]
    )
    converter_kwargs = dict(
        logger=log,
        con=database,
        symbol_replace_value=options[Context.SYMBOL_REPLACE_VALUE],
        add_pri_key_name=options[Context.ADD_PRIMARY_KEY_NAME],
        convert_configs=conversion_rules,
        index_list=options.get(Context.INDEX_LIST),
        is_type_inference=options[Context.TYPE_INFERENCE],
        is_type_hint_header=options[Context.TYPE_HINT_HEADER],
        verbosity_level=options.get(Context.VERBOSITY_LEVEL),
        format_name=format_name,
        encoding=encoding,
        proxy=proxy,
    )
    url_converter = UrlConverter(**converter_kwargs)
    url_converter.convert(url)

    exit_code = finalize(database, url_converter, created)
    sys.exit(exit_code)
Beispiel #34
0
def url(ctx, url, format_name, output_path, encoding, proxy):
    """
    Scrape tabular data from a URL and convert data to a SQLite database file.
    """

    # Without a URL there is nothing to convert.
    if typepy.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    database = create_database(ctx, output_path)
    verbosity = ctx.obj.get(Context.VERBOSITY_LEVEL)
    extractor = get_schema_extractor(database, verbosity)
    counter = ResultCounter()
    log = make_logger("{:s} url".format(PROGRAM_NAME),
                      ctx.obj[Context.LOG_LEVEL])

    if typepy.is_null_string(proxy):
        proxy = app_config_manager.load().get(ConfigKey.PROXY_SERVER)

    # The same proxy is used for both schemes.
    proxy_map = {scheme: proxy for scheme in ("http", "https")}

    try:
        table_loader = create_url_loader(
            log, url, format_name, encoding, proxy_map)
    except ptr.LoaderNotFoundError:
        # No loader for the requested format: retry with the HTML loader.
        try:
            table_loader = create_url_loader(
                log, url, "html", encoding, proxy_map)
        except ptr.LoaderNotFoundError as error:
            log.error(error)
            sys.exit(ExitCode.FAILED_LOADER_NOT_FOUND)

    try:
        for tabledata in table_loader.load():
            sanitized = ptr.SQLiteTableDataSanitizer(tabledata).sanitize()

            try:
                TableCreator(dst_con=database, tabledata=sanitized).create()
                counter.inc_success()
            except (ValueError) as error:
                # A table that cannot be created is counted as a failure and
                # the remaining tables are still processed.
                log.debug(u"url={}, message={}".format(url, str(error)))
                counter.inc_fail()
            else:
                schema_text = extractor.get_table_schema_text(
                    sanitized.table_name).strip()
                log.info(get_success_message(verbosity, url, schema_text))
    except ptr.InvalidDataError as error:
        log.error(u"invalid data: url={}, message={}".format(url, str(error)))
        counter.inc_fail()

    write_completion_message(log, output_path, counter)

    sys.exit(counter.get_return_code())
Beispiel #35
0
    def __to_record_list(self, record_list):
        """
        Convert every row of the matrix with ``__to_record``.

        The matrix is returned unchanged when no header list is set.
        """

        if not typepy.is_empty_sequence(self.header_list):
            return [self.__to_record(row) for row in record_list]

        return record_list
Beispiel #36
0
    def __init__(self, table, attrs):
        """
        Store the table name and attributes after validating both.

        :param table: Table name; checked with ``validate_table_name``.
        :param attrs: Attributes; must be a non-empty ``AttrList``.
        :raises TypeError: If ``attrs`` is not an ``AttrList`` instance.
        :raises ValueError: If ``attrs`` is empty.
        """
        validate_table_name(table)

        if not isinstance(attrs, AttrList):
            message = "attr must be a AttrList class instance: actual={}".format(type(attrs))
            raise TypeError(message)

        if typepy.is_empty_sequence(attrs):
            raise ValueError("empty attributes")

        self.__table, self.__attrs = table, attrs
Beispiel #37
0
def file(ctx, files, recursive, pattern, exclude, follow_symlinks, format_name, encoding):
    """
    Convert tabular data within
    CSV/Excel/HTML/JSON/Jupyter Notebook/LDJSON/LTSV/Markdown/Mediawiki/SQLite/SSV/TSV
    file(s) or named pipes to a SQLite database file.
    """

    # NOTE: the docstring above is left untouched on purpose; it is presumably
    # what ``ctx.get_help()`` shows to the user.
    settings = ctx.obj
    log_level = settings[Context.LOG_LEVEL]

    initialize_log_handler(log_level)
    logger = make_logger("{:s} file".format(PROGRAM_NAME), log_level)

    # Nothing to convert: show the usage text and stop.
    if typepy.is_empty_sequence(files):
        logger.error("require at least one file specification.\n\n{}".format(ctx.get_help()))
        sys.exit(ExitCode.NO_INPUT)

    convert_configs = load_convert_config(
        logger, settings[Context.CONVERT_CONFIG], subcommand="file"
    )

    con, is_create_db = create_database(
        settings[Context.OUTPUT_PATH], settings[Context.DUP_DATABASE]
    )
    converter = FileConverter(
        logger=logger,
        con=con,
        symbol_replace_value=settings[Context.SYMBOL_REPLACE_VALUE],
        add_pri_key_name=settings[Context.ADD_PRIMARY_KEY_NAME],
        convert_configs=convert_configs,
        index_list=settings.get(Context.INDEX_LIST),
        is_type_inference=settings[Context.TYPE_INFERENCE],
        is_type_hint_header=settings[Context.TYPE_HINT_HEADER],
        verbosity_level=settings.get(Context.VERBOSITY_LEVEL),
        format_name=format_name,
        encoding=encoding,
        exclude_pattern=exclude,
        follow_symlinks=follow_symlinks,
    )

    for file_path in files:
        target = path.Path(file_path)

        # Symlinked directories are only entered when explicitly requested.
        if not follow_symlinks and target.islink() and target.isdir():
            logger.debug(
                "skip symlink to a directory: {} -> {}".format(target, target.readlink())
            )
            continue

        # Anything that is not a directory in recursive mode is handed to the
        # converter directly, using the path exactly as given by the caller.
        if not (recursive and target.isdir()):
            converter.convert(file_path)
            continue

        for found_path in target.walkfiles(pattern):
            converter.convert(found_path)

    return_code = finalize(con, converter, is_create_db)
    sys.exit(return_code)
Beispiel #38
0
    def has_attr_list(self, table_name, attr_name_list):
        """
        :param str table_name: Table name that attributes exists.
        :param list attr_name_list: Attribute names to be tested.
        :return:
            |True| if the table has all of the attributes.
            |False| if ``attr_name_list`` is empty or at least one of the
            attributes does not exist in the table.
        :rtype: bool
        :raises simplesqlite.TableNotFoundError:
            |raises_verify_table_existence|

        :Sample Code:
            .. code:: python

                import simplesqlite

                table_name = "sample_table"
                con = simplesqlite.SimpleSQLite("sample.sqlite", "w")
                con.create_table_from_data_matrix(
                    table_name=table_name,
                    attr_name_list=["attr_a", "attr_b"],
                    data_matrix=[[1, "a"], [2, "b"]])

                print(con.has_attr_list(table_name, ["attr_a"]))
                print(con.has_attr_list(table_name, ["attr_a", "attr_b"]))
                print(con.has_attr_list(
                    table_name, ["attr_a", "attr_b", "not_existing"]))
                try:
                    print(con.has_attr_list("not_existing", ["attr_a"]))
                except simplesqlite.TableNotFoundError as e:
                    print(e)
        :Output:
            .. parsed-literal::

                True
                True
                False
                'not_existing' table not found in /tmp/sample.sqlite
        """

        if typepy.is_empty_sequence(attr_name_list):
            return False

        # all() stops at the first missing attribute, so no further lookups
        # are issued once the answer is already known to be False.
        return all(
            self.has_attr(table_name, attr_name) for attr_name in attr_name_list
        )
Beispiel #39
0
    def _write_header(self):
        """
        Write the header row to the stream using the header cell format.

        Does nothing when header writing is disabled or ``self.headers`` is
        empty. After the headers are written at ``first_header_row``, a row of
        empty strings (same width, same format) is written for every row index
        from ``first_header_row`` up to, but excluding, ``last_header_row``.
        """

        if not self.is_write_header:
            return

        if typepy.is_empty_sequence(self.headers):
            return

        # Fall back to the default format when no header format is registered.
        header_format = self.__add_format(
            self.format_table.get(self.TableFormat.HEADER, self.default_format)
        )

        self.stream.write_row(
            row=self.first_header_row, col=0, data=self.headers, cell_format=header_format
        )

        for row_idx in range(self.first_header_row, self.last_header_row):
            blank_cells = [""] * len(self.headers)
            self.stream.write_row(row=row_idx, col=0, data=blank_cells, cell_format=header_format)
Beispiel #40
0
    def _write_header(self):
        """
        Build a ``thead`` tag with one ``th`` per header and append it to
        ``self._table_tag``.

        Nothing is appended when header writing is disabled.

        :raises EmptyHeaderError:
            If header writing is enabled but ``self.headers`` is empty.
        """

        # Resolved first, before any early return, as the tag module is
        # obtained through a helper that is called unconditionally.
        tags = _get_tags_module()

        if not self.is_write_header:
            return

        if typepy.is_empty_sequence(self.headers):
            raise EmptyHeaderError("headers is empty")

        header_row = tags.tr()
        for header in self.headers:
            header_text = MultiByteStrDecoder(header).unicode_str
            header_row += tags.th(header_text)

        thead = tags.thead()
        thead += header_row

        self._table_tag += thead
Beispiel #41
0
    def insert_many(self, table_name, insert_record_list):
        """
        Send an INSERT query with multiple records to the database.

        Returns without executing a query when ``insert_record_list`` is
        empty.

        :param str table_name: Table name of executing the query.
        :param insert_record_list: Records to be inserted.
        :type insert_record_list: list of |dict|/|namedtuple|/|list|/|tuple|
        :raises IOError: |raises_write_permission|
        :raises simplesqlite.NullDatabaseConnectionError:
            |raises_check_connection|
        :raises simplesqlite.TableNotFoundError:
            |raises_verify_table_existence|
        :raises simplesqlite.OperationalError: |raises_operational_error|

        :Example:
            :ref:`example-insert-records`

        .. seealso:: :py:meth:`.sqlquery.SqlQuery.make_insert`
        """

        self.validate_access_permission(["w", "a"])
        self.verify_table_existence(table_name)

        record_count = len(insert_record_list) if insert_record_list else 0
        logger.debug("insert {} records".format(record_count))

        if typepy.is_empty_sequence(insert_record_list):
            return

        attr_name_list = self.get_attr_name_list(table_name)
        record_list = RecordConvertor.to_record_list(attr_name_list, insert_record_list)

        # The first record determines the number of placeholders in the query.
        query = SqlQuery.make_insert(table_name, record_list[0])

        try:
            self.connection.executemany(query, record_list)
        except sqlite3.OperationalError as e:
            file_path, line_no, func_name = logging.getLogger().findCaller()[:3]
            message_parts = [
                "{:s}({:d}) {:s}: failed to execute query:\n".format(
                    file_path, line_no, func_name),
                "  query={}\n".format(query),
                "  msg='{}'\n".format(str(e)),
                "  db={}\n".format(self.database_path),
                # Only the first two records are reported to keep the message short.
                "  records={}\n".format(record_list[:2]),
            ]
            raise OperationalError("".join(message_parts))
Beispiel #42
0
    def _normalize_headers(self):
        """
        Return the list of attribute (column) names to use for the table.

        - When the table data has no headers, one default header is generated
          per column of the first data row.
        - Otherwise the headers normalized by the parent class are passed
          through ``AttrList.sanitize`` and duplicated names are resolved.

        :return: Attribute names.
        :rtype: list
        :raises DataError:
            If the headers are empty and an ``IndexError`` occurs while
            generating default headers (e.g. there is no first row).
        :raises ValueError:
            If a name appears more than once and the duplicate column
            handler is ``"error"``.
        """

        if typepy.is_empty_sequence(self._tabledata.headers):
            try:
                # No headers given: derive one default name per column index
                # from the width of the first row.
                return [
                    self.__get_default_header(col_idx)
                    for col_idx in range(len(self._tabledata.rows[0]))
                ]
            except IndexError:
                raise DataError("header list and data body are empty")

        attr_name_list = AttrList.sanitize(
            super(SQLiteTableDataSanitizer, self)._normalize_headers()
        )

        # ReservedNameError is deliberately ignored here. Because the try
        # wraps the whole loop, validation stops at the first reserved name;
        # any other exception raised by the validator propagates.
        try:
            for attr_name in attr_name_list:
                validate_sqlite_attr_name(attr_name)
        except ReservedNameError:
            pass

        # duplicated attribute name handling ---
        # The occurrence counts are taken once, before any renaming below
        # modifies attr_name_list.
        for key, count in Counter(attr_name_list).most_common():
            if count <= 1:
                continue

            if self.__dup_col_handler == "error":
                raise ValueError("duplicate column name: {}".format(key))

            # rename duplicate headers: the first occurrence keeps its name,
            # every later occurrence gets a "<name>_<n>" replacement.
            rename_target_idx_list = [i for i, attr in enumerate(attr_name_list) if attr == key][1:]
            # The counter is shared by all occurrences of this key, so the
            # suffix keeps increasing from one renamed column to the next.
            suffix_count = 0
            for rename_target_idx in rename_target_idx_list:
                while True:
                    suffix_count += 1
                    attr_name_candidate = "{:s}_{:d}".format(key, suffix_count)
                    # Skip candidates that would collide with a name already
                    # present in the list (original or previously renamed).
                    if attr_name_candidate in attr_name_list:
                        continue

                    attr_name_list[rename_target_idx] = attr_name_candidate
                    break

        return attr_name_list
Beispiel #43
0
    def create_index_list(self, table_name, attr_name_list):
        """
        Create an index for every listed attribute that exists in the table.

        :param str table_name: Table name that exists attribute.
        :param list attr_name_list:
            List of attribute names to create indices.
            Ignore attributes that are not existing in the table.

        .. seealso:: :py:meth:`.create_index`
        """

        self.validate_access_permission(["w", "a"])

        if typepy.is_empty_sequence(attr_name_list):
            return

        existing_attrs = set(self.get_attr_name_list(table_name))
        requested_attrs = set(attr_name_list)

        # Only attributes present on both sides get an index.
        for attribute in existing_attrs & requested_attrs:
            self.create_index(table_name, attribute)
Beispiel #44
0
    def make_insert(cls, table, insert_tuple):
        """
        Build an INSERT query with one ``?`` placeholder per value.

        :param str table: Table name of executing the query.
        :param list/tuple insert_tuple: Insertion data.
        :return: Query of SQLite.
        :rtype: str
        :raises ValueError: If ``insert_tuple`` is empty |list|/|tuple|.
        :raises simplesqlite.InvalidTableNameError:
            |raises_validate_table_name|
        """

        validate_table_name(table)
        table_str = cls.to_table_str(table)

        if typepy.is_empty_sequence(insert_tuple):
            raise ValueError("empty insert list/tuple")

        # Only the number of values matters here; the values themselves are
        # not embedded in the query text.
        placeholders = ",".join("?" for _ in insert_tuple)

        return "INSERT INTO {:s} VALUES ({:s})".format(table_str, placeholders)
Beispiel #45
0
    def __validate_attr_name_list(attr_name_list):
        """
        Validate every name in ``attr_name_list`` as a SQLite attribute name.

        :raises InvalidAttributeNameError: If the list is empty.
        """

        if not typepy.is_empty_sequence(attr_name_list):
            for name in attr_name_list:
                pathvalidate.validate_sqlite_attr_name(name)
            return

        raise InvalidAttributeNameError("attribute name list is empty")
Beispiel #46
0
    def _write_header(self):
        """
        Write each header into its own column of the first header row.

        Does nothing when header writing is disabled or there are no headers.
        """

        if not self.is_write_header:
            return

        if typepy.is_empty_sequence(self.headers):
            return

        for col_idx, header in enumerate(self.headers):
            self.stream.write(self.first_header_row, col_idx, header)
Beispiel #47
0
    def _write_header(self):
        """Delegate to the parent implementation unless the headers are empty."""

        if not typepy.is_empty_sequence(self.headers):
            super(CsvTableWriter, self)._write_header()
 def test_normal(self, con):
     """The profile returned by ``con`` is expected to be an empty sequence."""
     assert typepy.is_empty_sequence(con.get_profile())
Beispiel #49
0
 def __validate_stats_body(self, body_line_list):
     """
     Reject an empty statistics body.

     :raises ParseError:
         With reason ``ParseErrorReason.EMPTY_STATISTICS`` when
         ``body_line_list`` is empty.
     """
     if not typepy.is_empty_sequence(body_line_list):
         return

     raise ParseError(reason=ParseErrorReason.EMPTY_STATISTICS)
Beispiel #50
0
    def _validate_headers(self):
        """
        Validate the headers of the table data.

        :raises ValueError: If the header list is empty.
        """

        if typepy.is_empty_sequence(self._tabledata.headers):
            raise ValueError("attribute name list is empty")

        # Each header is checked individually by ``_validate_header``.
        for header in self._tabledata.headers:
            self._validate_header(header)