Example #1
def _read_jsonlines_lazy(path: Union[str, Path]):
    """
    Lazily return the contents of a jsonlines file
    """
    parser = simdjson.Parser()
    with open(path) as f:
        for line in f:
            yield parser.parse(line, recursive=True)
Example #2
def _read_jsonlines_list(path: Union[str, Path]):
    """
    Read a jsonlines file into memory all at once
    """
    parser = simdjson.Parser()
    out = []
    with open(path) as f:
        for line in f:
            out.append(parser.parse(line, recursive=True))
    return out
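A minimal usage sketch for the two readers above (the data.jsonl path and the id field are assumptions); because recursive=True converts each line into plain Python objects, the shared Parser can be reused safely from line to line.

# Hypothetical usage of the readers above.
for record in _read_jsonlines_lazy("data.jsonl"):   # assumed file path
    print(record["id"])                              # assumed field name

records = _read_jsonlines_list("data.jsonl")
print(f"loaded {len(records)} records")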
Example #3
    def __init__(self, index: bytes):
        if hasattr(index, "read"):
            index = index.read()  # type:ignore

        try:
            import simdjson

            self._index = simdjson.Parser().parse(index)
        except ImportError:
            self._index = orjson.loads(index)
Example #4
def json(ds):
    """parse each line in the file to a dictionary"""
    import simdjson
    try:
        json_parser = simdjson.Parser()
        return json_parser.parse(ds)
    except ValueError:
        # fall back
        import json

        return json.loads(ds)
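A brief sketch of the fallback above: pysimdjson raises ValueError on documents containing NaN or Infinity literals, which the standard-library json module accepts, so those lines still parse. The sample inputs are assumptions.

print(json('{"score": 1.5}'))   # handled by simdjson.Parser
print(json('{"score": NaN}'))   # simdjson raises ValueError; falls back to json.loads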
Example #5
def perftest_pysimdjson_parser(jsonfile, number):
    import simdjson

    with open(jsonfile, 'rb') as f:
        jsonb = f.read()

    parser = simdjson.Parser()

    return benchmark("pysimdjson parse",
                     lambda: parser.parse(jsonb),
                     number=number)
Example #6
def test_implementation():
    """Ensure we can set the implementation."""
    parser = simdjson.Parser()
    # Ensure a rubbish implementation does not get set - simdjson does not do
    # a safety check, but pysimdjson does. A break in this check will cause
    # a segfault.
    with pytest.raises(ValueError):
        parser.implementation = 'rubbish'

    # The generic, always-available implementation.
    parser.implementation = 'fallback'
    parser.parse('{"hello": "world"}')
Example #7
    def get_bbox(self, label):
        """
    Given a label, compute an enclosing bounding box for it.

    Returns: Bbox in physical coordinates
    """
        locations = defaultdict(list)
        parser = simdjson.Parser()

        label = str(label)
        bbox = None

        if self.sql_db:
            conn = connect(self.sql_db)
            cur = conn.cursor()
            cur.execute(
                """
        select index_files.filename  
        from file_lookup, index_files
        where file_lookup.fid = index_files.id
          and file_lookup.label = ?
      """, (label, ))
            iterator = [
                self.fetch_index_files((row[0] for row in cur.fetchall()))
            ]
            conn.close()
        else:
            iterator = self.fetch_all_index_files()

        for index_files in iterator:
            for filename, content in index_files.items():
                segid_bbox_dict = parser.parse(content)
                filename = os.path.basename(filename)

                if label not in segid_bbox_dict:
                    continue

                current_bbox = Bbox.from_list(
                    np.frombuffer(
                        segid_bbox_dict[label].as_buffer(of_type="i"),
                        dtype=np.int64))

                if bbox is None:
                    bbox = current_bbox
                else:
                    bbox = Bbox.expand(bbox, current_bbox)

        return bbox
Example #8
def test_writer_substitutions():

    w = StreamWriter(
        dataset="TEST/{key}/{value}",
        inner_writer=NullWriter,
    )

    for record in DATA_SET:

        # convert to a simd object to test behavior
        as_json = orjson.dumps(record)
        parser = simdjson.Parser()
        as_simd = parser.parse(as_json)

        combinations = w.append(as_simd)
        assert combinations == as_simd["combinations"], combinations
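A self-contained sketch of the dict-to-simdjson round trip used by the test above; the record contents are an assumption.

import orjson
import simdjson

record = {"combinations": [1, 2, 3], "name": "example"}  # hypothetical record
parser = simdjson.Parser()
as_simd = parser.parse(orjson.dumps(record))  # simdjson proxy over the parsed buffer
assert as_simd["name"] == "example"
assert as_simd.as_dict() == record            # materialize back into a plain dict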
Example #9
 def __init__(self,
              input_files,
              tokenizer,
              max_seq_len,
              target_field='text',
              seed=1,
              shuffle_files=True,
              **kwargs):
     super().__init__()
     self.files = []
     self.setup_files(input_files)
     if shuffle_files:
         random.seed(seed)
         random.shuffle(self.files)
     self.create_pipeline()
     self.tokenizer = tokenizer
     self.max_seq_len = max_seq_len
     self.target_field = target_field
     self.parser = json.Parser()  # assumes simdjson is imported as json in this module
     self.idx = 0
Example #10
    def file_locations_per_label_json(self, labels, allow_missing=False):
        locations = defaultdict(list)
        parser = simdjson.Parser()
        if labels is not None:
            labels = set(toiter(labels))

        for index_files in self.fetch_all_index_files():
            for filename, content in index_files.items():
                index_labels = set(parser.parse(content).keys())
                filename = os.path.basename(filename)

                if labels is None:
                    for label in index_labels:
                        locations[int(label)].append(filename)
                elif len(labels) > len(index_labels):
                    for label in index_labels:
                        if int(label) in labels:
                            locations[int(label)].append(filename)
                else:
                    for label in labels:
                        if str(label) in index_labels:
                            locations[int(label)].append(filename)

        return locations
Example #11
 def json(ds):
     """parse each line in the file to a dictionary"""
     json_parser = simdjson.Parser()
     return json_parser.parse(ds)
Example #12
    start = time.perf_counter_ns()
    for r in reader:
        orjson.loads(r)
    return (time.perf_counter_ns() - start) / 1e9


import json
import ujson
import orjson

# import cysimdjson
import simdjson
import os
import sys

simparser = simdjson.Parser()


def sparser(o):
    return simparser.parse(o)


def simd_dump(o):
    # .mini is the minified JSON representation of a parsed simdjson document
    return o.mini


# sys.path.insert(1, os.path.join(sys.path[0], "../.."))

# print("cysimd parse:", time_it(test_parser, parser.parse))
print("pysimd parse:", time_it(test_parser, test_simd_serializer))
# print("json parse  :", time_it(test_parser, json.loads))
Example #13
def readl_simdjson(filepath: str):
    parser = simdjson.Parser()
    with open(filepath) as fp:
        # recursive=True materializes plain Python objects, so the parsed lines
        # stay valid even though the same Parser is reused for every line.
        return [parser.parse(line, recursive=True) for line in fp]
Example #14
    def query(self, bbox, allow_missing=False):
        """
    For the specified bounding box (or equivalent representation),
    list all segment ids enclosed within it.

    If allow_missing is set, then don't raise an error if an index
    file is missing.

    Returns: set(labels)
    """
        bbox = Bbox.create(bbox, context=self.physical_bounds, autocrop=True)
        original_bbox = bbox.clone()
        bbox = bbox.expand_to_chunk_size(self.chunk_size.astype(
            self.physical_bounds.dtype),
                                         offset=self.physical_bounds.minpt)

        if bbox.subvoxel():
            return []

        labels = set()
        fast_path = bbox.contains_bbox(self.physical_bounds)

        if self.sql_db and fast_path:
            conn = connect(self.sql_db)
            cur = conn.cursor()
            cur.execute("select label from file_lookup")
            while True:
                rows = cur.fetchmany(size=2**20)
                if len(rows) == 0:
                    break
                # Sqlite only stores signed integers, so we need to coerce negative
                # integers back into unsigned with a bitwise and.
                labels.update(
                    (int(row[0]) & 0xffffffffffffffff for row in rows))
            cur.close()
            conn.close()
            return labels

        index_files = self.index_file_paths_for_bbox(bbox)

        num_blocks = int(np.ceil(len(index_files) / 10000))
        for index_files_subset in tqdm(sip(index_files, 10000),
                                       total=num_blocks,
                                       desc="Block",
                                       disable=((not self.config.progress)
                                                or (num_blocks == 1))):
            results = self.fetch_index_files(index_files_subset)

            parser = simdjson.Parser()
            for filename, content in tqdm(results.items(),
                                          desc="Decoding Labels",
                                          disable=(not self.config.progress)):
                if content is None:
                    if allow_missing:
                        continue
                    else:
                        raise SpatialIndexGapError(filename +
                                                   " was not found.")

                # The bbox test saps performance a lot
                # but we can skip it if we know 100% that
                # the labels are going to be inside. This
                # optimization is important for querying
                # entire datasets, which is contemplated
                # for shard generation.
                if fast_path:
                    res = parser.parse(content).keys()
                    labels.update(
                        (int(label) for label in res))  # fast path: 16% CPU
                else:
                    res = simdjson.loads(content)
                    for label, label_bbx in res.items():
                        label = int(label)
                        label_bbx = Bbox.from_list(label_bbx)

                        if Bbox.intersects(label_bbx, original_bbox):
                            labels.add(label)

        return labels
Example #15
ds = tfds.load(name="ThePile", try_gcs=True)

# Have not tested below
ds.map(simple_tokenization, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# or 
ds.map(lambda item: simple_tokenization(item), num_parallel_calls=tf.data.experimental.AUTOTUNE)

"""

try:
    import simdjson as json
except ImportError:
    print('Installing simdjson library')
    os.system('pip install -q pysimdjson')
    import simdjson as json
    parser = json.Parser()

_DESCRIPTION = """
The Pile is a large, diverse, open source language modelling data set 
that consists of many smaller datasets combined together. 
The objective is to obtain text from as many modalities as possible to 
ensure that models trained using The Pile will have much broader generalization abilities.
We are currently developing Version 1, with an ultimate goal of 1 TiB of English text. 
After the completion of Version 1, our next goal is a fully-multilingual, 10TiB text dataset.
"""

_CITATION = """
"""
_DATASET_MODES = ["lm"]

_PILE_URL = 'http://eaidata.bmk.sh/data/pile/train/{}.jsonl.zst'
Example #16
def without_buffer(content):
    import numpy

    parser = simdjson.Parser()
    doc = parser.parse(content)
    assert len(numpy.array(doc.as_list())) == 10001
Example #17
def with_buffer(content):
    import numpy

    parser = simdjson.Parser()
    doc = parser.parse(content)
    assert len(numpy.frombuffer(doc.as_buffer(of_type='d'))) == 10001
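A self-contained sketch contrasting the two decoding paths in Examples #16 and #17, assuming the content passed in is a JSON array of 10001 doubles; as_buffer hands numpy a contiguous block of float64 values, avoiding per-element conversion.

import numpy
import simdjson

content = b"[" + b",".join(b"1.0" for _ in range(10001)) + b"]"  # assumed input shape

parser = simdjson.Parser()
doc = parser.parse(content)

via_list = numpy.array(doc.as_list())                      # element-by-element conversion
via_buffer = numpy.frombuffer(doc.as_buffer(of_type='d'))  # one contiguous copy of doubles
assert via_list.shape == via_buffer.shape == (10001,)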
Example #18
        async def go():
            io = ConsoleIO()
            sampling_output = io.section().error_output
            percent_complete = 0
            sampling_output.write_line(
                f"<comment>Sampling:</comment> {percent_complete:3.0f}%")

            current_and_max_iterations_re = re.compile(
                r"Iteration:\s+(\d+)\s+/\s+(\d+)")
            async with stan.common.HttpstanClient() as client:
                operations = []
                for payload in payloads:
                    resp = await client.post(f"/{self.model_name}/fits",
                                             json=payload)
                    if resp.status == 422:
                        raise ValueError(str(resp.json()))
                    elif resp.status != 201:
                        raise RuntimeError(resp.json()["message"])
                    assert resp.status == 201
                    operations.append(resp.json())

                # poll to get progress for each chain until all chains finished
                current_iterations = {}
                while not all(operation["done"] for operation in operations):
                    for operation in operations:
                        if operation["done"]:
                            continue
                        resp = await client.get(f"/{operation['name']}")
                        assert resp.status != 404
                        operation.update(resp.json())
                        progress_message = operation["metadata"].get(
                            "progress")
                        if not progress_message:
                            continue
                        iteration, iteration_max = map(
                            int,
                            current_and_max_iterations_re.findall(
                                progress_message).pop(0))
                        current_iterations[operation["name"]] = iteration
                        iterations_count = sum(current_iterations.values())
                        total_iterations = iteration_max * num_chains
                        percent_complete = 100 * iterations_count / total_iterations
                        sampling_output.clear() if io.supports_ansi(
                        ) else sampling_output.write("\n")
                        sampling_output.write_line(
                            f"<comment>Sampling:</comment> {round(percent_complete):3.0f}% ({iterations_count}/{total_iterations})"
                        )
                    await asyncio.sleep(0.01)

                fit_in_cache = len(current_iterations) < num_chains

                stan_outputs = []
                for operation in operations:
                    fit_name = operation["result"].get("name")
                    if fit_name is None:  # operation["result"] is an error
                        assert not str(operation["result"]["code"]).startswith(
                            "2"), operation
                        message = operation["result"]["message"]
                        if """ValueError('Initialization failed.')""" in message:
                            sampling_output.clear()
                            sampling_output.write_line(
                                "<info>Sampling:</info> <error>Initialization failed.</error>"
                            )
                            raise RuntimeError("Initialization failed.")
                        raise RuntimeError(message)

                    resp = await client.get(f"/{fit_name}")
                    if resp.status != 200:
                        raise RuntimeError((resp.json())["message"])
                    stan_outputs.append(resp.content)

                    # clean up after ourselves when fit is uncacheable (no random seed)
                    if self.random_seed is None:
                        resp = await client.delete(f"/{fit_name}")
                        if resp.status not in {200, 202, 204}:
                            raise RuntimeError((resp.json())["message"])

                sampling_output.clear() if io.supports_ansi(
                ) else sampling_output.write("\n")
                sampling_output.write_line(
                    "<info>Sampling:</info> 100%, done." if fit_in_cache else
                    f"<info>Sampling:</info> {percent_complete:3.0f}% ({iterations_count}/{total_iterations}), done."
                )
                if not io.supports_ansi():
                    sampling_output.write("\n")

            stan_outputs = tuple(
                stan_outputs)  # Fit constructor expects a tuple.

            def is_nonempty_logger_message(msg: simdjson.Object):
                return msg["topic"] == "logger" and msg["values"][0] != "info:"

            def is_iteration_or_elapsed_time_logger_message(
                    msg: simdjson.Object):
                # Assumes `msg` is a message with topic `logger`.
                text = msg["values"][0]
                return (
                    text.startswith("info:Iteration:")
                    or text.startswith("info: Elapsed Time:")
                    # this detects lines following "Elapsed Time:", part of a multi-line Stan message
                    or text.startswith("info:" + " " * 15))

            parser = simdjson.Parser()
            nonstandard_logger_messages = []
            for stan_output in stan_outputs:
                for line in stan_output.splitlines():
                    # Do not attempt to parse non-logger messages. Draws could contain nan or inf values.
                    # simdjson cannot parse lines containing such values.
                    if b'"logger"' not in line:
                        continue
                    msg = parser.parse(line)
                    if is_nonempty_logger_message(
                            msg
                    ) and not is_iteration_or_elapsed_time_logger_message(msg):
                        nonstandard_logger_messages.append(msg.as_dict())
            del parser  # simdjson.Parser is no longer used at this point.

            if nonstandard_logger_messages:
                io.error_line(
                    "<comment>Messages received during sampling:</comment>")
                for msg in nonstandard_logger_messages:
                    text = msg["values"][0].replace("info:", "  ").replace(
                        "error:", "  ")
                    if text.strip():
                        io.error_line(f"{text}")

            fit = stan.fit.Fit(
                stan_outputs,
                num_chains,
                self.param_names,
                self.constrained_param_names,
                self.dims,
                num_warmup,
                num_samples,
                num_thin,
                save_warmup,
            )

            for entry_point in stan.plugins.get_plugins():
                Plugin = entry_point.load()
                fit = Plugin().on_post_sample(fit)
            return fit
Example #19
    def to_sqlite(self,
                  database_name="spatial_index.db",
                  create_indices=True,
                  progress=None):
        """
    Create a sqlite database of labels and filenames
    from the JSON spatial_index for faster performance.

    Depending on the dataset size, this could take a while.
    With a dataset with ~140k index files, the DB took over
    an hour to build and was 42 GB.
    """
        progress = nvl(progress, self.config.progress)

        conn = sqlite3.connect(database_name)
        cur = conn.cursor()

        cur.execute("""
    CREATE TABLE index_files (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      filename TEXT NOT NULL
    )
    """)
        cur.execute("CREATE INDEX idxfname ON index_files (filename)")

        cur.execute("""
    CREATE TABLE file_lookup (
      label INTEGER NOT NULL,
      fid INTEGER NOT NULL REFERENCES index_files(id),
      PRIMARY KEY(label,fid)
    )
    """)

        cur.execute("PRAGMA journal_mode = MEMORY")
        cur.execute("PRAGMA synchronous = OFF")

        parser = simdjson.Parser()

        for index_files in self.fetch_all_index_files(progress=progress):
            for filename, content in index_files.items():
                index_labels = parser.parse(content).keys()
                filename = os.path.basename(filename)
                cur.execute("INSERT INTO index_files(filename) VALUES (?)",
                            (filename, ))
                cur.execute("SELECT id from index_files where filename = ?",
                            (filename, ))
                fid = cur.fetchone()[0]
                values = ((int(label), fid) for label in index_labels)
                cur.executemany(
                    "INSERT INTO file_lookup(label, fid) VALUES (?,?)", values)
            conn.commit()

        cur.execute("PRAGMA journal_mode = DELETE")
        cur.execute("PRAGMA synchronous = FULL")

        if create_indices:
            if progress:
                print("Creating labels index...")
            cur.execute("CREATE INDEX file_lbl ON file_lookup (label)")

            if progress:
                print("Creating filename index...")
            cur.execute("CREATE INDEX fname ON file_lookup (fid)")

        conn.close()
Example #20
    def __init__(
        self,
        stan_outputs: Tuple[bytes, ...],
        num_chains: int,
        param_names: Tuple[str, ...],
        constrained_param_names: Tuple[str, ...],
        dims: Tuple[Tuple[int, ...]],
        num_warmup: int,
        num_samples: int,
        num_thin: int,
        save_warmup: bool,
    ) -> None:
        self.stan_outputs = stan_outputs
        self.num_chains = num_chains
        assert self.num_chains == len(self.stan_outputs)
        self.param_names, self.dims, self.constrained_param_names = (
            param_names,
            dims,
            constrained_param_names,
        )
        self.num_warmup, self.num_samples = num_warmup, num_samples
        self.num_thin, self.save_warmup = num_thin, save_warmup

        # `self.sample_and_sampler_param_names` collects the sample and sampler param names.
        # - "sample params" include `lp__`, `accept_stat__`
        # - "sampler params" include `stepsize__`, `treedepth__`, ...
        # These names are gathered later in this function by inspecting the output from Stan.
        self.sample_and_sampler_param_names: Tuple[str, ...]

        num_flat_params = sum(np.product(
            dims_ or 1) for dims_ in dims)  # if dims == [] then it is a scalar
        assert num_flat_params == len(constrained_param_names)
        num_samples_saved = (self.num_samples + self.num_warmup *
                             self.save_warmup) // self.num_thin

        # self._draws holds all the draws. We cannot allocate it before looking at the draws
        # because we do not know how many sampler-specific parameters are present. Later in this
        # function we count them and only then allocate the array for `self._draws`.
        #
        # _draws is an ndarray with shape (num_sample_and_sampler_params + num_flat_params, num_draws, num_chains)
        self._draws: np.ndarray

        parser = simdjson.Parser()
        for chain_index, stan_output in zip(range(self.num_chains),
                                            self.stan_outputs):
            draw_index = 0
            for line in stan_output.splitlines():
                try:
                    msg = parser.parse(line)
                except ValueError:
                    # Occurs when draws contain an nan or infinity. simdjson cannot parse such values.
                    msg = json.loads(line)
                if msg["topic"] == "sample":
                    # Ignore sample message which is mixed together with proper draws.
                    if not isinstance(msg["values"], (simdjson.Object, dict)):
                        continue

                    # for the first draw: collect sample and sampler parameter names.
                    if not hasattr(self, "_draws"):
                        feature_names = cast(Tuple[str, ...],
                                             tuple(msg["values"].keys()))
                        self.sample_and_sampler_param_names = tuple(
                            name for name in feature_names
                            if name.endswith("__"))
                        num_rows = len(self.sample_and_sampler_param_names
                                       ) + num_flat_params
                        # column-major order ("F") aligns with how the draws are stored (in cols).
                        self._draws = np.empty(
                            (num_rows, num_samples_saved, num_chains),
                            order="F")
                        # rudimentary check of parameter order (sample & sampler params must be first)
                        if num_flat_params and feature_names[-1].endswith(
                                "__"):
                            raise RuntimeError(
                                f"Expected last parameter name to be one declared in program code, found `{feature_names[-1]}`"
                            )

                    draw_row = tuple(msg["values"].values(
                    ))  # a "row" of values from a single draw from Stan C++
                    self._draws[:, draw_index, chain_index] = draw_row
                    draw_index += 1
            assert draw_index == num_samples_saved
        assert self.sample_and_sampler_param_names and self._draws.size
        self._draws.flags["WRITEABLE"] = False
Example #21
        async def go():
            io = ConsoleIO()
            io.error_line("<info>Sampling...</info>")
            progress_bar = ProgressBar(io)
            progress_bar.set_format("very_verbose")

            current_and_max_iterations_re = re.compile(
                r"Iteration:\s+(\d+)\s+/\s+(\d+)")
            async with stan.common.HttpstanClient() as client:
                operations = []
                for payload in payloads:
                    resp = await client.post(f"/{self.model_name}/fits",
                                             json=payload)
                    if resp.status == 422:
                        raise ValueError(str(resp.json()))
                    elif resp.status != 201:
                        raise RuntimeError(resp.json()["message"])
                    assert resp.status == 201
                    operations.append(resp.json())

                # poll to get progress for each chain until all chains finished
                current_iterations = {}
                while not all(operation["done"] for operation in operations):
                    for operation in operations:
                        if operation["done"]:
                            continue
                        resp = await client.get(f"/{operation['name']}")
                        assert resp.status != 404
                        operation.update(resp.json())
                        progress_message = operation["metadata"].get(
                            "progress")
                        if not progress_message:
                            continue
                        iteration, iteration_max = map(
                            int,
                            current_and_max_iterations_re.findall(
                                progress_message).pop(0))
                        if not progress_bar.get_max_steps(
                        ):  # i.e., has not started
                            progress_bar.start(max=iteration_max * num_chains)
                        current_iterations[operation["name"]] = iteration
                        progress_bar.set_progress(
                            sum(current_iterations.values()))
                    await asyncio.sleep(0.01)
                # Sampling has finished. But we do not call `progress_bar.finish()` right
                # now. First we write informational messages to the screen, then we
                # redraw the (complete) progress bar. Only after that do we call `finish`.

                stan_outputs = []
                for operation in operations:
                    fit_name = operation["result"].get("name")
                    if fit_name is None:  # operation["result"] is an error
                        assert not str(operation["result"]["code"]).startswith(
                            "2"), operation
                        raise RuntimeError(operation["result"]["message"])
                    resp = await client.get(f"/{fit_name}")
                    if resp.status != 200:
                        raise RuntimeError((resp.json())["message"])
                    stan_outputs.append(resp.content)

                    # clean up after ourselves when fit is uncacheable (no random seed)
                    if self.random_seed is None:
                        resp = await client.delete(f"/{fit_name}")
                        if resp.status not in {200, 202, 204}:
                            raise RuntimeError((resp.json())["message"])

            stan_outputs = tuple(
                stan_outputs)  # Fit constructor expects a tuple.

            def is_nonempty_logger_message(msg: simdjson.Object):
                return msg["topic"] == "logger" and msg["values"][0] != "info:"

            def is_iteration_or_elapsed_time_logger_message(
                    msg: simdjson.Object):
                # Assumes `msg` is a message with topic `logger`.
                text = msg["values"][0]
                return (
                    text.startswith("info:Iteration:")
                    or text.startswith("info: Elapsed Time:")
                    # this detects lines following "Elapsed Time:", part of a multi-line Stan message
                    or text.startswith("info:" + " " * 15))

            parser = simdjson.Parser()
            nonstandard_logger_messages = []
            for stan_output in stan_outputs:
                for line in stan_output.splitlines():
                    # Do not attempt to parse non-logger messages. Draws could contain nan or inf values.
                    # simdjson cannot parse lines containing such values.
                    if b'"logger"' not in line:
                        continue
                    msg = parser.parse(line)
                    if is_nonempty_logger_message(
                            msg
                    ) and not is_iteration_or_elapsed_time_logger_message(msg):
                        nonstandard_logger_messages.append(msg.as_dict())
            del parser  # simdjson.Parser is no longer used at this point.

            progress_bar.clear()
            io.error("\x08" * progress_bar._last_messages_length
                     )  # move left to start of line
            if nonstandard_logger_messages:
                io.error_line(
                    "<comment>Messages received during sampling:</comment>")
                for msg in nonstandard_logger_messages:
                    text = msg["values"][0].replace("info:", "  ").replace(
                        "error:", "  ")
                    if text.strip():
                        io.error_line(f"{text}")
            progress_bar.display()  # re-draw the (complete) progress bar
            progress_bar.finish()
            io.error_line("\n<info>Done.</info>")

            fit = stan.fit.Fit(
                stan_outputs,
                num_chains,
                self.param_names,
                self.constrained_param_names,
                self.dims,
                num_warmup,
                num_samples,
                num_thin,
                save_warmup,
            )

            for entry_point in stan.plugins.get_plugins():
                Plugin = entry_point.load()
                fit = Plugin().on_post_fit(fit)
            return fit
Example #22
    def _to_sql_common(self,
                       conn,
                       cur,
                       create_indices,
                       progress,
                       mysql_syntax=False):
        # handle SQLite vs MySQL syntax quirks
        BIND = '%s' if mysql_syntax else '?'
        AUTOINC = "AUTO_INCREMENT" if mysql_syntax else "AUTOINCREMENT"
        INTEGER = "BIGINT UNSIGNED" if mysql_syntax else "INTEGER"

        progress = nvl(progress, self.config.progress)
        cur.execute("""DROP TABLE IF EXISTS index_files""")
        cur.execute("""DROP TABLE IF EXISTS file_lookup""")

        cur.execute(f"""
      CREATE TABLE index_files (
        id {INTEGER} PRIMARY KEY {AUTOINC},
        filename VARCHAR(100) NOT NULL
      )
    """)
        cur.execute("CREATE INDEX idxfname ON index_files (filename)")

        cur.execute(f"""
      CREATE TABLE file_lookup (
        label {INTEGER} NOT NULL,
        fid {INTEGER} NOT NULL REFERENCES index_files(id),
        PRIMARY KEY(label,fid)
      )
    """)

        parser = simdjson.Parser()

        for index_files in self.fetch_all_index_files(progress=progress):
            for filename, content in index_files.items():
                index_labels = parser.parse(content).keys()
                filename = os.path.basename(filename)
                cur.execute(
                    f"INSERT INTO index_files(filename) VALUES ({BIND})",
                    (filename, ))
                cur.execute(
                    f"SELECT id from index_files where filename = {BIND}",
                    (filename, ))
                fid = cur.fetchone()[0]
                values = ((int(label), fid) for label in index_labels)
                if mysql_syntax:
                    values = list(
                        values)  # doesn't support generators in v8.0.26
                cur.executemany(
                    f"INSERT INTO file_lookup(label, fid) VALUES ({BIND},{BIND})",
                    values)
            conn.commit()

        if create_indices:
            if progress:
                print("Creating labels index...")
            cur.execute("CREATE INDEX file_lbl ON file_lookup (label)")

            if progress:
                print("Creating filename index...")
            cur.execute("CREATE INDEX fname ON file_lookup (fid)")