Example #1
def unpack_directory_if_needed(path: str) -> str:
    """
    If path is a tarball, unpack it. If path doesn't exist but there is a
    tarball with the same name, unpack it.

    Parameters
    ----------
    path : str
        Path to directory or tarball.

    Returns
    -------
    str
        Path to directory.

    Raises
    ------
    clgen.InternalError
        If unable to extract archive.
    """
    if fs.isdir(path):
        return path

    if fs.isfile(path) and path.endswith(".tar.bz2"):
        log.info("unpacking '{}'".format(path))
        tar.unpack_archive(path)
        return re.sub(r'\.tar\.bz2$', '', path)

    if fs.isfile(path + ".tar.bz2"):
        log.info("unpacking '{}'".format(path + ".tar.bz2"))
        tar.unpack_archive(path + ".tar.bz2")
        return path

    raise clgen.InternalError("cannot interpret archive '{path}'"
                              .format(**vars()))
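
A rough standard-library sketch of the same unpack-if-needed logic, without the project's fs/tar/clgen helpers (paths here are hypothetical; shutil.unpack_archive handles .tar.bz2 via its built-in "bztar" format):

import os
import shutil

def unpack_if_needed(path: str) -> str:
    """Return a directory path, unpacking path + '.tar.bz2' if necessary."""
    if os.path.isdir(path):
        return path
    if os.path.isfile(path) and path.endswith(".tar.bz2"):
        # extract next to the archive and return the stripped directory name
        shutil.unpack_archive(path, os.path.dirname(path) or ".")
        return path[:-len(".tar.bz2")]
    if os.path.isfile(path + ".tar.bz2"):
        shutil.unpack_archive(path + ".tar.bz2", os.path.dirname(path) or ".")
        return path
    raise FileNotFoundError(f"cannot interpret archive '{path}'")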
Example #2
def print_bytecode_features(db_path: str) -> None:
    """
    Print Bytecode features.

    Arguments:
        db_path: Path to dataset.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT sha,contents FROM Bytecodes')
    query = c.fetchall()

    uniq_features = set()
    for row in query:
        sha, contents = row

        features = bytecode_features(contents)
        # Add the table key
        features['sha'] = sha
        for key in features.keys():
            uniq_features.add(key)

    log.info('Features:')
    for feature in uniq_features:
        log.info('        ', feature)
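
dbutil.connect and bytecode_features are clgen internals; the pattern itself is just a set union over per-row feature dicts. A minimal sketch with the standard sqlite3 module and a stand-in feature extractor (the Bytecodes table layout is assumed from the example above):

import sqlite3

def stub_features(contents: str) -> dict:
    # stand-in for bytecode_features(): any dict of feature name -> value
    return {"length": len(contents)}

def print_feature_names(db_path: str) -> None:
    db = sqlite3.connect(db_path)
    rows = db.execute("SELECT sha,contents FROM Bytecodes").fetchall()
    db.close()

    names = set()
    for sha, contents in rows:
        features = stub_features(contents)
        features["sha"] = sha           # the table key, as above
        names.update(features.keys())   # union of every key seen

    for name in sorted(names):
        print("  ", name)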
Example #3
def unpack_directory_if_needed(path: str) -> str:
    """
    If path is a tarball, unpack it. If path doesn't exist but there is a
    tarball with the same name, unpack it.

    Arguments:
        path (str): Path to directory or tarball.

    Returns:
        str: Path to directory.
    """
    if fs.isdir(path):
        return path

    if fs.isfile(path) and path.endswith(".tar.bz2"):
        log.info("unpacking '{}'".format(path))
        clgen.unpack_archive(path)
        return re.sub(r'\.tar\.bz2$', '', path)

    if fs.isfile(path + ".tar.bz2"):
        log.info("unpacking '{}'".format(path + ".tar.bz2"))
        clgen.unpack_archive(path + ".tar.bz2")
        return path

    return path
Example #4
def preprocess_inplace(paths: List[str],
                       max_num_workers: int = cpu_count(),
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Arguments:
        paths (str[]): List of paths.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("Failed to process files")

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except OSError as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths,
                           max_num_workers=max_num_workers,
                           attempt=attempt + 1)
Example #5
def remove_bad_preprocessed(db_path: str) -> None:
    """
    Remove all ugly and bad contents from PreprocessedFiles table.

    Parameters
    ----------
    db_path : str
        Dataset.
    """
    original_size = fs.du(db_path, human_readable=False)
    original_size_human_readable = fs.du(db_path, human_readable=True)
    log.info("vacuuming", original_size_human_readable, "database")
    sys.stdout.flush()

    # Remove contents from bad or ugly preprocessed files.
    db = connect(db_path)
    c = db.cursor()
    c.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
              "WHERE status=1 OR status=2")
    db.commit()
    c.close()
    db.close()

    db = connect(db_path)
    c = db.cursor()
    c.execute("VACUUM")
    db.commit()
    c.close()

    new_size = fs.du(db_path, human_readable=False)
    new_size_human_readable = fs.du(db_path, human_readable=True)
    reduction_ratio = (1 - (new_size / original_size)) * 100
    log.info("done. new size {}. ({:.0f}% reduction)"
             .format(new_size_human_readable, reduction_ratio), sep=".")
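
The pattern here is: blank out the rejected rows, commit, then VACUUM so SQLite actually rebuilds the file and returns the freed space (VACUUM has to run outside an open transaction, which is why the example commits and reconnects first). A minimal sketch with plain sqlite3 and os.path.getsize, with the table name and status codes assumed from the example above:

import os
import sqlite3

def blank_and_vacuum(db_path: str) -> None:
    original_size = os.path.getsize(db_path)

    db = sqlite3.connect(db_path)
    # blank the rejected rows (status 1 and 2, as in the example above)
    db.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
               "WHERE status=1 OR status=2")
    db.commit()
    # VACUUM rewrites the database file, reclaiming the freed pages;
    # the commit above ensures no transaction is still open
    db.execute("VACUUM")
    db.close()

    new_size = os.path.getsize(db_path)
    print("done. {:.0f}% reduction".format((1 - new_size / original_size) * 100))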
Example #6
    def _main(infile: TextIO, vocab: str, size: bool) -> None:
        atoms = corpus.atomize(infile.read(), vocab=vocab)

        if size:
            log.info("size:", len(atoms))
        else:
            log.info('\n'.join(atoms))
Example #7
    def sample(self, model: clgen.Model) -> None:
        """
        Sample CLgen model.

        Parameters
        ----------
        model : clgen.Model
            CLgen model.
        """
        cache = self.cache(model)

        # create samples database if it doesn't exist
        if not cache.get("kernels.db"):
            tmp_kernels_db = cache.keypath("kernels.tmp.db")
            dbutil.create_db(tmp_kernels_db)
            cache["kernels.db"] = tmp_kernels_db

        # producer-consumer queue
        queue = Queue(maxsize=128)

        log.info("sampling", self)

        sampler = SampleProducer(model, self.start_text, queue,
                                 **self.kernel_opts)
        sampler.start()

        consumer = SampleConsumer(cache["kernels.db"], sampler, self, cache,
                                  queue, **self.sampler_opts)
        consumer.start()

        sampler.join()
        consumer.join()

        clgen.explore(cache["kernels.db"])
Example #8
def merge(outpath, inpaths=None):
    """
    Merge kernel datasets.
    """
    from clgen import explore

    if not fs.isfile(outpath):
        create_db(outpath)
        log.info("created", outpath)

    db = connect(outpath)

    if not inpaths:
        inpaths = get_all_sampler_datasets()

    for inpath in inpaths:
        log.info("merging from", inpath)
        c = db.cursor()
        c.execute("ATTACH '{}' AS rhs".format(inpath))
        c.execute("INSERT OR IGNORE INTO ContentFiles "
                  "SELECT * FROM rhs.ContentFiles")
        c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                  "SELECT * FROM rhs.PreprocessedFiles")
        db.commit()
        c.execute("DETACH rhs")
        c.close()

    explore.explore(outpath)
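
The merge relies on SQLite's ATTACH to address two databases from one connection, with INSERT OR IGNORE skipping rows whose primary keys already exist. A stripped-down sketch with plain sqlite3 (table names assumed from the example; binding the path as a parameter avoids the quoting issues of interpolating it into the ATTACH statement):

import sqlite3

def merge_into(outpath: str, inpaths) -> None:
    db = sqlite3.connect(outpath)
    for inpath in inpaths:
        # attach the source database under the alias "rhs"
        db.execute("ATTACH DATABASE ? AS rhs", (inpath,))
        # copy rows, silently skipping any whose keys already exist
        db.execute("INSERT OR IGNORE INTO ContentFiles "
                   "SELECT * FROM rhs.ContentFiles")
        db.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                   "SELECT * FROM rhs.PreprocessedFiles")
        db.commit()
        db.execute("DETACH DATABASE rhs")
    db.close()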
Example #9
 def _init_error(err: Exception, files_to_rm: List[str]=[]) -> None:
     """ tidy up in case of error """
     log.error("corpus creation failed. Deleting corpus files")
     for path in files_to_rm:
         if fs.exists(path):
             log.info("removing", path)
             fs.rm(path)
     raise err
Example #10
 def _init_error(err: Exception) -> None:
     """ tidy up in case of error """
     log.error("corpus creation failed. Deleting corpus files")
     paths = [
         fs.path(self.contentcache.path, "kernels.db"),
         fs.path(self.cache.path, "corpus.txt"),
         fs.path(self.cache.path, "tensor.npy"),
         fs.path(self.cache.path, "atomizer.pkl")
     ]
     for path in paths:
         if fs.exists(path):
             log.info("removing", path)
             fs.rm(path)
     raise err
Example #11
def preprocess_inplace(paths: List[str],
                       max_num_workers: int = cpu_count(),
                       max_attempts: int = 100,
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Parameters
    ----------
    paths : List[str]
        List of paths.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts will be
        made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"Failed to process files after {max_attempts} attempts")
    elif attempt > 1:
        log.warning("preprocess attempt #.", attempt)

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except (OSError, TimeoutError) as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths,
                           max_num_workers=max_num_workers,
                           attempt=attempt + 1,
                           max_attempts=max_attempts)
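
Stripped of the clgen specifics, the retry strategy is: run a multiprocessing.Pool, and if the OS refuses to spawn that many processes (or a worker times out), halve the worker count and try again, up to max_attempts. A self-contained sketch with a dummy worker:

from multiprocessing import Pool, cpu_count

def _work(item):
    # stand-in for the real per-file worker; must be a module-level function
    return item * item

def map_with_backoff(items, max_num_workers=cpu_count(),
                     max_attempts=100, attempt=1):
    if attempt > max_attempts:
        raise RuntimeError(f"failed after {max_attempts} attempts")

    num_workers = max(1, min(len(items), max_num_workers))
    try:
        with Pool(num_workers) as pool:
            return pool.map(_work, items)
    except (OSError, TimeoutError):
        # retry with half as many workers, down to a single process
        return map_with_backoff(items,
                                max_num_workers=max(num_workers // 2, 1),
                                max_attempts=max_attempts,
                                attempt=attempt + 1)

if __name__ == "__main__":
    print(map_with_backoff(list(range(8))))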
Example #12
        def _main() -> None:
            cache = clgen.cachepath()

            log.warning("Not Implemented: refresh corpuses")

            if fs.isdir(cache, "model"):
                cached_modeldirs = fs.ls(fs.path(cache, "model"), abspaths=True)
                for cached_modeldir in cached_modeldirs:
                    cached_model_id = fs.basename(cached_modeldir)
                    cached_meta = jsonutil.read_file(fs.path(cached_modeldir, "META"))

                    model = clgen.Model.from_json(cached_meta)

                    if cached_model_id != model.hash:
                        log.info(cached_model_id, '->', model.hash)

                        if fs.isdir(model.cache.path):
                            log.fatal("cache conflict", file=sys.stderr)

                        fs.mv(cached_modeldir, model.cache.path)

            log.warning("Not Implemented: refresh samplers")
Example #13
    def sample(self, model: Model, quiet: bool = False) -> None:
        """
        Sample CLgen model.

        Arguments:
            model (Model): CLgen model.
        """
        cache = self.cache(model)

        # create samples database if it doesn't exist
        if not cache["kernels.db"]:
            dbutil.create_db(fs.path(cache.path, "kernels.tmp.db"))
            cache["kernels.db"] = fs.path(cache.path, "kernels.tmp.db")

        batch_i = 0
        while True:
            # stop if we have enough kernels
            has_max_kernels = self.max_kernels >= 0
            num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
            if has_max_kernels and num_good_kernels >= self.max_kernels:
                return

            # stop if we've done enough batches
            has_max_batches = self.max_batches >= 0
            if has_max_batches and batch_i >= self.max_batches:
                return

            batch_i += 1
            print("sample batch", batch_i, "...")

            self.sample_iteration(model, quiet=quiet)

            print()
            explore(self.cache(model)["kernels.db"])

        log.info("samples database:", cache["kernels.db"])
Example #14
def preprocess_file(path: str, inplace: bool = False) -> None:
    """
    Preprocess a file.

    Prints output to stdout by default. If preprocessing fails, this function
    exits.

    Arguments:
        path (str): String path to file.
        inplace (bool, optional): If True, overwrite input file.
    """
    with open(path) as infile:
        contents = infile.read()
    try:
        out = preprocess(contents)
        if inplace:
            with open(path, 'w') as outfile:
                outfile.write(out)
        else:
            log.info('preprocess', out)
    except BadCodeException as e:
        log.fatal(e, ret=1)
    except UglyCodeException as e:
        log.fatal(e, ret=2)
Example #15
def _dump_db(db, out_path: str, gh: bool=False, fileid: bool=False,
             reverse: bool=False, input_samples: bool=False, status: int=0,
             eof: bool=False, dir: bool=False) -> None:
    """
    Dump database contents.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub.
    fileid : bool, optional
        Include file IDs.
    reverse : bool, optional
        Reverse ordering of output.
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status.
    eof : bool, optional
        Include EOF separators.
    dir : bool, optional
        Write output to directory.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, kid_to_path(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
Example #16
    def train(self, quiet: bool = False) -> None:
        """
        Train model.
        """
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decary_rate"]
        checkpoint_path = fs.path(self.cache.path, "model.ckpt")

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check if all necessary files exist
            assert (fs.isdir(self.checkpoint_path))
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert (ckpt)
            assert (ckpt.model_checkpoint_path)
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.info("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            start_batch = sess.run(self.epoch) * self.corpus.num_batches
            batch_count = 0
            total_elapsed = 0
            total_atomize = 0
            total_checkpoint, avg_checkpoint = 0, 0
            eta_d, eta_h, eta_m = 0, 0, 0

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                if quiet:
                    log.info("epoch", e, "of", self.epochs + 1)

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0)**(e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                time_start = time.time()
                self.corpus.create_batches()
                total_atomize += time.time() - time_start
                avg_atomize = total_atomize / e

                state = sess.run(self.initial_state)
                for b in range(self.corpus.num_batches):
                    time_start = time.time()
                    batch_count += 1
                    x, y = self.corpus.next_batch()
                    feed = {self.input_data: x, self.targets: y}
                    for i, (c, h) in enumerate(self.initial_state):
                        feed[c] = state[i].c
                        feed[h] = state[i].h
                    train_loss, state, _ = sess.run(
                        [self.cost, self.final_state, self.train_op], feed)
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    max_batch = self.epochs * self.corpus.num_batches

                    progress = float((batch_num + 1 - start_batch) /
                                     (max_batch - start_batch))

                    time_end = time.time()
                    elapsed = time_end - time_start

                    if not quiet:
                        total_elapsed += elapsed
                        avg_elapsed = total_elapsed / batch_count
                        remaining_time = (
                            (max_batch - batch_count) * avg_elapsed
                            +  # batches
                            (e - self.epochs) * avg_atomize +  # atomizings
                            (e - self.epochs) * avg_checkpoint)  # checkpoints
                        eta_h, eta_m = divmod(remaining_time / 60, 60)
                        eta_d, eta_h = divmod(eta_h, 24)

                        print("\r\033[K"
                              "{progress:3.1f}% | "
                              "{size}x{layers}x{max_epoch} {model} | "
                              "epoch={epoch_num}/{max_epoch} | "
                              "batch={batch_num}/{max_batch} | "
                              "lr={lr:.5f} | "
                              "loss={tloss:.3f} | "
                              "t1={time_atomize:.3f}s "
                              "t2={time_batch:.3f}s "
                              "t3={time_checkpoint:.3f}s | "
                              "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                                  size=self.rnn_size,
                                  layers=self.num_layers,
                                  model=self.model_type.upper(),
                                  progress=progress * 100,
                                  epoch_num=e,
                                  max_epoch=self.epochs,
                                  batch_num=b + 1,
                                  max_batch=self.corpus.num_batches,
                                  lr=new_learning_rate,
                                  tloss=train_loss,
                                  time_atomize=avg_atomize,
                                  time_batch=avg_elapsed,
                                  time_checkpoint=avg_checkpoint,
                                  eta_d=int(eta_d),
                                  eta_h=int(eta_h),
                                  eta_m=int(eta_m)),
                              end="")

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # last epoch
                if save:
                    if not quiet:
                        print()
                    time_start = time.time()
                    saver.save(sess, checkpoint_path, global_step=batch_num)
                    total_checkpoint += time.time() - time_start
                    avg_checkpoint = total_checkpoint / e
                    log.info("model saved to {}".format(checkpoint_path))
Example #17
 def _main(model_file: TextIO) -> None:
     model_json = jsonutil.loads(model_file.read())
     model = clgen.Model.from_json(model_json)
     model.train()
     log.info("done.")
Example #18
 def _main(db_file: BinaryIO, paths: List[Path]) -> None:
     clgen.fetch(db_file.name, paths)
     log.info("done.")
Example #19
    def _locked_train(self) -> 'Model':
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decay_rate"]

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check that all necessary files exist
            assert(fs.isdir(self.checkpoint_path))
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert(ckpt)
            assert(ckpt.model_checkpoint_path)
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.verbose("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            coord = tf.train.Coordinator()
            self.corpus.create_batches()
            threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

            max_batch = self.epochs * self.corpus.num_batches

            # progress bar
            bar = progressbar.ProgressBar(max_value=max_batch)

            if sess.run(self.epoch) != self.epochs:
                log.info("training", self)

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                epoch_start = time()

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0) ** (e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                for b in range(self.corpus.num_batches):
                    train_cost, _, state, _ = sess.run([self.cost, self.KL_cost, self.final_state, self.train_op])
                    # update progress bar
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    bar.update(batch_num)

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # always save on last epoch
                if save:
                    saver.save(sess, self.cache.keypath("model.ckpt"),
                               global_step=batch_num)

                    next_checkpoint = e * self.corpus.num_batches + b
                    max_epoch = self.epochs
                    log.verbose("\n{self} epoch {e} / {max_epoch}. "
                                "next checkpoint at batch {next_checkpoint}"
                                .format(**vars()))

                    # update training time
                    epoch_duration = time() - epoch_start
                    self.stats["epoch_costs"].append(float(train_cost))
                    self.stats["epoch_times"].append(epoch_duration)
                    self.stats["epoch_batches"].append(batch_num + 1)
                    self._flush_meta()
            coord.request_stop()
        return self
Example #20
def preprocess_contentfiles(db_path: str,
                            max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to preprocess,
        # and a JSON file to write results into
        jobs = [{
            "db_in":
            db_path,
            "db_index_range":
            (offset + i * files_per_worker,
             offset + i * files_per_worker + files_per_worker),
            "json_out":
            fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker threads '
                 'to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception as e:
            _finalize(db_path, cache)
            raise e
        _finalize(db_path, cache)
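
The chunking arithmetic above caps each job at 512 files and then hands every worker a contiguous index range. The same split in isolation, with made-up numbers:

import math

def split_jobs(num_files: int, max_num_workers: int, max_jobsize: int = 512):
    """Yield (offset, per-worker index ranges) for each job."""
    if not num_files:
        return
    jobsize = min(max_jobsize, num_files)
    for offset in range(0, num_files, jobsize):
        num_workers = min(num_files, max_num_workers)
        per_worker = math.ceil(jobsize / num_workers)
        # note: as in the example, the last job's ranges may run past num_files
        yield offset, [(offset + i * per_worker,
                        offset + (i + 1) * per_worker)
                       for i in range(num_workers)]

# e.g. 1200 files, 4 workers: jobs start at offsets 0, 512 and 1024,
# each split into 4 ranges of 128 indices
for offset, ranges in split_jobs(1200, 4):
    print(offset, ranges)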
Example #21
def _preprocess_db(db_path: str,
                   max_num_workers: int = cpu_count(),
                   max_attempts: int = 100,
                   attempt: int = 1,
                   **preprocess_opts) -> None:
    """
    Preprocess OpenCL dataset.

    Parameters
    ----------
    db_path : str
        OpenCL kernels dataset.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts will be
        made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"failed to preprocess files after {max_attempts} attempts")

    log.verbose("determining jobs")

    contentfiles = set(dbutil.kernel_ids(db_path, "ContentFiles"))
    preprocessedfiles = set(dbutil.kernel_ids(db_path, "PreprocessedFiles"))

    ncontentfiles = len(contentfiles)
    npreprocessedfiles = len(preprocessedfiles)

    todo = contentfiles - preprocessedfiles
    ntodo = len(todo)

    # check we have something to do
    if not ntodo:
        return

    todo_ratio = ntodo / ncontentfiles

    log.info("{ntodo} ({todo_ratio:.1%}) samples need preprocessing".format(
        **vars()))

    log.verbose("creating jobs")

    # Determine if we need to inline kernels when creating jobs
    db = sqlite3.connect(db_path)
    c = db.cursor()
    c.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='ContentMeta';"
    )
    meta_table = c.fetchone()
    c.close()
    db.close()
    if meta_table:
        get_kernel = lambda kid: dbutil.get_inlined_kernel(
            db_path, kid, lang=preprocess_opts["lang"])
    else:
        get_kernel = lambda kid: dbutil.get_kernel(
            db_path, kid, table="ContentFiles")

    # create jobs
    jobs = [{
        "id": kid,
        "src": get_kernel(kid),
        "preprocess_opts": preprocess_opts,
    } for kid in todo]

    random.shuffle(jobs)

    # split size
    worker_njobs = math.ceil(ntodo / max_num_workers)

    # producer-consumer queue
    queue = Queue(maxsize=128)

    log.verbose(f"assigning {ntodo} jobs to {max_num_workers} threads")

    try:
        # our worker threads. these busy little bees will do the heavy lifting
        # of preprocessing the contentfiles, pushing their results onto
        # the queue
        producers = [
            PreprocessWorker(jobs[i:i + worker_njobs], queue)
            for i in range(0, ntodo, worker_njobs)
        ]

        # fly, my pretties, fly!
        for producer in producers:
            producer.start()

        # consume the results from the worker threads from the main thread
        for i in progressbar.ProgressBar()(range(ntodo)):
            # pull a fresh result from the queue (block if necessary)
            try:
                result = queue.get(timeout=90)
            except QueueEmpty as e:
                raise TimeoutError('failed to fetch result after 90 seconds. '
                                   'something went wrong') from e

            # insert result into database
            db = dbutil.connect(db_path)
            c = db.cursor()
            c.execute("INSERT INTO PreprocessedFiles VALUES(?,?,?)",
                      (result["id"], result["status"], result["contents"]))
            c.close()
            db.commit()
            db.close()

        for producer in producers:
            producer.join()

    except (OSError, TimeoutError) as e:
        log.error(e)

        if attempt > 2 and not i:
            log.warning("no progress has been made since previous attempt. "
                        "I'm not going to try another attempt.")
            return

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        _preprocess_db(db_path,
                       max_num_workers=max_num_workers,
                       attempt=attempt + 1,
                       max_attempts=max_attempts,
                       **preprocess_opts)
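
The concurrency shape here is a classic producer-consumer: worker threads push results onto a bounded queue, and the main thread drains it with a timeout so a wedged worker surfaces as a TimeoutError instead of a silent hang. A database-free sketch (the 90-second timeout mirrors the example above; the producer is a stand-in for PreprocessWorker):

import math
import queue
import threading

def producer(jobs, out_queue):
    # process each job and push the result onto the shared queue
    for job in jobs:
        out_queue.put({"id": job, "status": 0, "contents": str(job)})

def run(jobs, num_workers=4, timeout=90):
    out_queue = queue.Queue(maxsize=128)
    njobs_per_worker = max(1, math.ceil(len(jobs) / num_workers))
    workers = [threading.Thread(target=producer,
                                args=(jobs[i:i + njobs_per_worker], out_queue))
               for i in range(0, len(jobs), njobs_per_worker)]
    for worker in workers:
        worker.start()

    results = []
    for _ in range(len(jobs)):
        try:
            # block for a result; an empty queue after the timeout means a stall
            results.append(out_queue.get(timeout=timeout))
        except queue.Empty as e:
            raise TimeoutError(f"no result within {timeout} seconds") from e

    for worker in workers:
        worker.join()
    return results

print(len(run(list(range(10)))))   # -> 10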
Example #22
def _preprocess_inplace_worker(path: str) -> None:
    """worker function for preprocess_inplace()"""
    log.info('preprocess', path)
    preprocess_file(path, inplace=True)