Example 1
def content_db(db_path: str, in_db_path: str,
               table: str='PreprocessedFiles') -> None:
    """
    Fetch kernels from a content database.

    Arguments:
        db_path (str): Output path.
        in_db_path (str): Input path.
        table (str, optional): Table to fetch from.
    """
    odb = dbutil.connect(db_path)
    idb = dbutil.connect(in_db_path)
    ic = idb.cursor()

    ic.execute('SELECT id,contents FROM {}'.format(table))
    rows = ic.fetchall()

    for id, contents in rows:
        kernels = clutil.get_cl_kernels(contents)
        ids = [clgen.checksum_str(kernel) for kernel in kernels]
        # print("{} kernels in {}".format(len(kernels), id))
        for kid, kernel in zip(ids, kernels):
            oc = odb.cursor()
            oc.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                       (kid, kernel))
            odb.commit()
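
All of the examples below assume a small dbutil wrapper around SQLite and a common schema. That wrapper is not reproduced here; the following is a minimal sketch inferred from the INSERT statements in these examples (the GitHub-specific ContentMeta/Repositories tables and the Bytecodes table are omitted), and the real dbutil.create_db() may differ.

# Minimal sketch only: column names and order are inferred from the
# INSERT statements in these examples (id/contents for ContentFiles;
# id/status/contents for PreprocessedFiles, per Examples 17 and 22).
import sqlite3

def create_db_sketch(path: str) -> None:
    db = sqlite3.connect(path)
    c = db.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS ContentFiles ("
              "id TEXT PRIMARY KEY, contents TEXT)")
    c.execute("CREATE TABLE IF NOT EXISTS PreprocessedFiles ("
              "id TEXT PRIMARY KEY, status INTEGER, contents TEXT)")
    db.commit()
    db.close()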
Example 2
def graph_bc_lc(db_path: str) -> None:
    """
    Plot distribution of bytecode line counts.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    out_path = fs.path(IMG_DIR, 'bc_lcs.png')
    print('graph', out_path, '...')
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute("SELECT contents FROM Bytecodes")
    ocl = c.fetchall()
    ocl_lcs = [len(decode(x[0]).split('\n')) for x in ocl]

    # Filter range
    data = [x for x in ocl_lcs if x < 500]

    sns.distplot(data, bins=20, kde=False)
    plt.xlabel('Line count')
    plt.ylabel('Number of Bytecode files')
    plt.title('Distribution of Bytecode lengths')
    plt.savefig(out_path)
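
Example 2 relies on a decode() helper that is not shown. A plausible stand-in, assuming stored bytecode may arrive as either bytes or str:

def decode(blob) -> str:
    # Hypothetical stand-in for the decode() helper used above: normalise
    # a stored blob to text before counting lines. The real helper is not
    # shown in these examples.
    return blob.decode('utf-8') if isinstance(blob, bytes) else str(blob)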
Example 3
def process_cl_file(db_path: str, path: str) -> None:
    """
    Process OpenCL file.

    Arguments:
        db_path (str): Path to output database.
        path (str): Path to input file.

    Raises:
        FetchError: In case of IO error.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    log.debug("fetch {path}".format(path=fs.abspath(path)))
    try:
        contents = inline_fs_headers(path, [])
    except IOError:
        raise FetchError(
            "cannot read file '{path}'".format(path=fs.abspath(path)))
    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (path, contents))

    db.commit()
    c.close()
Example 4
def fetch_fs(db_path: str, paths: list=[]) -> None:
    """
    Fetch from a list of files.

    Arguments:
        db_path (str): Output dataset.
        paths (str[]): List of file paths.
    """
    paths = clgen.files_from_list(paths)  # expand directories

    db = dbutil.connect(db_path)
    c = db.cursor()

    for path in paths:
        log.debug("fetch", path)
        try:
            contents = inline_fs_headers(path, [])
        except IOError:
            db.commit()
            raise FetchError(
                "cannot read file '{path}'".format(path=fs.abspath(path)))
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (path, contents))

    db.commit()
Example 5
def graph_ocl_stars(db_path: str) -> None:
    """
    Plot distribution of stargazers per file.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    out_path = fs.path(IMG_DIR, 'ocl_stars.png')
    print('graph', out_path, '...')
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT stars FROM ContentMeta LEFT JOIN Repositories '
              'ON ContentMeta.repo_url=Repositories.url')
    stars = [x[0] for x in c.fetchall()]

    # Filter range
    data = [x for x in stars if x < 50]

    sns.distplot(data, bins=20, kde=False)
    plt.xlabel('GitHub Stargazer count')
    plt.ylabel('Number of files')
    plt.title('Stargazers per file')
    plt.savefig(out_path)
Example 6
def print_bytecode_features(db_path: str) -> None:
    """
    Print Bytecode features.

    Arguments:
        db_path: Path to dataset.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT sha,contents FROM Bytecodes')
    query = c.fetchall()

    uniq_features = set()
    for row in query:
        sha, contents = row

        features = bytecode_features(contents)
        # Add the table key
        features['sha'] = sha
        for key in features.keys():
            uniq_features.add(key)

    log.info('Features:')
    for feature in uniq_features:
        log.info('        ', feature)
Example 7
def _static_features(kernels_db: str) -> None:
    log.verbose("Static feature encoding")
    db = dbutil.connect(kernels_db)
    c = db.cursor()
    c.execute("SELECT id,contents FROM PreprocessedFiles WHERE status=0")
    for row in list(c.fetchall()):
        id, contents = row
        c.execute("DELETE FROM PreprocessedFiles WHERE id=?", (id,))
        for i, kernel in enumerate(get_cl_kernels(contents)):
            features = get_kernel_features(kernel)
            kid = "{}-{}".format(id, i)
            if len(features) == 8:
                log.verbose("features", kid)
                feature_str = ("/* {:10} {:10} {:10} {:10} {:10} {:10}"
                               "{:10.3f} {:10.3f} */".format(
                                   int(features[0]),
                                   int(features[1]),
                                   int(features[2]),
                                   int(features[3]),
                                   int(features[4]),
                                   int(features[5]),
                                   features[6],
                                   features[7]))
                newsource = feature_str + '\n' + kernel
                c.execute("""
                    INSERT INTO PreprocessedFiles (id,contents,status)
                    VALUES (?,?,?)
                """, (kid, newsource, 0))
            else:
                log.verbose("ignored", kid)
    c.close()
    db.commit()
Example 8
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()
                sample = self.queue.get(timeout=60)

                kernels = clutil.get_cl_kernels(sample)
                ids = [crypto.sha1_str(k) for k in kernels]

                if self.sampler_opts["static_checker"]:
                    preprocess_opts = {
                        "use_shim": False,
                        "use_gpuverify": self.sampler_opts["gpuverify"]
                    }
                    pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                          for k in kernels]

                db = dbutil.connect(self.db_path)
                c = db.cursor()

                # insert raw samples
                for kid, src in zip(ids, kernels):
                    dbutil.sql_insert_dict(c, "ContentFiles",
                                           {"id": kid, "contents": src},
                                           ignore_existing=True)

                # insert preprocessed samples
                if self.sampler_opts["static_checker"]:
                    for kid, (status, src) in zip(ids, pp):
                        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                            "id": kid, "status": status, "contents": src
                        }, ignore_existing=True)

                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
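
The termination test used by this consumer is not shown in these examples. A plausible sketch of term_condition(), assuming sampling stops once progress() reaches the same max_i target used for the progress bar:

    def term_condition(self) -> bool:
        # Sketch only, not the real implementation: assume sampling is
        # finished once the progress counter reaches the max_i target.
        return self.progress() >= self.max_i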
Example 9
def remove_bad_preprocessed(db_path: str) -> None:
    """
    Remove all ugly and bad contents from PreprocessedFiles table.

    Arguments:
        db_path (str): Dataset.
    """
    original_size = fs.du(db_path, human_readable=False)
    original_size_human_readable = fs.du(db_path, human_readable=True)
    log.info("vacuuming", original_size_human_readable, "database")
    sys.stdout.flush()

    # Remove contents from bad or ugly preprocessed files.
    db = dbutil.connect(db_path)
    c = db.cursor()
    c.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
              "WHERE status=1 OR status=2")
    db.commit()
    c.close()

    c = db.cursor()
    c.execute("VACUUM")
    db.commit()
    c.close()

    new_size = fs.du(db_path, human_readable=False)
    new_size_human_readable = fs.du(db_path, human_readable=True)
    reduction_ratio = (1 - (new_size / original_size)) * 100
    log.info("done. new size {}. ({:.0f}% reduction)".format(
        new_size_human_readable, reduction_ratio),
             sep=".")
Example 10
    def test_remove_preprocessed(self):
        tmpdb = 'test_remove_preprocessed.db'
        fs.cp(tests.db_path('10-kernels-preprocessed'), tmpdb)

        self.assertEqual(8, dbutil.num_good_kernels(tmpdb))
        db = dbutil.connect(tmpdb)
        self.assertFalse(dbutil.is_modified(db))
        db.close()

        dbutil.remove_preprocessed(tmpdb)

        self.assertEqual(0, dbutil.num_good_kernels(tmpdb))

        db = dbutil.connect(tmpdb)
        self.assertTrue(dbutil.is_modified(db))
        db.close()

        fs.rm(tmpdb)
Example 11
def process_sample_file(db_path: str, sample_path: str, first_only: bool=False,
                        max_kernel_len: int=5000, quiet: bool=False) -> None:
    """
    Fetch from a CLgen sample file.

    Arguments:
        db_path (str): Output path.
        sample_path (str): Sample path.
        first_only (bool, optional): If True, only fetch the first kernel in
            sample.
        max_kernel_len (int, optional): Maximum kernel length.
        quiet (bool, optional): If True, suppress per-kernel progress output.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    with open(sample_path) as infile:
        sample = infile.read()

    i = 0
    tail = 0
    offset = len('__kernel void ')
    while True:
        if not quiet:
            print('\r\033[Kkernel', i, end='')
            sys.stdout.flush()

        # Find the starting index of the next kernel.
        tail = sample.find('__kernel void ', tail)

        # If we didn't find another kernel, stop.
        if tail == -1:
            break

        # Find the end index of this kernel.
        head = clutil.get_cl_kernel_end_idx(sample, start_idx=tail,
                                            max_len=max_kernel_len)

        # Look for other ends
        end = sample.find('__kernel void ',
                          tail + offset, tail + offset + max_kernel_len)
        head = min(end, head) if end != -1 else head

        kernel = sample[tail:head]
        id = clgen.checksum_str(kernel)
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (id, kernel))
        tail = head
        i += 1
        if first_only:
            break
    if not quiet:
        print()
    db.commit()
    c.close()
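
A small walk-through of the scanning loop above on an inline two-kernel sample. Only the fallback path is exercised here (clutil.get_cl_kernel_end_idx() is assumed to return the index just past the kernel's closing brace and is not needed for this illustration):

# Illustrative only: the second '__kernel void ' occurrence caps the
# extracted span, mirroring the "Look for other ends" fallback above.
sample = "__kernel void a(global int* x) { x[0] = 1; }\n" \
         "__kernel void b(global int* x) { x[0] = 2; }\n"
tail = sample.find('__kernel void ')                    # start of kernel a
end = sample.find('__kernel void ', tail + len('__kernel void '))
print(sample[tail:end].strip())                         # kernel a's source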
Example 12
    def _generate_kernel_corpus(self) -> list:
        """ dump all kernels into a list in a random order """
        db = dbutil.connect(self.contentcache["kernels.db"])
        c = db.cursor()

        # if preserving order, order by line count; else, order randomly
        orderby = "LC(contents)" if self.opts["preserve_order"] else "RANDOM()"

        c.execute("SELECT PreprocessedFiles.Contents FROM PreprocessedFiles "
                  "WHERE status=0 ORDER BY {orderby}".format(orderby=orderby))

        return [row[0] for row in c.fetchall()]
Example 13
    def contentfiles(self) -> Iterable[str]:
        """
        Return an iterator over all un-processed samples.

        Returns
        -------
        Iterable[str]
            Samples.
        """
        db = dbutil.connect(self.contentcache["kernels.db"])
        c = db.cursor()
        query = c.execute("SELECT Contents FROM ContentFiles")
        for row in query.fetchall():
            yield row[0]
Example 14
    def _generate_kernel_corpus(self) -> str:
        """ dump all kernels into a string in a random order """
        db = dbutil.connect(self.contentcache["kernels.db"])
        c = db.cursor()

        # if preserving order, order by line count; else, order randomly
        orderby = "LC(contents)" if self.opts["preserve_order"] else "RANDOM()"

        c.execute("SELECT PreprocessedFiles.Contents FROM PreprocessedFiles "
                  "WHERE status=0 ORDER BY {orderby}".format(orderby=orderby))

        # If file separators are requested, insert EOF markers between files
        sep = '\n\n// EOF\n\n' if self.opts["eof"] else '\n\n'

        return sep.join(row[0] for row in c.fetchall())
Example 15
def preprocessed_kernels(corpus: Corpus) -> Iterable[str]:
    """
    Return an iterator over all preprocessed kernels.

    Arguments:
        corpus (Corpus): Corpus.

    Returns:
        sequence of str: Kernel sources.
    """
    assert (isinstance(corpus, Corpus))
    db = dbutil.connect(corpus.contentcache["kernels.db"])
    c = db.cursor()
    query = c.execute("SELECT Contents FROM PreprocessedFiles WHERE status=0")
    for row in query.fetchall():
        yield row[0]
Example 16
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()

                # Block while waiting for a new sample to come in:
                sample = self.queue.get(timeout=120).strip()

                # Compute the sample ID:
                kid = crypto.sha1_str(sample)

                # Add the new sample to the database:
                db = dbutil.connect(self.db_path)
                c = db.cursor()
                dbutil.sql_insert_dict(c,
                                       "ContentFiles", {
                                           "id": kid,
                                           "contents": sample
                                       },
                                       ignore_existing=True)
                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
Example 17
def _preprocess_db_worker(job: dict) -> None:
    """Database worker thread"""
    db_path = job["db_in"]
    db_index_range = job["db_index_range"]
    outpath = job["json_out"]
    log.debug("worker", os.getpid(), outpath)

    db = dbutil.connect(db_path)
    c = db.cursor()
    split_start, split_end = db_index_range
    split_size = split_end - split_start

    # get the files to preprocess
    c.execute('SELECT id,contents FROM ContentFiles LIMIT {} OFFSET {}'.format(
        split_size, split_start))

    with open(outpath, 'wb') as outfile:
        for row in c.fetchall():
            id, contents = row

            # Get checksum of cached file:
            c.execute('SELECT id FROM PreprocessedFiles WHERE id=?', (id, ))
            result = c.fetchone()
            cached_id = result[0] if result else None

            # Check that file is modified:
            if id != cached_id:
                try:
                    # Try and preprocess it:
                    contents = preprocess(contents, id)
                    status = 0
                except BadCodeException as e:
                    contents = str(e)
                    status = 1
                except UglyCodeException as e:
                    contents = str(e)
                    status = 2

                # write result to json
                line = json.dumps([id, status, contents]).encode('utf-8')
                outfile.write(line)
                outfile.write('\n'.encode('utf-8'))

    c.close()
    db.close()
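
The job dictionaries consumed by this worker are built elsewhere and are not shown. A hypothetical sketch of how the index ranges might be split across workers:

import os

def make_preprocess_jobs(db_path: str, num_rows: int, num_workers: int,
                         cache_dir: str) -> list:
    # Hypothetical helper: the real job construction is not included in
    # these examples. Splits [0, num_rows) into contiguous ranges, one
    # per worker, each with its own JSON output file.
    split = (num_rows + num_workers - 1) // num_workers
    return [{
        "db_in": db_path,
        "db_index_range": (i * split, min((i + 1) * split, num_rows)),
        "json_out": os.path.join(cache_dir, "worker-{}.json".format(i)),
    } for i in range(num_workers)]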
Example 18
def stats_worker(db_path: str) -> list:
    """
    Generate dataset stats.
    """
    log.debug("stats worker ...")
    db = dbutil.connect(db_path)
    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) from ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', bigint(code_lc)))

    stats.append(('Content file line counts', seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(
        ('Number of good preprocessed files',
         bigint(nb_pp_files) + ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute('SELECT contents FROM PreprocessedFiles WHERE status=0')
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  bigint(pp_lc) + ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))

    stats.append(('Good preprocessed line counts', seq_stats(pp_lcs)))
    stats.append(('', ''))

    return stats
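
Example 18 uses bigint(), div() and seq_stats() helpers that are not shown. Minimal stand-ins for two of them, under the assumption that div() guards against empty datasets:

def div(x, y):
    # Assumed behaviour: guarded division so an empty table does not raise
    # ZeroDivisionError. The real helper may differ.
    return x / y if y else 0

def bigint(n) -> str:
    # Assumed behaviour: thousands-separated integer formatting.
    return '{:,}'.format(int(n))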
Example 19
def _scrape_github_for_files(db_path: str, github_username: str,
                             github_pw: str, github_token: str,
                             query_terms: List[str], file_is_interesting,
                             download_file_cb):
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    # fetch the repositories to iterate over
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            # do nothing unless the repo is new or modified
            if not _process_repo(g, db, repo):
                continue

            # iterate over the entire git tree of the repo's default branch
            # (usually 'master'). If a file ends with the .cl extension, check
            # to see if we already have it, else download it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch, recursive=True).tree
                for f in tree_iterator:
                    if file_is_interesting(f):
                        try:
                            _process_file(g, github_token, db, repo, f,
                                          download_file_cb)
                        except Exception as e:
                            print(e)
                            errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    _print_counters()
    print("\n\ndone.")
    db.close()
Example 20
def train(db_path: str, out_path: str, **kwargs) -> None:
    """
    Generate corpus.

    Arguments:
        db_path (str): Dataset.
        out_path (str): Corpus path.
        **kwargs (dict): Additional arguments to create_corpus().
    """
    db = dbutil.connect(db_path)
    db.create_function("LC", 1, linecount)

    # auto-detect whether it's a GitHub repo
    kwargs['gh'] = dbutil.is_github(db)

    ret = create_corpus(db, out_path, **kwargs)
    if ret:
        sys.exit(ret)
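
Example 20 registers a linecount() function as the SQLite user function LC, which Examples 12 and 14 use in ORDER BY LC(contents). The helper itself is not shown; a one-line sketch of the assumed behaviour:

def linecount(text: str) -> int:
    # Assumed behaviour of the helper registered as the SQLite "LC"
    # function above: count the lines of a content file.
    return len(text.split('\n'))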
Example 21
def explore(db_path: str, graph: bool = False) -> None:
    """
    Run exploratory analysis on dataset.

    Arguments:
        db_path (str): Path to dataset.
        graph (bool, optional): Render graphs.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)

    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    if graph and not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # Worker process pool
    pool, jobs = Pool(processes=4), []
    if graph:
        jobs.append(pool.apply_async(graph_ocl_lc, (db_path, )))
        # TODO: If GH dataset:
        # jobs.append(pool.apply_async(graph_ocl_stars, (db_path,)))
    future_stats = pool.apply_async(stats_worker, (db_path, ))

    # Wait for jobs to finish
    [job.wait() for job in jobs]

    # Print stats
    print()
    stats = future_stats.get()
    maxlen = max([len(x[0]) for x in stats])
    for stat in stats:
        k, v = stat
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        elif v == '':
            print(k)
        else:
            print()
Example 22
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()
Example 23
def preprocess_db(db_path: str) -> bool:
    """
    Preprocess database contents.

    Arguments:
        db_path (str): Path to database.

    Returns:
        bool: True if modified, false if no work needed.
    """
    db = dbutil.connect(db_path)

    modified = dbutil.is_modified(db)
    if modified:
        preprocess_contentfiles(db_path)
        dbutil.set_modified_status(db, modified)
        return True
    else:
        return False
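
A hypothetical invocation showing the intended use of the modified flag (the path is illustrative):

# Illustrative usage only: re-run preprocessing only when the database has
# changed since the last pass.
if preprocess_db("kernels.db"):
    print("preprocessed new content")
else:
    print("nothing to do")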
Example 24
def test_insert():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

    dbutil.sql_insert_dict(c, "ContentFiles", {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "a",
        "status": 0,
        "contents": "bar"
    })
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "b",
        "status": 1,
        "contents": "car"
    })

    db.commit()
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0
Example 25
    def preprocessed(self, status: int=0) -> Iterable[str]:
        """
        Return an iterator over all preprocessed kernels.

        Parameters
        ----------
        status : int, optional
            Pre-processed status, {0, 1, 2} for {good, bad, ugly}.

        Returns
        -------
        Iterable[str]
            Sources.
        """
        db = dbutil.connect(self.contentcache["kernels.db"])
        c = db.cursor()
        query = c.execute(
            "SELECT Contents FROM PreprocessedFiles WHERE status={status}"
            .format(**vars()))
        for row in query.fetchall():
            yield row[0]
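
Since status is an int in Example 25, the .format(**vars()) interpolation is safe, but an equivalent parameterised form is more idiomatic sqlite3. A drop-in alternative for the query construction above:

        # Drop-in alternative to the query above: use a sqlite3
        # placeholder instead of str.format().
        query = c.execute(
            "SELECT Contents FROM PreprocessedFiles WHERE status=?",
            (status,))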
Example 26
def merge(outpath, inpaths=[]):
    if not fs.isfile(outpath):
        dbutil.create_db(outpath)
        log.info("created", outpath)

    db = dbutil.connect(outpath)

    if not inpaths:
        inpaths = get_all_sampler_datasets()

    for inpath in inpaths:
        log.info("merging from", inpath)
        c = db.cursor()
        c.execute("ATTACH '{}' AS rhs".format(inpath))
        c.execute("INSERT OR IGNORE INTO ContentFiles "
                  "SELECT * FROM rhs.ContentFiles")
        c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                  "SELECT * FROM rhs.PreprocessedFiles")
        c.execute("DETACH rhs")
        db.commit()

    explore.explore(outpath)
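
A hypothetical invocation of the merge above (paths are illustrative):

# Illustrative only: merge two sampler databases into a single output
# database, creating it if it does not already exist.
merge("all_kernels.db", ["sampler_a.db", "sampler_b.db"])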
Example 27
def get_clsmith_program(db_path: str,
                        header_paths: list=[
                            "~/clsmith/runtime", "~/clsmith/build"]) -> None:
    """
    Generate a program using CLSmith and add to dataset.

    Arguments:
        db_path (str): Path to output dataset.
        header_paths (str[]): Directories containing CLSmith headers.
    """
    global files_new_counter

    outputpath = 'CLProg.c'

    db = dbutil.connect(db_path)
    c = db.cursor()

    # TODO: CLSmith might not be in path
    cmd = ["CLSmith"]

    process = Popen(cmd)
    process.communicate()

    if process.returncode != 0:
        raise CLSmithException()

    with open(outputpath) as infile:
        contents = infile.read()

    contents = inline_clsmith_headers(contents, header_paths)

    sha = sha1(contents.encode('utf-8')).hexdigest()

    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (sha, contents))
    db.commit()
    db.close()
    files_new_counter += 1
    print_clsmith_counters()
Example 28
def preprocess_db(db_path: str, **preprocess_opts) -> bool:
    """
    Preprocess database contents.

    Parameters
    ----------
    db_path : str
        Path to database.

    Returns
    -------
    bool
        True if modified, false if no work needed.
    """
    db = dbutil.connect(db_path)

    modified = dbutil.is_modified(db)
    if modified:
        _preprocess_db(db_path, **preprocess_opts)
        dbutil.set_modified_status(db, modified)
        return True
    else:
        return False
Example 29
def clsmith(db_path: str, target_num_kernels: int) -> None:
    """
    Generate kernels using CLSmith.

    Arguments:
        db_path (str): Path to dataset.
        target_num_kernels (int): Number of kernels to generate.
    """
    global errors_counter

    print('generating', target_num_kernels, 'kernels to', db_path)

    db = dbutil.connect(db_path)
    c = db.cursor()
    c.execute('SELECT Count(*) FROM ContentFiles')
    num_kernels = c.fetchone()[0]
    while num_kernels < target_num_kernels:
        get_clsmith_program(db_path)
        c.execute('SELECT Count(*) FROM ContentFiles')
        num_kernels = c.fetchone()[0]

    print_counters()
    print("\n\ndone.")
    db.close()
Example 30
def github(db_path: str, github_username: str, github_pw: str,
           github_token: str) -> None:
    """
    Download all of the OpenCL on GitHub (!)

    Shortcomings of this approach:
        * Only includes exclusively OpenCL files, no inline strings.
        * Occasionally (< 1%) can't find headers to include.

    Arguments:
        db_path (str): Dataset path.
        github_username (str): Authorization.
        github_pw (str): Authorization.
        github_token (str): Authorization.
    """
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    handle_repo = partial(process_repo, g, db)

    # fetch the repositories to iterate over. Since opencl isn't
    # treated as a first-class language by GitHub, we can't use the
    # 'language=' keyword for queries, so instead we cast a much
    # wider net and filter the results afterwards.
    query_terms = [
        'opencl',
        'cl',
        'khronos',
        'gpu',
        'gpgpu',
        'cuda',
        'amd',
        'nvidia',
        'heterogeneous'
    ]
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            repo_modified = handle_repo(repo)

            # do nothing unless the repo is new or modified
            if not repo_modified:
                continue

            handle_file = partial(process_file, g, github_token, db, repo)

            # iterate over the entire git tree of the repo's default
            # branch (usually 'master'). If a file ends with the .cl
            # extension, check to see if we already have it, else download
            # it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch, recursive=True).tree
                for f in tree_iterator:
                    try:
                        handle_file(f)
                    except Exception:
                        errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    print_counters()
    print("\n\ndone.")
    db.close()