Example #1
    def meta(self) -> dict:
        """
        Get trained model metadata.

        Format spec: https://github.com/ChrisCummins/clgen/issues/25

        Returns:
            dict: Metadata.
        """
        # checksum corpus and model cache files. Paths are relative to cache
        # root.
        cache_root_re = r'^' + cache.ROOT + '/'
        corpus_files = dict(
            (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
            for x in fs.ls(self.corpus.cache.path, abspaths=True))
        model_files = dict(
            (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
            for x in fs.ls(self.cache.path, abspaths=True))

        contents = corpus_files.copy()
        contents.update(model_files)

        _meta = deepcopy(self.opts)
        _meta["version"] = clgen.version()
        _meta["date_packaged"] = labtime.nowstr()
        _meta["corpus"] = self.corpus.meta,
        _meta["contents"] = contents

        return _meta
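The metadata returned above is typically persisted to a META file in the model cache, which Examples #21 and #27 read back with jsonutil.read_file. A minimal round-trip sketch; `model` is assumed to be a trained clgen model and the output filename is illustrative:

import json

# Illustrative only: serialize the dict produced by meta() to disk so it
# can later be reloaded when rebuilding a Model from its cache directory.
with open("META", "w") as outfile:
    json.dump(model.meta(), outfile, indent=2)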
Example #2
def test_cli():
    fs.rm("kernels.db")
    cli.main("db init kernels.db".split())
    assert fs.exists("kernels.db")

    corpus_path = tests.archive("tiny", "corpus")
    cli.main("db explore kernels.db".split())
    cli.main(f"fetch fs kernels.db {corpus_path}".split())
    cli.main("preprocess kernels.db".split())
    cli.main("db explore kernels.db".split())

    fs.rm("kernels_out")
    cli.main("db dump kernels.db -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) >= 1

    fs.rm("kernels.cl")
    cli.main("db dump kernels.db kernels.cl --file-sep --eof --reverse".split())
    assert fs.isfile("kernels.cl")

    fs.rm("kernels_out")
    cli.main("db dump kernels.db --input-samples -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) == 250

    fs.rm("kernels.db")
    fs.rm("kernels_out")
Example #3
def get_all_sampler_datasets():
    datasets = []
    sampledirs = []
    for versioncache in fs.ls(fs.path("~/.cache/clgen"), abspaths=True):
        samplerdir = fs.path(versioncache, "sampler")
        if fs.isdir(samplerdir):
            sampledirs += fs.ls(samplerdir, abspaths=True)

    for samplerdir in sampledirs:
        inpath = fs.path(samplerdir, "kernels.db")
        if fs.isfile(inpath):
            datasets.append(inpath)
    return datasets
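A hypothetical usage sketch (Example #8 below is a more configurable variant of the same helper):

# Illustrative only: print the path of every cached sampler database found.
for db_path in get_all_sampler_datasets():
    print(db_path)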
Example #4
def files_from_list(paths: list) -> list:
    """
    Return a list of all file paths from a list of files or directories.

    For each path in the input: if it is a file, return it; if it is a
    directory, return a list of files in the directory.

    Arguments:
        paths (list of str): List of file and directory paths.

    Returns:
        list of str: Absolute file paths.

    Raises:
        File404: If any of the paths do not exist.
    """
    ret = []
    for path in paths:
        if fs.isfile(path):
            ret.append(fs.abspath(path))
        elif fs.isdir(path):
            ret += [
                f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)
            ]
        else:
            raise File404(path)
    return ret
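A hypothetical invocation, with invented paths, showing how files and directories mix in the input:

# Illustrative only: expands to the absolute path of kernel.cl plus every
# file found recursively under corpus_dir/.
paths = files_from_list(["kernel.cl", "corpus_dir"])
print(len(paths))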
Example #5
    def _main(infiles: List[TextIO], dir_mode: bool, summarise: bool,
              fatal_errors: bool, use_shim: bool, quiet: bool,
              no_header: bool) -> None:
        from clgen import features

        input_paths = [infile.name for infile in infiles]

        def features_dir(csv_path):
            return fs.basename(fs.dirname(csv_path))

        if summarise:
            stats = [features.summarize(f) for f in input_paths]

            print('dataset', *list(stats[0].keys()), sep=',')
            for path, stat in zip(input_paths, stats):
                print(features_dir(path), *list(stat.values()), sep=',')
            return

        if dir_mode:
            trees = [fs.ls(d, abspaths=True, recursive=True) for d in input_paths]
            paths = [item for sublist in trees for item in sublist]
        else:
            paths = [fs.path(f) for f in input_paths]

        features.files(paths, fatal_errors=fatal_errors, quiet=quiet,
                       use_shim=use_shim, header=not no_header)
Example #6
def _shorthash(hash: str, cachedir: str, min_len: int = 7) -> str:
    """
    Truncate the hash to a shorter length, while maintaining uniqueness.

    This returns the shortest hash required to uniquely identify all elements
    in the cache.

    Parameters
    ----------
    hash : str
        Hash to truncate.
    cachedir : str
        Path to cache.
    min_len : int, optional
        Minimum length of hash to try.

    Returns
    -------
    str
        Truncated hash.
    """
    for shorthash_len in range(min_len, len(hash)):
        entries = [x[:shorthash_len] for x in fs.ls(cachedir)]
        if len(entries) == len(set(entries)):
            break

    return hash[:shorthash_len]
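To see the truncation logic in isolation, here is a self-contained variant that operates on an in-memory list of digests instead of fs.ls(cachedir); the helper name and sample values are invented for illustration:

def shorthash_demo(hash: str, entries: list, min_len: int = 7) -> str:
    # Same loop as _shorthash(), but over a plain list of digests.
    for shorthash_len in range(min_len, len(hash)):
        prefixes = [x[:shorthash_len] for x in entries]
        if len(prefixes) == len(set(prefixes)):  # all prefixes unique?
            break
    return hash[:shorthash_len]

entries = ["d41d8cd98f00b204e980", "d41d8cd11aa3c2f01b77"]
print(shorthash_demo(entries[0], entries))  # "d41d8cd9": unique at 8 chars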
Example #7
    def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
        """creates and caches kernels.db"""
        log.debug("creating database")

        # create a database and put it in the cache
        tmppath = fs.path(self.contentcache.path, "kernels.db.tmp")
        dbutil.create_db(tmppath)
        self.contentcache["kernels.db"] = tmppath

        # get a list of files in the corpus
        filelist = [
            f for f in fs.ls(path, abspaths=True, recursive=True)
            if fs.isfile(f)
        ]

        # import files into database
        fetch.fetch_fs(self.contentcache["kernels.db"], filelist)

        # preprocess files
        preprocess.preprocess_db(self.contentcache["kernels.db"])

        # encode kernel db
        encode(self.contentcache["kernels.db"], encoding)

        # print database stats
        explore.explore(self.contentcache["kernels.db"])
Example #8
def get_all_sampler_datasets(all_clgen_versions: bool=True) -> list:
    if all_clgen_versions:
        versiondirs = fs.ls(fs.path("~/.cache/clgen"), abspaths=True)
    else:
        versiondirs = [fs.path("~/.cache/clgen", clgen.version())]

    versiondirs = [v for v in versiondirs if fs.isdir(v, "sampler")]

    datasets = []
    for versiondir in versiondirs:
        for samplerdir in fs.ls(fs.path(versiondir, "sampler"), abspaths=True):
            inpath = fs.path(samplerdir, "kernels.db")
            if fs.isfile(inpath):
                datasets.append(inpath)

    return datasets
Example #9
def main(model,
         sampler,
         print_file_list=False,
         print_corpus_dir=False,
         print_model_dir=False,
         print_sampler_dir=False) -> None:
    """
    Main entry point for clgen.

    Arguments:
        model (str): Path to model.
        sampler (str): Path to sampler.
        print_file_list (bool, optional): If True, print cache files and exit.
        print_corpus_dir (bool, optional): If True, print cache path and exit.
        print_model_dir (bool, optional): If True, print cache path and exit.
        print_sampler_dir (bool, optional): If True, print cache path and exit.
    """
    import clgen.model
    import clgen.sampler
    from clgen import log

    model_json = load_json_file(model)
    model = clgen.model.from_json(model_json)

    sampler_json = load_json_file(sampler)
    sampler = clgen.sampler.from_json(sampler_json)

    # print cache paths
    if print_file_list:
        files = sorted(
            fs.ls(model.corpus.cache.path, abspaths=True, recursive=True) +
            fs.ls(model.cache.path, abspaths=True, recursive=True) +
            fs.ls(sampler.cache(model).path, abspaths=True, recursive=True))
        print('\n'.join(files))
        sys.exit(0)
    elif print_corpus_dir:
        print(model.corpus.cache.path)
        sys.exit(0)
    elif print_model_dir:
        print(model.cache.path)
        sys.exit(0)
    elif print_sampler_dir:
        print(sampler.cache(model).path)
        sys.exit(0)

    model.train()
    sampler.sample(model)
Example #10
    def __len__(self):
        """
    Get the number of entries in the cache.

    Returns:
        int: Number of entries in the cache.
    """
        return len(list(fs.ls(self.path)))
Example #11
    def __len__(self):
        """
        Get the number of entries in the cache.

        Returns:
            int: Number of entries in the cache.
        """
        return len(list(fs.ls(self.path)))
Example #12
    def __iter__(self):
        """
        Iterate over all cached files.

        Returns:
            iterable: Paths in cache.
        """
        for path in fs.ls(self.path, abspaths=True):
            yield path
Example #13
    def __iter__(self):
        """
    Iterate over all cached files.

    Returns:
        iterable: Paths in cache.
    """
        for path in fs.ls(self.path, abspaths=True):
            yield path
Example #14
def main():
    """
    Reduce all databases to oracle.
    """
    dbs = [migrate(_db.Database(path)) for path in
           fs.ls(experiment.DB_DEST, abspaths=True)
           if not re.search("oracle.db$", path)
           and re.search(".db$", path)]
    merge(fs.abspath(experiment.DB_DEST, "oracle.db"),
          dbs, experiment.ORACLE_PATH)
Example #15
    def test_ls_abspaths(self):
        fs.cp("tests/data/testdir", "/tmp/testdir")
        self._test(["/tmp/testdir/a",
                    "/tmp/testdir/b",
                    "/tmp/testdir/c",
                    "/tmp/testdir/d"],
                   fs.ls("/tmp/testdir", abspaths=True))
        self._test(["/tmp/testdir/a",
                    "/tmp/testdir/b",
                    "/tmp/testdir/c",
                    "/tmp/testdir/c/e",
                    "/tmp/testdir/c/f",
                    "/tmp/testdir/c/f/f",
                    "/tmp/testdir/c/f/f/i",
                    "/tmp/testdir/c/f/h",
                    "/tmp/testdir/c/g",
                    "/tmp/testdir/d"],
                   fs.ls("/tmp/testdir", recursive=True, abspaths=True))
        fs.rm("/tmp/testdir")
Example #16
    def ls(self, **kwargs):
        """
    List files in cache.

    Arguments:
        **kwargs: Keyword options to pass to fs.ls().

    Returns:
        iterable: List of files.
    """
        return fs.ls(self.path, **kwargs)
Example #17
    def ls(self, **kwargs):
        """
        List files in cache.

        Arguments:
            **kwargs: Keyword options to pass to fs.ls().

        Returns:
            iterable: List of files.
        """
        return fs.ls(self.path, **kwargs)
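A hypothetical usage of the cache's ls() wrapper, with keyword arguments forwarded to fs.ls() exactly as above (the `cache` object is assumed):

# Illustrative only: list every cached file by its absolute path.
for path in cache.ls(abspaths=True):
    print(path)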
Example #18
def main():
    """
    Reduce all databases to oracle.
    """
    dbs = [
        migrate(_db.Database(path))
        for path in fs.ls(experiment.DB_DEST, abspaths=True)
        if not re.search("oracle.db$", path) and re.search(".db$", path)
    ]
    merge(fs.abspath(experiment.DB_DEST, "oracle.db"), dbs,
          experiment.ORACLE_PATH)
Example #19
def test_ls_abspaths():
    fs.cp("labm8/data/test/testdir", "/tmp/testdir")
    assert fs.ls("/tmp/testdir", abspaths=True) == [
        "/tmp/testdir/a",
        "/tmp/testdir/b",
        "/tmp/testdir/c",
        "/tmp/testdir/d",
    ]
    assert fs.ls("/tmp/testdir", recursive=True, abspaths=True) == [
        "/tmp/testdir/a",
        "/tmp/testdir/b",
        "/tmp/testdir/c",
        "/tmp/testdir/c/e",
        "/tmp/testdir/c/f",
        "/tmp/testdir/c/f/f",
        "/tmp/testdir/c/f/f/i",
        "/tmp/testdir/c/f/h",
        "/tmp/testdir/c/g",
        "/tmp/testdir/d",
    ]
    fs.rm("/tmp/testdir")
Example #20
def test_ls_recursive():
    assert fs.ls("labm8/data/test/testdir", recursive=True) == [
        "a",
        "b",
        "c",
        "c/e",
        "c/f",
        "c/f/f",
        "c/f/f/i",
        "c/f/h",
        "c/g",
        "d",
    ]
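Taken together, Examples #19 and #20 pin down the fs.ls() contract: results are sorted, recursive listings are relative to the queried directory, and abspaths=True resolves them against it. A rough standard-library approximation of that behavior (an illustrative sketch, not labm8's actual implementation; it omits the single-file case shown in Example #35):

import os

def ls_sketch(root, abspaths=False, recursive=False):
    """Approximate fs.ls(): sorted names, relative to root when recursive."""
    if recursive:
        paths = []
        for dirpath, dirnames, filenames in os.walk(root):
            for name in list(dirnames) + list(filenames):
                paths.append(os.path.relpath(os.path.join(dirpath, name), root))
        paths = sorted(paths)
    else:
        paths = sorted(os.listdir(root))
    if abspaths:
        paths = [os.path.abspath(os.path.join(root, path)) for path in paths]
    return paths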
Example #21
def models() -> Iterator[Model]:
    """
    Iterate over all cached models.

    Returns
    -------
    Iterator[Model]
        An iterable over all cached models.
    """
    if fs.isdir(clgen.cachepath(), "model"):
        modeldirs = fs.ls(fs.path(clgen.cachepath(), "model"), abspaths=True)
        for modeldir in modeldirs:
            meta = jsonutil.read_file(fs.path(modeldir, "META"))
            model = Model.from_json(meta)
            yield model
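A hypothetical usage of the models() iterator, printing each cached model's hash (the same attribute Example #27 relies on):

# Illustrative only: assumes the clgen cache contains trained models.
for model in models():
    print(model.hash)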
Example #22
def _ExportProtos() -> None:
    proto_dir = pathlib.Path(FLAGS.proto_dir)

    assert proto_dir

    credentials = _GetMySqlCredentials()
    cnx = MySQLdb.connect(database='dsmith_04_opencl',
                          host='cc1',
                          user=credentials[0],
                          password=credentials[1])
    cursor = cnx.cursor()

    (proto_dir / 'testcases').mkdir(parents=True, exist_ok=True)
    (proto_dir / 'results').mkdir(parents=True, exist_ok=True)
    for program_id in FLAGS.program_ids:
        logging.info("Exporting OpenCL program %s", program_id)
        _ExportOpenCLResults(cursor, program_id, proto_dir)

    cursor.close()
    cnx.close()

    logging.info('Exported %d testcases and %d results',
                 len(fs.ls(proto_dir / 'testcases')),
                 len(fs.ls(proto_dir / 'results')))
Example #23
    def _create_kernels_db(self, path: str) -> None:
        """creates and caches kernels.db"""
        log.debug("creating database")

        # create a database and put it in the cache
        tmppath = self.contentcache.keypath("kernels.db.tmp")
        dbutil.create_db(tmppath)
        self.contentcache["kernels.db"] = tmppath

        # get a list of files in the corpus
        filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                    if fs.isfile(f)]

        # import files into database
        clgen.fetch(self.contentcache["kernels.db"], filelist)
Example #24
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()
Example #25
def gather():
    benchmarks = {
        "canny": {},
        "fdtd": {},
        "gol": {},
        "gaussian": {},
        "heat": {},
        "simple": {},
        "simplecomplex": {}
    }

    for benchmark in benchmarks:
        io.info("Benchmark %s" % benchmark)
        fs.cd("/home/chris/src/msc-thesis/scraps/05-12/kernels/%s" % benchmark)

        instcounts = []
        for file in fs.ls():
            instcounts.append(get_instcount(file))

        benchmarks[benchmark] = merge_counts(instcounts)

    return benchmarks
Example #27
        def _main() -> None:
            cache = clgen.cachepath()

            log.warning("Not Implemented: refresh corpuses")

            if fs.isdir(cache, "model"):
                cached_modeldirs = fs.ls(fs.path(cache, "model"), abspaths=True)
                for cached_modeldir in cached_modeldirs:
                    cached_model_id = fs.basename(cached_modeldir)
                    cached_meta = jsonutil.read_file(fs.path(cached_modeldir, "META"))

                    model = clgen.Model.from_json(cached_meta)

                    if cached_model_id != model.hash:
                        log.info(cached_model_id, '->', model.hash)

                        if fs.isdir(model.cache.path):
                            log.fatal("cache conflict", file=sys.stderr)

                        fs.mv(cached_modeldir, model.cache.path)

            log.warning("Not Implemented: refresh samplers")
Example #28
def test_ls_empty_dir():
    fs.mkdir("/tmp/labm8.empty")
    assert not fs.ls("/tmp/labm8.empty")
    fs.rm("/tmp/labm8.empty")
Example #29
def _dump_db(db,
             out_path: str,
             gh: bool = False,
             fileid: bool = False,
             reverse: bool = False,
             input_samples: bool = False,
             status: int = 0,
             eof: bool = False,
             dir: bool = False) -> None:
    """
    Dump database contents.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub.
    fileid : bool, optional
        Include file IDs.
    reverse : bool, optional
        Reverse ordering of output.
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status.
    eof : bool, optional
        Include EOF separators.
    dir : bool, optional
        Write output to directory.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'.format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = (
        '{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'.format(
            select=select,
            table=table,
            qualifier=qualifier,
            orderby=orderby,
            order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                raise clgen.UserError('directory already exists!')
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, kid_to_path(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
Example #30
    def test_ls_single_file(self):
        self._test(["a"], fs.ls("tests/data/testdir/a"))
Example #31
    def test_ls_bad_path(self):
        with self.assertRaises(OSError):
            fs.ls("/not/a/real/path/bro")
Example #32
    def test_ls_empty_dir(self):
        fs.mkdir("/tmp/labm8.empty")
        self._test([], fs.ls("/tmp/labm8.empty"))
        fs.rm("/tmp/labm8.empty")
Example #33
    def test_ls_recursive(self):
        self._test(["a", "b", "c", "c/e", "c/f", "c/f/f",
                    "c/f/f/i", "c/f/h", "c/g", "d"],
                   fs.ls("tests/data/testdir", recursive=True))
Example #34
    def test_ls(self):
        self._test(["a", "b", "c", "d"],
                   fs.ls("tests/data/testdir"))
Example #35
def test_ls_single_file():
    assert ["a"] == fs.ls("labm8/data/test/testdir/a")
Example #36
def test_ls_bad_path():
    with pytest.raises(OSError):
        fs.ls("/not/a/real/path/bro")
Example #37
def host_has_opencl():
    try:
        return system.is_mac() or len(fs.ls('/etc/OpenCL/vendors'))
    except FileNotFoundError:
        return False
Example #38
def create_corpus(db, out_path, gh=False, fileid=False, reverse=False,
                  input_samples=False, status=0, eof=False, dir=False):
    # Dump all the preprocessed OpenCL files
    print('creating DNN corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        print('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                print('fatal: directory already exists!', file=sys.stderr)
                return 1
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, sanitize_id(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
        return 0
    else:
        print('writing file', out_path)
        with open(out_path, 'w') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id))
                out.write(contents)
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n')
                else:
                    out.write('\n\n')
        return 0
Example #39
    def import_from_dir(self, indir: Path) -> None:
        """ import program sources from a directory """
        with Session() as s:
            start_num_progs = self.num_programs(s)

            def _save(proxies):
                # Create records from proxies:
                programs = [proxy.to_record(s) for proxy in proxies]

                # Filter duplicates in the set of new records:
                programs = dict(
                    (program.sha1, program) for program in programs).values()

                # Fetch a list of dupe keys already in the database:
                sha1s = [program.sha1 for program in programs]
                dupes = set(x[0] for x in s.query(Program.sha1).filter(
                    Program.sha1.in_(sha1s)))

                # Filter the list of records to import, excluding dupes:
                uniq = [
                    program for program in programs
                    if program.sha1 not in dupes
                ]

                # Import those suckas:
                s.add_all(uniq)
                s.commit()

                nprog, nuniq = len(programs), len(uniq)
                logging.info(f"imported {nuniq} of {nprog} unique programs")

            # Print a preamble message:
            paths = fs.ls(indir, abspaths=True)
            num_to_import = humanize.intcomma(len(paths))
            print(f"{Colors.BOLD}{num_to_import}{Colors.END} files are "
                  "to be imported.")

            bar = progressbar.ProgressBar(redirect_stdout=True)

            # The actual import loop:
            buf = []
            for i, path in enumerate(bar(paths)):
                buf.append(self.import_from_file(s, path))

                if len(buf) >= dsmith.DB_BUF_SIZE:
                    save_proxies_uniq_on(s, buf, "sha1")
                    buf = []
            save_proxies_uniq_on(s, buf, "sha1")

            # Report totals while the session is still open:
            num_imported = humanize.intcomma(
                self.num_programs(s) - start_num_progs)
            num_progs = humanize.intcomma(self.num_programs(s))
            print(f"All done! Imported {Colors.BOLD}{num_imported}{Colors.END} "
                  f"new {self} programs. You now have "
                  f"{Colors.BOLD}{num_progs}{Colors.END} {self} programs in the "
                  "database")
Example #40
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True,
            universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True,
            universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
Example #41
def create_corpus(db, out_path: str, gh: bool=False, fileid: bool=False,
                  reverse: bool=False, input_samples: bool=False, status: int=0,
                  eof: bool=False, dir: bool=False) -> int:
    """
    Create CLgen training corpus.

    Arguments:
        db (sqlite3.Connection): Dataset.
        out_path (str): Path to output.
        gh (bool, optional): Dataset is GitHub.
        fileid (bool, optional): Include file IDs.
        reverse (bool, optional): Reverse ordering of output.
        input_samples (bool, optional): If True, use un-preprocessed files.
        status (int, optional): Filter preprocess status.
        eof (bool, optional): Include EOF separators.
        dir (bool, optional): Write output to directory.
    """
    # Dump all the preprocessed OpenCL files
    print('creating DNN corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        print('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                print('fatal: directory already exists!', file=sys.stderr)
                return 1
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, sanitize_id(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
        return 0
    else:
        print('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
        return 0
Example #42
def test_ls():
    assert ["a", "b", "c", "d"] == fs.ls("labm8/data/test/testdir")