Example #1
def train(db_path: str, out_path: str, **kwargs) -> None:
    """
    Generate corpus.

    Arguments:
        db_path (str): Path to dataset.
        out_path (str): Path to output corpus.
        **kwargs (dict): Additional arguments to create_corpus().
    """
    db = dbutil.connect(db_path)
    db.create_function("LC", 1, linecount)

    # auto-detect whether the dataset was mined from GitHub
    kwargs['gh'] = dbutil.is_github(db)

    ret = create_corpus(db, out_path, **kwargs)
    if ret:
        sys.exit(ret)
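
A minimal invocation sketch; the paths are placeholders, and the keyword argument is purely illustrative, since the options accepted by create_corpus() are not listed in this example:

# hypothetical call: extra keyword arguments are forwarded to create_corpus()
train("data/kernels.db", "corpus/", vocabulary="char")
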
Example #2
def explore(db_path: str, graph: bool = False) -> None:
    """
    Run exploratory analysis on dataset.

    Arguments:
        db_path (str): Path to dataset.
        graph (bool, optional): Render graphs.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)

    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    if graph and not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # Worker process pool
    pool, jobs = Pool(processes=4), []
    if graph:
        jobs.append(pool.apply_async(graph_ocl_lc, (db_path, )))
        # TODO: If GH dataset:
        # jobs.append(pool.apply_async(graph_ocl_stars, (db_path,)))
    future_stats = pool.apply_async(stats_worker, (db_path, ))

    # Wait for jobs to finish
    for job in jobs:
        job.wait()

    # Print stats
    print()
    stats = future_stats.get()
    maxlen = max(len(k) for k, _ in stats)
    for k, v in stats:
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        else:
            print()  # ('', '') pairs act as blank separator lines
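
The worker-pool pattern above reduces to a small standalone sketch using only the standard library; stats_worker_stub() is a stand-in for the real stats_worker():

from multiprocessing import Pool

def stats_worker_stub(db_path):
    # stand-in for stats_worker(), which analyses the dataset at db_path
    return [("Number of content files", "1,204")]

if __name__ == "__main__":
    pool = Pool(processes=4)
    future = pool.apply_async(stats_worker_stub, ("data/kernels.db",))
    pool.close()
    print(future.get())  # blocks until the worker finishes
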
Example #3
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    """
    Import repositories from a directory of git clones into a GitHub dataset.

    Arguments:
        db_path (Path): Path to dataset.
        indir (Path): Directory containing one git clone per subdirectory.
        lang (clgen.Language): Language of the source files to import.
    """
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        # get the author date of the repository's most recent commit
        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True,
            universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Skip repositories whose timestamp is unchanged. The check is
        # currently disabled, so every repository is re-imported:
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        # replace any stale record for this repository
        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        # enumerate source files with extensions matching the language
        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True,
            universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not files:
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non-UTF-8 file", path)

        db.commit()
        c = db.cursor()
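
The shell pipeline that reads the last commit date can also be written without shell=True; a sketch using an equivalent git invocation (the helper name is ours):

import subprocess
import dateutil.parser

def last_commit_date(gitdir: str):
    # `git log -1 --format=%ai` prints HEAD's author date, matching the
    # rev-list pipeline above but without shell string interpolation
    out = subprocess.check_output(
        ["git", "--git-dir", gitdir, "log", "-1", "--format=%ai"],
        universal_newlines=True)
    try:
        return dateutil.parser.parse(out.strip())
    except ValueError:
        return None
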
Example #4
def explore(db_path: str) -> None:
    """
    Run exploratory analysis on dataset.

    Parameters
    ----------
    db_path : str
        Path to dataset.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)

    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) from ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', _bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', _bigint(code_lc)))

    stats.append(('Content file line counts', _seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = _safe_div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(
        ('Number of preprocessed files',
         _bigint(nb_pp_files) + ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = _safe_div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(
        ('Number of good preprocessed files',
         _bigint(nb_pp_files) + ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute('SELECT contents FROM PreprocessedFiles WHERE status=0')
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = _safe_div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  _bigint(pp_lc) + ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))

    stats.append(('Good preprocessed line counts', _seq_stats(pp_lcs)))
    stats.append(('', ''))

    # Print stats
    print()
    maxlen = max(len(k) for k, _ in stats)
    for k, v in stats:
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        else:
            print()  # ('', '') pairs act as blank separator lines
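
The helpers _bigint() and _safe_div() are referenced but not shown; plausible implementations consistent with how they are used above (assumptions, the real helpers may differ):

import locale

def _bigint(n) -> str:
    # locale-aware thousands grouping, e.g. 1204 -> '1,204'
    # (assumes a locale has been set, as explore() does)
    return locale.format_string('%d', n, grouping=True)

def _safe_div(a, b):
    # avoid ZeroDivisionError on empty datasets
    return a / b if b else 0
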
Example #5
def test_is_github(self):
    self.assertFalse(dbutil.is_github(tests.db('empty')))
    self.assertTrue(dbutil.is_github(tests.db('empty-gh')))
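
For context, a sketch of what dbutil.is_github() might check. The Repositories table is assumed from Example #3; the real implementation may differ:

def is_github(db) -> bool:
    # GitHub datasets are assumed to carry repository metadata tables
    c = db.cursor()
    c.execute("SELECT name FROM sqlite_master "
              "WHERE type='table' AND name='Repositories'")
    return c.fetchone() is not None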