def install_pipfiles(cwd, names, env, out, err):
    """Convert each named Pipfile to a requirements.txt and pip-install it.

    cwd: base Path; names: iterable of Pipfile paths relative to cwd (falsy
    entries skipped); env: conda environment name; out/err: streams for
    subprocess output. Returns (True, b"") on success, or (False, data) with
    the combined stdout/stderr of the failing step.
    """
    for name in names:
        if not name:
            continue
        # Directory containing the Pipfile; pipenv runs with this as cwd.
        path = (cwd / name).parents[0]
        vprint(3, "Converting to requirements.txt: {}".format(path))
        # NOTE(review): the lock output is written to the *parent* of cwd,
        # not to ``path`` — looks suspicious; confirm against callers.
        requirements_txt = cwd.parents[0] / "requirements.txt"
        with open(str(requirements_txt), "wb") as outf:
            # `pipenv lock -r` prints pinned requirements to stdout, which
            # run_async_process redirects into outf.
            status, outdata, errdata = run_async_process(
                ". {}/etc/profile.d/conda.sh "
                "&& conda activate {} "
                "&& pipenv lock -r"
                .format(config.ANACONDA_PATH, env),
                outf, err, cwd=str(path)
            )
            data = b"##<>##\nOutput:\n" + outdata + b"\n##<>##Error:\n" + errdata
            if status != 0:
                return (False, data)
        # Install the freshly generated requirements.txt into the same env.
        result, data = install_requirements(
            requirements_txt.parents[0],
            ["requirements.txt"],
            env, out, err
        )
        if not result:
            return (False, data)
    return (True, b"")
Ejemplo n.º 2
0
def apply(session, status, skip_if_error, count, interval, reverse, check):
    """Extract markdown features"""
    conditions = [
        Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
        Cell.processed.op('&')(skip_if_error) == 0,
        Cell.cell_type == 'markdown',
    ]
    if interval:
        conditions.append(Cell.repository_id >= interval[0])
        conditions.append(Cell.repository_id <= interval[1])

    query = session.query(Cell).filter(*conditions)

    # Count-only mode: report the number of matching cells and stop.
    if count:
        print(query.count())
        return

    repo_order = (
        Cell.repository_id.desc() if reverse else Cell.repository_id.asc()
    )
    query = query.order_by(
        repo_order,
        Cell.notebook_id.asc(),
        Cell.index.asc(),
    )

    last_repository = None
    last_notebook = None
    for cell in query:
        if check_exit(check):
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        # Commit once per repository boundary to batch the work.
        if last_repository != cell.repository_id:
            session.commit()
            last_repository = cell.repository_id
            vprint(0, 'Processing repository: {}'.format(last_repository))
        if last_notebook != cell.notebook_id:
            last_notebook = cell.notebook_id
            vprint(1, 'Processing notebook: {}'.format(last_notebook))
        vprint(2, 'Processing cell: {}/[{}]'.format(cell.id, cell.index))
        outcome = process_markdown_cell(
            session, last_repository, last_notebook, cell, skip_if_error
        )
        vprint(2, outcome)
        status.count += 1
    session.commit()
Ejemplo n.º 3
0
def load_repository(session, notebook, repository_id):
    """Commit pending work and log when the current repository changes.

    Returns the repository id of ``notebook`` (unchanged when it matches
    ``repository_id``).
    """
    if repository_id == notebook.repository_id:
        return repository_id

    try:
        session.commit()
    except Exception as err:
        vprint(0, 'Failed to save modules from repository {} due to {}'.format(
            repository_id, err
        ))
    vprint(0, 'Processing repository: {}'.format(repository_id))
    return notebook.repository_id
Ejemplo n.º 4
0
def load_repository(session, cell, skip_repo, repository_id, repository,
                    archives):
    """Advance the repository context while iterating over cells.

    When ``cell`` belongs to a new repository, commit pending work, switch
    to that repository, and signal that archives must be reloaded.

    Returns (skip_repo, repository_id, repository, archives).
    """
    if repository_id != cell.repository_id:
        repository = cell.repository_obj
        # Session.commit() returns None, so the original
        # ``success, msg = session.commit()`` always raised TypeError.
        # Match the sibling load_repository implementations: try/except.
        try:
            session.commit()
        except Exception as err:
            vprint(
                0, 'Failed to save cells from repository {} due to {}'.format(
                    repository, err))

        vprint(0, 'Processing repository: {}'.format(repository))
        # "todo" tells the caller the archives still need to be loaded.
        return False, cell.repository_id, repository, "todo"

    return skip_repo, repository_id, repository, archives
Ejemplo n.º 5
0
def pos_apply(dispatches, retry_errors, retry_timeout, verbose):
    """Dispatch execution to other python versions"""
    by_exec = lambda item: item[1]
    ordered = sorted(dispatches, key=by_exec)
    # groupby requires the input sorted by the same key.
    for pyexec, group in groupby(ordered, key=by_exec):
        vprint(0, "Dispatching to {}".format(pyexec))
        flags = []
        if retry_errors:
            flags.append("-e")
        if retry_timeout:
            flags.append("-t")
        flags.append("-n")

        pending = [item[0] for item in group]
        # Invoke in batches of at most 20000 ids to bound argv size.
        while pending:
            batch, pending = pending[:20000], pending[20000:]
            invoke(pyexec, "-u", __file__, "-v", verbose, *(flags + batch))
def prepare_environment(
    session, env, mode, version, notebook_iter,
    mode_def, skip_env, notebook_exec_mode, dry_run, out, err
):
    """Install the execution environment, marking notebooks on failure.

    Returns True when the environment is ready (or skipped / dry run),
    False when installation failed.
    """
    prefix = "[DRY RUN] " if dry_run >= 4 else ""
    flavor = 'anaconda' if mode.anaconda else 'raw python'
    vprint(0, "{}Preparing {} environment for Python {}".format(
        prefix, flavor, version
    ))
    if dry_run >= 4:
        return True

    if skip_env or install_env(env, out, err):
        return True

    # Environment setup failed: record the failure bit on each notebook
    # (the doubled mode bit appears to be the error variant — see callers).
    for notebook, repository in notebook_iter:
        nmode = notebook_exec_mode(mode_def, notebook, repository)
        notebook.processed |= nmode.processed * 2
        session.add(notebook)
        session.commit()
    vprint(0, "Failed to prepare environment")
    return False
def install_requirements(cwd, names, env, out, err):
    """pip-install each named requirements file inside the conda env.

    Returns (True, b"") on success, or (False, data) with combined
    stdout/stderr of the first failing install.
    """
    for name in names:
        if not name:
            continue
        target = cwd / name
        vprint(3, "Installing {}".format(target))
        # Escape single quotes so the path survives shell quoting.
        quoted = str(target).replace("'", "'\\''")
        command = (
            ". {}/etc/profile.d/conda.sh "
            "&& conda activate {} "
            "&& pip install -r '{}'"
        ).format(config.ANACONDA_PATH, env, quoted)
        status, outdata, errdata = run_async_process(command, out, err)
        report = b"##<>##\nOutput:\n" + outdata + b"\n##<>##Error:\n" + errdata
        if status != 0:
            return (False, report)
    return (True, b"")
Ejemplo n.º 8
0
def load_notebook(session, cell, dispatches, repository, skip_repo,
                  skip_notebook, notebook_id, archives, checker):
    """Prepare a file-access checker for the notebook that owns ``cell``.

    Only does work when ``cell`` belongs to a new notebook; otherwise the
    incoming state is passed straight through.

    Returns (skip_repo, skip_notebook, notebook_id, archives, checker).
    """
    if notebook_id != cell.notebook_id:
        notebook_id = cell.notebook_id
        notebook = cell.notebook_obj
        # Notebook needs a different Python version: queue it for dispatch
        # and skip it in this process.
        if not notebook.compatible_version:
            pyexec = get_pyexec(notebook.py_version, config.VERSIONS)
            if sys.executable != pyexec:
                dispatches.add((notebook.id, pyexec))
                return skip_repo, True, cell.notebook_id, archives, None

        # "todo" is the lazy marker set by load_repository: resolve the
        # repository archives now.
        if archives == "todo":
            skip_repo, archives = load_archives(session, repository)
            if skip_repo:
                return skip_repo, skip_notebook, cell.notebook_id, archives, None
        # No archives available: skip both repository and notebook.
        if archives is None:
            return True, True, cell.notebook_id, archives, None

        vprint(1, 'Processing notebook: {}'.format(notebook))
        name = to_unicode(notebook.name)

        tarzip, repo_path = archives

        notebook_path = os.path.join(repo_path, name)
        try:
            # Pick the checker matching the archive form: a set of paths,
            # a compressed archive, or a plain directory.
            if isinstance(tarzip, set):
                checker = SetLocalChecker(tarzip, notebook_path)
            elif tarzip:
                checker = CompressedLocalChecker(tarzip, notebook_path)
            else:
                checker = PathLocalChecker(notebook_path)
            if not checker.exists(notebook_path):
                raise Exception(
                    "Repository content problem. Notebook not found")
            return skip_repo, False, cell.notebook_id, archives, checker
        except Exception as err:
            vprint(
                2,
                "Failed to load notebook {} due to {}".format(notebook, err))
            return skip_repo, True, cell.notebook_id, archives, checker
    return skip_repo, skip_notebook, notebook_id, archives, checker
Ejemplo n.º 9
0
def collect_requirements(session, repository):
    """Count and record dependency files found in a repository.

    Looks in the extracted path first, then the zip archive; marks the
    repository unavailable when neither exists. Always commits.
    """
    targets = ["setup.py", "requirements.txt", "Pipfile", "Pipfile.lock"]
    found = None
    if repository.path.exists():
        vprint(2, "using path")
        found = find_files_in_path(repository.path, targets)
    elif repository.zip_path.exists():
        vprint(2, "using zip")
        with tarfile.open(str(repository.zip_path)) as tarzip:
            found = find_files_in_zip(
                tarzip, Path(repository.hash_dir2), targets)
    else:
        vprint(2, "not found")
        repository.processed |= consts.R_UNAVAILABLE_FILES

    if found is not None:
        setups, requirements, pipfiles, pipfile_locks = found

        repository.setups_count = len(setups)
        repository.requirements_count = len(requirements)
        repository.pipfiles_count = len(pipfiles)
        repository.pipfile_locks_count = len(pipfile_locks)

        repository.setups = join_paths(setups)
        repository.requirements = join_paths(requirements)
        repository.pipfiles = join_paths(pipfiles)
        repository.pipfile_locks = join_paths(pipfile_locks)

    session.add(repository)
    session.commit()
Ejemplo n.º 10
0
def get_notebook(repository_id, notebook_id):
    """Convert a stored notebook to RDF and write it next to the repository.

    Returns (rdf_text, notebook_name); rdf_text is '' when the notebook
    file could not be opened.
    """
    with connect() as session:
        nbconvert_rdf = ''
        name = ''
        repository = session.query(Repository).filter(
            Repository.id == repository_id).first()

        notebook_query = session.query(Notebook).filter(
            Notebook.id == notebook_id,
            Notebook.repository_id == repository_id,
        ).first()
        name = notebook_query.name
        with mount_basedir():
            # Default to the repository path. If it does not exist, open()
            # below raises OSError and we fall through to the error return.
            # (The original left notebook_path unbound in that case, which
            # raised an uncaught NameError.)
            notebook_path = repository.path
            if repository.path.exists():
                execution_path = (config.EXECUTION_DIR / repository.hash_dir2)
                # Prefer the execution copy when one was prepared.
                if os.path.exists(execution_path):
                    notebook_path = execution_path

            try:
                with open(str(notebook_path / name)) as ofile:
                    notebook = ofile.read()
                nbtordfconverter = nb2rdf.NBToRDFConverter()
                notebook_json = nbformat.reads(notebook, as_version=4)
                nbconvert_rdf = nbtordfconverter.convert_to_rdf(
                    name, notebook_json)
                output_file_extension = 'ttl'
                output_file = os.path.join(
                    repository.path, name + "." + output_file_extension)
                # Close the output deterministically; the original leaked
                # the handle via open(...).write(...).
                with open(output_file, 'w') as rdf_file:
                    rdf_file.write(str(nbconvert_rdf))
                return str(nbconvert_rdf), name
            except OSError as e:
                vprint(3, "Failed to open notebook {}".format(e))
                return str(nbconvert_rdf), name
Ejemplo n.º 11
0
def load_archives(session, repository):
    """Obtain access to a repository's files, extracting them if needed.

    Returns (skip, archives): skip is True when the repository cannot be
    used; archives is a (tarzip_or_set_or_None, path) pair otherwise.
    """
    if not repository.processed & consts.R_EXTRACTED_FILES:
        if repository.zip_path.exists():
            vprint(1, 'Extracting files')
            result = process_repository(session, repository, skip_if_error=0)
            try:
                session.commit()
                if result != "done":
                    # Treated as an exception so the fallback below runs.
                    raise Exception("Extraction failure. Fallback")
                vprint(1, result)
            except Exception as err:
                vprint(1, 'Failed: {}'.format(err))
                # Fallback: read directly from the compressed archive.
                try:
                    tarzip = tarfile.open(str(repository.zip_path))
                    # Archive opened fine: clear a stale compress-error bit.
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    session.add(repository)
                except tarfile.ReadError:
                    # Archive is unreadable: record the error and give up.
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.add(repository)
                    return True, None
                zip_path = to_unicode(repository.hash_dir2)
                return False, (tarzip, zip_path)

        elif repository.path.exists():
            # Uncompressed checkout available on disk.
            repo_path = to_unicode(repository.path)
            return False, (None, repo_path)
        else:
            # Neither archive nor checkout exists.
            repository.processed |= consts.R_UNAVAILABLE_FILES
            session.add(repository)
            vprint(1, "Failed to load repository. Skipping")
            return True, None

    # Files were already extracted into the database: return the set of
    # known file paths instead of an on-disk archive.
    tarzip = {
        fil.path
        for fil in session.query(RepositoryFile).filter(
            RepositoryFile.repository_id == repository.id)
    }
    zip_path = ""
    if tarzip:
        return False, (tarzip, zip_path)
    return True, None
Ejemplo n.º 12
0
def apply(
    session, status, skip_if_error,
    count, interval, reverse, check
):
    """Extract code cell features"""
    conditions = [
        Notebook.processed.op("&")(consts.N_AGGREGATE_OK) == 0,
        Notebook.processed.op("&")(skip_if_error) == 0,
        Notebook.processed.op("&")(consts.N_GENERIC_LOAD_ERROR) == 0,
    ]
    if interval:
        conditions.append(Notebook.repository_id >= interval[0])
        conditions.append(Notebook.repository_id <= interval[1])

    query = session.query(Notebook).filter(*conditions)

    # Count-only mode: report the number of matching notebooks and stop.
    if count:
        print(query.count())
        return

    if reverse:
        ordering = (Notebook.repository_id.desc(), Notebook.id.desc())
    else:
        ordering = (Notebook.repository_id.asc(), Notebook.id.asc())
    query = query.order_by(*ordering)

    current_repository = None

    for notebook in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        current_repository = load_repository(
            session, notebook, current_repository)

        vprint(1, 'Processing notebook: {}'.format(notebook))
        outcome = process_notebook(session, notebook, skip_if_error)
        vprint(1, outcome)
        status.count += 1
    session.commit()
Ejemplo n.º 13
0
def apply(
    session, status,
    skip_if_error,
    count, interval, reverse, check
):
    """Extract code cell features"""
    conditions = [
        Repository.processed.op('&')(consts.R_EXTRACTED_FILES) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,  # Compressed
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])

    query = session.query(Repository).filter(*conditions)

    # Count-only mode: report the number of matching repositories and stop.
    if count:
        print(query.count())
        return

    order = Repository.id.desc() if reverse else Repository.id.asc()
    query = query.order_by(order)

    for repository in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            outcome = process_repository(session, repository, skip_if_error)
        vprint(1, outcome)
        status.count += 1
        session.commit()
Ejemplo n.º 14
0
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks and cells from repositories, in batches of 30."""
    while selected_repositories:
        conditions = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0, # no extraction
            Repository.processed.op("&")(skip_if_error) == 0, # no failure
        ]
        if selected_repositories is True:
            # No explicit selection: process everything (optionally bounded
            # by the id interval) in a single pass.
            selected_repositories = False
            if interval:
                conditions.append(Repository.id >= interval[0])
                conditions.append(Repository.id <= interval[1])
        else:
            batch = selected_repositories[:30]
            selected_repositories = selected_repositories[30:]
            conditions.append(Repository.id.in_(batch))

        query = session.query(Repository).filter(*conditions)
        if count:
            print(query.count())
            return

        order = Repository.id.desc() if reverse else Repository.id.asc()
        query = query.order_by(order)

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                outcome = process_repository(session, repository, skip_if_error)
                vprint(0, outcome)
            status.count += 1
            session.commit()
Ejemplo n.º 15
0
def apply(session, status, count, interval, reverse, check):
    """Extract code cell features"""
    conditions = [CellModule.local_possibility.is_(None)]
    if interval:
        conditions.append(CellModule.repository_id >= interval[0])
        conditions.append(CellModule.repository_id <= interval[1])

    query = session.query(CellModule).filter(*conditions)

    # Count-only mode: report the number of matching modules and stop.
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(
            CellModule.repository_id.desc(), CellModule.id.desc())
    else:
        query = query.order_by(
            CellModule.repository_id.asc(), CellModule.id.asc())

    skip_repo, current_repository, archives = False, None, None

    for cell_module in query:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        skip_repo, current_repository, archives = load_repository(
            session, cell_module, skip_repo, current_repository, archives)
        if skip_repo:
            continue

        vprint(1, 'Processing module: {}'.format(cell_module))
        outcome = process_cell_module(session, cell_module, archives)
        vprint(1, outcome)
        status.count += 1
    session.commit()
Ejemplo n.º 16
0
def execute_notebooks(
    status, session, cwd, notebooks_iter, mode, notebook_exec_mode, dry_run
):
    """Run each notebook in a subprocess and record the outcome flags.

    Returns "done" in every case (including dry runs).
    """
    notebooks_iter = list(notebooks_iter)
    vprint(2, "{}Running {} notebooks".format(
        "[DRY RUN] " if dry_run >= 1 else "",
        len(notebooks_iter)
    ))
    if dry_run >= 1:
        return "done"

    for notebook, repository in notebooks_iter:
        status.count += 1
        status.report()
        nmode = notebook_exec_mode(mode, notebook, repository)
        # Clear a previous error flag (the doubled mode bit) before retrying.
        if notebook.processed & (nmode.processed * 2):
            notebook.processed -= nmode.processed * 2

        mode_num = exec_to_num(*nmode)
        vprint(2, "Running notebook {}".format(notebook))
        # Run the notebook inside the "work" conda env via a helper script.
        pstatus = subprocess.call(
            '. {}/etc/profile.d/conda.sh '
            '&& conda activate {} '
            "&& python reproducemegit/jupyter_reproducibility/run_notebook.py -n {} -p '{}' -m {}"
            .format(
                config.ANACONDA_PATH, "work",
                notebook.id,
                str(cwd / notebook.name).replace("'", "'\\''"),
                mode_num
            ), shell=True,
        )
        error = pstatus != 0
        # Success sets the mode bit; failure sets the doubled (error) bit.
        processed = nmode.processed * (2 if error else 1)
        vprint(2, "Status: {}. Mode: {}. Set Processed: {}".format(
            pstatus, mode_num, processed
        ))
        notebook.processed |= processed
        session.add(notebook)
        session.commit()
    return "done"
Ejemplo n.º 17
0
def apply(session, status, selected_repositories, processed, no, count,
          interval, reverse, check):
    """Unzip repositories matching the processed-flag filters, in batches."""
    while selected_repositories:
        conditions = [
            Repository.processed.op("&")(processed) ==
            processed,  # no extraction
            Repository.processed.op("&")(no) == 0,  # no failure
        ]
        if selected_repositories is True:
            # No explicit selection: one pass over everything (optionally
            # bounded by the id interval).
            selected_repositories = False
            if interval:
                conditions.append(Repository.id >= interval[0])
                conditions.append(Repository.id <= interval[1])
        else:
            conditions.append(Repository.id.in_(selected_repositories[:30]))
            selected_repositories = selected_repositories[30:]

        query = session.query(Repository).filter(*conditions)
        if count:
            print(query.count())
            return

        order = Repository.id.desc() if reverse else Repository.id.asc()
        query = query.order_by(order)

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                outcome = unzip_repository(session, repository)
                vprint(1, outcome)
            status.count += 1
            session.commit()
Ejemplo n.º 18
0
def execute_repository(
    status, session, repository, notebooks_iter, mode, env, skip_extract,
    notebook_exec_mode, dry_run, out, err,
):
    """Extract a repository, install its dependencies, and run its notebooks.

    Returns an error message string on failure, or the result of
    execute_notebooks on success.
    """
    vprint(1, "Executing notebooks from {}".format(repository))
    # Clear stale error flags before retrying.
    for flag in (consts.R_UNAVAILABLE_FILES, consts.R_COMPRESS_ERROR):
        if repository.processed & flag:
            repository.processed -= flag
            session.add(repository)

    cwd = config.EXECUTION_DIR / repository.hash_dir2
    vprint(2, "{}Preparing repository directory".format(
        "[DRY RUN] " if dry_run >= 3 else "",
    ))
    if dry_run < 3:
        with mount_umount(out, err):
            success, cwd, msg = extract_repository(
                session, repository, skip_extract, out, err
            )
            vprint(3, msg)
            if not success:
                return "Failed to extract repository"

    if mode.dependencies:
        msg = install_repository_dependencies(
            status, session, cwd, repository, notebooks_iter, mode, env,
            notebook_exec_mode, dry_run, out, err
        )
        if msg is not None:
            return msg

    return execute_notebooks(
        status, session, cwd, notebooks_iter, mode, notebook_exec_mode, dry_run
    )
Ejemplo n.º 19
0
def load_repository(session, cell_module, skip_repo, repository_id, archives):
    """Refresh the repository context while iterating over cell modules.

    Returns (skip_repo, repository_id, archives); archives is the set of
    known file paths for the new repository, or None when it must be skipped.
    """
    if repository_id == cell_module.repository_id:
        return skip_repo, repository_id, archives

    repository = cell_module.repository_obj
    try:
        session.commit()
    except Exception as err:
        vprint(
            0,
            'Failed to save modules from repository {} due to {}'.format(
                repository_id, err))

    vprint(0, 'Processing repository: {}'.format(repository))
    if not repository.processed & consts.R_EXTRACTED_FILES:
        vprint(1, 'Skipping. Files not extracted from repository')
        return True, cell_module.repository_id, None

    file_query = session.query(RepositoryFile).filter(
        RepositoryFile.repository_id == repository.id)
    archives = {fil.path for fil in file_query}
    return False, cell_module.repository_id, archives
Ejemplo n.º 20
0
def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories"""
    filters = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = session.query(Repository).filter(*filters)
    # Count-only mode: report the number of matching repositories and stop.
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())

    for repository in query:
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                if repository.path.exists():
                    # Detect a checkout that drifted from the recorded commit.
                    commit = repository.get_commit()
                    if commit != repository.commit:
                        repository.processed |= consts.R_COMMIT_MISMATCH

                # Pessimistically set the error flag; cleared below on success.
                repository.processed |= consts.R_COMPRESS_ERROR
                if repository.zip_path.exists() or repository.compress():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    # Remove the uncompressed checkout unless --keep was given.
                    if not keep:
                        shutil.rmtree(str(repository.path), ignore_errors=True)
                elif not repository.zip_path.exists():
                    # Compression failed and no archive exists.
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                vprint(1, "Failed: {}".format(err))
        session.add(repository)
        status.count += 1
        session.commit()
Ejemplo n.º 21
0
def load_notebook(repository_id, path, notebook_file, nbrow):
    """Extract notebook information and cells from notebook"""
    # pylint: disable=too-many-locals
    # Accumulates notebook-level status flags; 0 means loaded cleanly.
    status = 0
    try:
        with open(str(path / notebook_file)) as ofile:
            notebook = nbf.read(ofile, nbf.NO_CONVERT)
        nbrow["nbformat"] = "{0[nbformat]}".format(notebook)
        if "nbformat_minor" in notebook:
            nbrow["nbformat"] += ".{0[nbformat_minor]}".format(notebook)
        # Normalize every notebook to nbformat v4 before reading cells.
        notebook = nbf.convert(notebook, 4)
        metadata = notebook["metadata"]
    except OSError as e:
        vprint(3, "Failed to open notebook {}".format(e))
        nbrow["processed"] = consts.N_LOAD_ERROR
        # Broken symlink: print SQL that removes this notebook from the
        # repositories table so the operator can fix the data by hand.
        if os.path.islink(str(path / notebook_file)):
            import textwrap
            vprint(3, "Notebook is broken link. Use the following SQL to fix:")
            text = (textwrap.dedent("""\
            select notebooks_count, (char_length(newtext) - char_length(replace(newtext, '''', ''))), concat(
                'update repositories ',
                'set notebooks_count = ',
                (char_length(newtext) - char_length(replace(newtext, ';', ''))) + 1,
                ', notebooks = ''',
                newtext,
                ''' where id = ',
                id,
                ';'
            ) from (
                select id, notebooks_count, replace(
                    replace(
                        replace(
                            notebooks,
                            '{0};', ''
                        ),
                        ';{0}', ''
                    ),
                    '''', ''''''
                ) as newtext
                from repositories where id = {1}
            ) as foo;
            """.format(notebook_file, repository_id)))
            text = " ".join(x.strip() for x in text.split("\n"))
            print(text)
        return nbrow, []

    except Exception as e:  # pylint: disable=broad-except
        vprint(3, "Failed to load notebook {}".format(e))
        nbrow["processed"] = consts.N_LOAD_FORMAT_ERROR
        return nbrow, []

    nbrow["kernel"] = metadata.get("kernelspec", {}).get("name", "no-kernel")

    language_info = metadata.get("language_info", {})
    nbrow["language"] = language_info.get("name", "unknown")
    nbrow["language_version"] = language_info.get("version", "unknown")
    # IPython shell used only for its cell-magic/source transformer.
    shell = InteractiveShell.instance()
    is_python = nbrow["language"] == "python"
    is_unknown_version = nbrow["language_version"] == "unknown"

    cells = notebook["cells"]
    cells_info = []
    # Tracks the highest execution count seen; -1 means none.
    exec_count = -1
    for index, cell in enumerate(cells):
        vprint(3, "Loading cell {}".format(index))
        cell_exec_count = cell.get("execution_count") or -1
        if isinstance(cell_exec_count, str) and cell_exec_count.isdigit():
            cell_exec_count = int(cell_exec_count)
        if isinstance(cell_exec_count, int):
            exec_count = max(exec_count, cell_exec_count)
        output_formats = ";".join(set(cell_output_formats(cell)))

        cell_processed = consts.C_OK
        if is_unknown_version:
            cell_processed = consts.C_UNKNOWN_VERSION

        try:
            source = cell["source"] = cell["source"] or ""
            if is_python and cell.get("cell_type") == "code":
                try:
                    # Expand IPython magics/shell escapes into plain Python.
                    source = shell.input_transformer_manager.transform_cell(source)
                except (IndentationError, SyntaxError) as err:
                    vprint(3, "Error on cell transformation: {}".format(err))
                    source = ""
                    status = consts.N_LOAD_SYNTAX_ERROR
                    cell_processed |= consts.C_SYNTAX_ERROR
                # Null bytes break database storage; replace them.
                if "\0" in source:
                    vprint(3, "Found null byte in source. Replacing it by \\n")
                    source = source.replace("\0", "\n")

            cellrow = {
                "repository_id": repository_id,
                "notebook_id": None,  # filled in by the caller after insert
                "index": index,
                "cell_type": cell.get("cell_type", "<unknown>"),
                "execution_count": cell.get("execution_count"),
                "lines": cell["source"].count("\n") + 1,
                "output_formats": output_formats,
                "source": source,
                "python": is_python,
                "processed": cell_processed,
            }
            cells_info.append(cellrow)
            nbrow["total_cells"] += 1
            if cell.get("cell_type") == "code":
                nbrow["code_cells"] += 1
                if output_formats:
                    nbrow["code_cells_with_output"] += 1
            elif cell.get("cell_type") == "markdown":
                nbrow["markdown_cells"] += 1
            elif cell.get("cell_type") == "raw":
                nbrow["raw_cells"] += 1
            else:
                nbrow["unknown_cell_formats"] += 1
            if not cell["source"].strip():
                nbrow["empty_cells"] += 1
        except KeyError as err:
            vprint(3, "Error on cell extraction: {}".format(err))
            status = consts.N_LOAD_FORMAT_ERROR
    # A notebook with no readable cells is treated as a format error.
    if nbrow["total_cells"] == 0:
        status = consts.N_LOAD_FORMAT_ERROR

    nbrow["max_execution_count"] = exec_count
    nbrow["processed"] = status
    return nbrow, cells_info
Ejemplo n.º 22
0
def load_repository(session, domain, repo, check_repo_only=True, branch=None,
                    commit=None, clone_existing=False):
    """Clone repository and extract its information"""
    vprint(0, "Processing repository: {}".format(repo))

    if check_repo_only:
        # First, look the repository up by (domain, name) alone; if it is
        # already known and we are not forced to clone, return it as-is.
        existing = session.query(Repository).filter(
            Repository.domain == domain,
            Repository.repository == repo,
        ).first()
        if existing is not None:
            vprint(1, "Repository exists: ID={}".format(existing.id))
            if not clone_existing:
                return existing

    hash1, hash2 = extract_hash_parts(repo)
    remote_url = get_remote(domain, repo)
    vprint(1, "Remote: {}".format(remote_url))
    repo_dir = clone(hash1, hash2, repo, remote_url, branch, commit)

    # Resolve the commit hash that actually got checked out.
    commit = git_output(
        "rev-parse", "HEAD", cwd=str(repo_dir)
    ).decode("utf-8").strip()

    # A row may already exist for this exact (domain, name, commit) triple.
    existing = session.query(Repository).filter(
        Repository.domain == domain,
        Repository.repository == repo,
        Repository.commit == commit,
    ).first()
    if existing is not None:
        if not check_repo_only:
            vprint(1, "Repository exists: ID={}".format(existing.id))
        return existing

    vprint(1, "Finding files")
    notebook_files = [
        path.relative_to(repo_dir)
        for path in find_files(repo_dir, "*.ipynb")
        if ".ipynb_checkpoints" not in str(path)
    ]

    setups, requirements, pipfiles, pipfile_locks = find_files_in_path(
        repo_dir,
        ["setup.py", "requirements.txt", "Pipfile", "Pipfile.lock"],
    )

    new_repository = Repository(
        domain=domain, repository=repo,
        hash_dir1=hash1, hash_dir2=hash2,
        commit=commit,

        notebooks_count=len(notebook_files),
        setups_count=len(setups),
        requirements_count=len(requirements),
        pipfiles_count=len(pipfiles),
        pipfile_locks_count=len(pipfile_locks),

        notebooks=join_paths(notebook_files),
        setups=join_paths(setups),
        requirements=join_paths(requirements),
        pipfiles=join_paths(pipfiles),
        pipfile_locks=join_paths(pipfile_locks),

        processed=consts.R_OK,
    )
    session.add(new_repository)
    session.commit()
    vprint(1, "Done. ID={}".format(new_repository.id))

    return new_repository
Ejemplo n.º 23
0
def process_code_cell(
    session,
    repository_id,
    notebook_id,
    cell,
    checker,
    skip_if_error=consts.C_PROCESS_ERROR,
    skip_if_syntaxerror=consts.C_SYNTAX_ERROR,
    skip_if_timeout=consts.C_TIMEOUT,
):
    """Process a code cell to collect features (AST analysis, imported
    modules, IPython features, and names).

    The ``skip_if_*`` parameters are bit masks: passing a falsy value (0)
    for one of them forces a retry of cells whose previous run failed with
    the corresponding error flag; their previously stored rows are deleted
    first. Returns a short status string describing the outcome.
    """
    # Cells already marked as successfully processed are never redone
    if cell.processed & consts.C_PROCESS_OK:
        return 'already processed'

    # Retry when a failure flag is set on the cell but the caller chose not
    # to skip that failure category (its skip mask is 0)
    retry = False
    retry |= not skip_if_error and cell.processed & consts.C_PROCESS_ERROR
    retry |= not skip_if_syntaxerror and cell.processed & consts.C_SYNTAX_ERROR
    retry |= not skip_if_timeout and cell.processed & consts.C_TIMEOUT

    if retry:
        # Drop every row the previous (failed) attempt produced for this cell
        deleted = (session.query(CellFeature).filter(
            CellFeature.cell_id == cell.id).delete() +
                   session.query(CellModule).filter(
                       CellModule.cell_id == cell.id).delete() +
                   session.query(CellName).filter(
                       CellName.cell_id == cell.id).delete() +
                   session.query(CodeAnalysis).filter(
                       CodeAnalysis.cell_id == cell.id).delete())
        if deleted:
            vprint(2, "Deleted {} rows".format(deleted))
        # Clear the failure flags so the cell looks unprocessed again
        if cell.processed & consts.C_PROCESS_ERROR:
            cell.processed -= consts.C_PROCESS_ERROR
        if cell.processed & consts.C_SYNTAX_ERROR:
            cell.processed -= consts.C_SYNTAX_ERROR
        if cell.processed & consts.C_TIMEOUT:
            cell.processed -= consts.C_TIMEOUT
        session.add(cell)

    try:
        error = False
        try:
            vprint(2, "Extracting features")
            analysis, modules, features, names = extract_features(
                cell.source, checker)
            processed = consts.A_OK
        except TimeoutError:
            processed = consts.A_TIMEOUT
            cell.processed |= consts.C_TIMEOUT
            error = True
        except SyntaxError:
            processed = consts.A_SYNTAX_ERROR
            cell.processed |= consts.C_SYNTAX_ERROR
            error = True
        if error:
            # On failure, store a zeroed analysis row so the attempt (and its
            # failure kind) is still recorded in CodeAnalysis
            vprint(3, "Failed: {}".format(processed))
            analysis = {
                x.name: 0
                for x in CodeAnalysis.__table__.columns if x.name not in
                {"id", "repository_id", "notebook_id", "cell_id", "index"}
            }
            analysis["ast_others"] = ""
            modules = []
            features = []
            names = {}
        else:
            vprint(3, "Ok")

        analysis["processed"] = processed

        code_analysis = CodeAnalysis(repository_id=repository_id,
                                     notebook_id=notebook_id,
                                     cell_id=cell.id,
                                     index=cell.index,
                                     **analysis)
        # Child rows (modules, features, names) collected alongside the
        # analysis row; linked to it below via dependent_add
        dependents = []
        for line, import_type, module_name, local in modules:
            dependents.append(
                CellModule(
                    repository_id=repository_id,
                    notebook_id=notebook_id,
                    cell_id=cell.id,
                    index=cell.index,
                    line=line,
                    import_type=import_type,
                    module_name=module_name,
                    local=local,
                ))

        for line, column, feature_name, feature_value in features:
            dependents.append(
                CellFeature(
                    repository_id=repository_id,
                    notebook_id=notebook_id,
                    cell_id=cell.id,
                    index=cell.index,
                    line=line,
                    column=column,
                    feature_name="IPython/" + feature_name,
                    feature_value=feature_value,
                ))

        # names maps (scope, context) -> {name: occurrence count}
        for (scope, context), values in names.items():
            for name, count in values.items():
                dependents.append(
                    CellName(
                        repository_id=repository_id,
                        notebook_id=notebook_id,
                        cell_id=cell.id,
                        index=cell.index,
                        scope=scope,
                        context=context,
                        name=name,
                        count=count,
                    ))
        vprint(2, "Adding session objects")
        session.dependent_add(code_analysis, dependents, "analysis_id")
        cell.processed |= consts.C_PROCESS_OK
        return "done"
    except Exception as err:
        # Unexpected failure: flag the cell and report, never propagate
        cell.processed |= consts.C_PROCESS_ERROR
        if config.VERBOSE > 4:
            import traceback
            traceback.print_exc()
        return 'Failed to process ({})'.format(err)
    finally:
        # The cell's flags changed on every path; always persist them
        session.add(cell)
Ejemplo n.º 24
0
def apply(session, status, dispatches, selected_notebooks, skip_if_error,
          skip_if_syntaxerror, skip_if_timeout, count, interval, reverse,
          check):
    """Extract code cell features.

    ``selected_notebooks`` is either True (process everything, optionally
    limited by ``interval``) or a list of notebook ids that is consumed in
    batches of 30 per loop iteration. The ``skip_if_*`` masks filter out
    cells whose previous run failed with the corresponding flag.
    """
    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(
                consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            # Take the next batch of up to 30 notebook ids; the loop ends
            # once the list is exhausted (empty list is falsy)
            filters += [Cell.notebook_id.in_(selected_notebooks[:30])]
            selected_notebooks = selected_notebooks[30:]
        else:
            # Single pass over the whole corpus; stop after this iteration
            selected_notebooks = False
            if interval:
                filters += [
                    Cell.repository_id >= interval[0],
                    Cell.repository_id <= interval[1],
                ]

        query = (session.query(Cell).filter(*filters))

        if count:
            # Count-only mode: report how many cells match and exit
            print(query.count())
            return

        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )

        # Repository/notebook state carried across cells so consecutive
        # cells of the same notebook reuse the loaded repository/checker
        skip_repo = False
        repository_id = None
        repository = None
        archives = None

        skip_notebook = False
        notebook_id = None
        checker = None

        for cell in query:
            if check_exit(check):
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()

            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository,
                    archives)
                if skip_repo:
                    continue

                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository, skip_repo,
                    skip_notebook, notebook_id, archives, checker)
                if skip_repo or skip_notebook:
                    continue

                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session,
                    repository_id,
                    notebook_id,
                    cell,
                    checker,
                    skip_if_error,
                    skip_if_syntaxerror,
                    skip_if_timeout,
                )
                vprint(2, result)
            status.count += 1
        session.commit()
Ejemplo n.º 25
0
def process_repository(session, repository, skip_if_error=consts.R_N_ERROR):
    """Load every notebook of a repository into the database.

    Returns "already processed", "failed", or "done". On success (all
    notebooks loaded and counts match) the repository gets the
    R_N_EXTRACTION flag; on any notebook failure it gets R_N_ERROR.
    """
    if repository.processed & (consts.R_N_EXTRACTION + skip_if_error):
        return "already processed"
    if repository.processed & consts.R_N_ERROR:
        # Previous run failed; clear the error flag and try again
        session.add(repository)
        repository.processed -= consts.R_N_ERROR

    # Number of notebooks handled in this pass; compared against the
    # repository's recorded notebook count at the end
    count = 0
    for name in repository.notebook_names:
        if not name:
            continue
        count += 1
        notebook = session.query(Notebook).filter(
            Notebook.repository_id == repository.id,
            Notebook.name == name,
        ).first()
        if notebook is not None:
            if notebook.processed & consts.N_STOPPED:
                # N_STOPPED presumably marks an interrupted load — TODO
                # confirm; the stale row is deleted and the notebook redone
                session.delete(notebook)
                session.commit()
            else:
                if notebook.processed & consts.N_GENERIC_LOAD_ERROR:
                    # Keep the broken row but queue its id for later manual
                    # deletion, and exclude it from the completeness count
                    count -= 1
                    vprint(2, "Notebook already exists. Delete from DB: {}".format(notebook))
                    with open(str(config.LOGS_DIR / "todo_delete"), "a") as f:
                        f.write("{},".format(notebook.id))

                continue  # Skip working notebook

        if not repository.path.exists():
            # Repository content is only available compressed; unzip it first
            vprint(2, "Unzipping repository: {}".format(repository.zip_path))
            msg = unzip_repository(session, repository)
            if msg != "done":
                vprint(2, msg)
                return "failed"

        try:
            vprint(2, "Loading notebook {}".format(name))
            # Default metadata row; load_notebook fills in the real values
            nbrow = {
                "repository_id": repository.id,
                "name": name,
                "nbformat": 0,
                "kernel": "no-kernel",
                "language": "unknown",
                "language_version": "unknown",
                "max_execution_count": 0,
                "total_cells": 0,
                "code_cells": 0,
                "code_cells_with_output": 0,
                "markdown_cells": 0,
                "raw_cells": 0,
                "unknown_cell_formats": 0,
                "empty_cells": 0,
                "processed": consts.N_OK,
            }
            try:
                nbrow, cells = load_notebook(repository.id, repository.path, name, nbrow)
            except TimeoutError:
                nbrow["processed"] = consts.N_LOAD_TIMEOUT
                cells = []
            # Mark as stopped while inserting — presumably cleared once the
            # insert completes, so a crash mid-insert is detectable; confirm
            nbrow["processed"] |= consts.N_STOPPED
            notebook = Notebook(**nbrow)
            session.dependent_add(
                notebook, [Cell(**cellrow) for cellrow in cells], "notebook_id"
            )

        except Exception as err:  # pylint: disable=broad-except
            repository.processed |= consts.R_N_ERROR
            session.add(repository)
            vprint(1, "Failed to load notebook {} due {!r}".format(name, err))
            if config.VERBOSE > 4:
                import traceback
                traceback.print_exc()

    # Only flag full extraction when no notebook failed and every declared
    # notebook was accounted for
    if not repository.processed & consts.R_N_ERROR and count == repository.notebooks_count:
        repository.processed |= consts.R_N_EXTRACTION
        session.add(repository)

    # NOTE: this project's session wrapper returns a (status, error) tuple
    # from commit(), unlike plain SQLAlchemy
    status, err = session.commit()
    if not status:
        # Commit failed: roll back the extraction flag and record the error
        if repository.processed & consts.R_N_EXTRACTION:
            repository.processed -= consts.R_N_EXTRACTION
        if not repository.processed & consts.R_N_ERROR:
            repository.processed += consts.R_N_ERROR
        session.add(repository)
        session.commit()
        return "Failed due {!r}".format(err)

    return "done"
Ejemplo n.º 26
0
def install_repository_dependencies(
    status, session, cwd, repository, notebooks_iter, mode, env,
    notebook_exec_mode, dry_run, out, err
):
    """Install a repository's declared dependencies (setup.py,
    requirements.txt, Pipfile, Pipfile.lock) before executing its notebooks.

    Returns None on success (or dry run); on failure, marks every execution
    in ``notebooks_iter`` as skipped with the failure details and returns an
    error message string.
    """
    vprint(2, "{}Installing repository dependencies".format(
        "[DRY RUN] " if dry_run >= 2 else ""
    ))
    if dry_run >= 2:
        return None

    # (spec file, installer function, names declared by the repository)
    install_options = [
        ("setup.py", install_setups, repository.setup_names),
        ("requirements.txt", install_requirements, repository.requirement_names),
        ("Pipfile", install_pipfiles, repository.pipfile_names),
        ("Pipfile.lock", install_pipfiles, repository.pipfile_lock_names),
    ]
    installed = True
    data_ok_list = []
    data_failed_list = []
    data_failed = b""
    for spec, func, names in install_options:
        # NOTE(review): the ``env`` parameter is not forwarded here — the
        # target environment is hardcoded as "work". Confirm this is the
        # intended execution env and not a bug.
        success, data = func(cwd, names, "work", out, err)
        installed = installed and success
        spec_bytes = spec.encode("utf-8")
        if success:
            data_ok_list.append(spec_bytes)
        else:
            data_failed += b"\n##<<>>##" + spec_bytes + b":\n" + data
            data_failed_list.append(spec_bytes)
    if not installed:
        # At least one installer failed: record the failure on every pending
        # execution of this repository so they are not retried blindly
        reason = "<Install Dependency Error>"
        cause = b"Ok: " + b", ".join(data_ok_list)
        cause += b"\n##<<>>##Failed: " + b", ".join(data_failed_list)
        cause += data_failed
        for notebook, repository in notebooks_iter:
            status.skipped += 1
            status.report()
            nmode = notebook_exec_mode(mode, notebook, repository)
            mode_num = exec_to_num(*nmode)
            execution = session.query(Execution).filter(
                Execution.notebook_id == notebook.id,
                Execution.mode == mode_num,
            ).first()
            if execution:
                if execution.processed & consts.E_EXECUTED:
                    # Keep results of executions that already succeeded
                    continue
                # Reset the stale execution row to the failure state
                execution.reason = reason
                execution.msg = cause
                execution.cell = None
                execution.count = None
                execution.diff = None
                execution.duration = None
                execution.timeout = None
                execution.diff_count = None
                execution.processed = consts.E_CREATED
            else:
                execution = Execution(
                    notebook_id=notebook.id, mode=mode_num,
                    reason=reason, msg=cause,
                    processed=consts.E_CREATED,
                    repository_id=notebook.repository_id,
                )
            session.add(execution)
            # nmode appears to expose a .processed flag mask — TODO confirm
            # the return type of notebook_exec_mode
            notebook.processed |= nmode.processed
            session.add(notebook)
        session.commit()
        return "Failed to install {}".format(
            b", ".join(data_failed_list).decode("utf-8")
        )
    return None
Ejemplo n.º 27
0
def process_requirement_file(session,
                             repository,
                             reqformat,
                             skip_if_error=consts.R_REQUIREMENTS_ERROR):
    """Load the content of a repository's dependency files of one format
    (setup.py, requirements.txt, Pipfile, or Pipfile.lock) into the
    database as RequirementFile rows.

    Reads from the extracted repository directory when present, otherwise
    from the repository's tar archive. Files with undetectable encodings,
    decode errors, or NULL bytes are skipped silently; unexpected per-file
    errors set ``skip_if_error`` on the repository. Returns True when every
    file was handled without an unexpected error.
    """
    # Maps the file format to the Repository attribute prefix listing names
    MAP = {
        "setup.py": "setup",
        "requirements.txt": "requirement",
        "Pipfile": "pipfile",
        "Pipfile.lock": "pipfile_lock"
    }
    zip_path = None
    tarzip = None
    if not repository.path.exists():
        if not repository.zip_path.exists():
            # Neither the extracted directory nor the archive is available
            repository.processed |= consts.R_UNAVAILABLE_FILES
            session.add(repository)
            vprint(
                1, "Failed to load requirement {} due <repository not found>".
                format(reqformat))
            return False
        tarzip = tarfile.open(str(repository.zip_path))
        zip_path = Path(repository.hash_dir2)
    finished = True
    req_param = MAP[reqformat] + "_names"
    try:
        for name in getattr(repository, req_param):
            if not name:
                continue
            try:
                vprint(2, "Loading requirement {}".format(name))
                if tarzip:
                    content = tarzip.extractfile(
                        tarzip.getmember(str(zip_path / name))).read()
                else:
                    with open(str(repository.path / name), "rb") as ofile:
                        content = ofile.read()

                # Requirement files have no declared encoding; guess it
                coding = chardet.detect(content)
                if coding["encoding"] is None:
                    vprint(3, "Codec not detected")
                    continue
                try:
                    content = content.decode(coding["encoding"])
                except UnicodeDecodeError:
                    vprint(3, "Invalid codec")
                    continue

                # NULL bytes cannot be stored in a text column
                if "\0" in content:
                    vprint(3, "NULL byte in content")
                    continue
                requirement_file = RequirementFile(
                    repository_id=repository.id,
                    name=name,
                    reqformat=reqformat,
                    content=content,
                    processed=consts.F_OK,
                )
                session.add(requirement_file)
            except Exception as err:
                # Unexpected failure on this file: flag the repository but
                # keep processing the remaining files
                repository.processed |= skip_if_error
                session.add(repository)
                vprint(1,
                       "Failed to load requirement {} due {!r}".format(name, err))
                if config.VERBOSE > 4:
                    import traceback
                    traceback.print_exc()
                finished = False
    finally:
        # Close the archive even if iteration itself raises (previously the
        # handle leaked on any error outside the per-file try block)
        if tarzip:
            tarzip.close()
    return finished
Ejemplo n.º 28
0
def extract_repository(session, repository, skip_extract, out, err):
    """Make the repository content available under config.EXECUTION_DIR.

    With ``skip_extract`` set, reuses an already-extracted directory.
    Otherwise wipes EXECUTION_DIR and either uncompresses the archive or
    copies the extracted repository into it. Returns a
    (success, cwd, message) tuple and records failure flags on the
    repository row.
    """
    cwd = config.EXECUTION_DIR
    if skip_extract:
        # Trust a previous extraction; just verify the directory exists
        cwd = (config.EXECUTION_DIR / repository.hash_dir2)
        if not cwd.exists():
            return (
                False, cwd,
                "Failed to use extracted dir. It does not exists"
            )
    else:
        try:
            # Start from a clean execution directory
            if config.EXECUTION_DIR.exists():
                shutil.rmtree(str(config.EXECUTION_DIR), ignore_errors=True)
            if repository.zip_path.exists():
                # Preferred source: the compressed archive
                config.EXECUTION_DIR.mkdir(parents=True, exist_ok=True)
                cmd = repository.uncompress(config.EXECUTION_DIR, return_cmd=True)
                vprint(3, "Extract: {}".format(repository.zip_path))
                vprint(3, "Command: {}".format(" ".join(cmd)))
                uncompressed = subprocess.call(cmd, stdout=out, stderr=err)
                if uncompressed != 0:
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.commit()
                    return (
                        False, cwd,
                        "Extraction failed with code {}".format(uncompressed),
                    )
            elif repository.path.exists():
                # Fallback: copy the already-extracted repository via tar
                # pipe (preserves permissions/links better than plain cp)
                new_path = (config.EXECUTION_DIR / repository.hash_dir2)
                new_path.mkdir(parents=True, exist_ok=True)
                cmd = "tar cf - * | (cd {} ; tar xf - )".format(str(new_path))
                vprint(3, "Copy: {}".format(repository.path))
                vprint(3, "Command: {}".format(cmd))
                copied = subprocess.call(
                    cmd, shell=True, stdout=out, stderr=err, cwd=str(repository.path)
                )
                if copied != 0:
                    repository.processed |= consts.R_COMPRESS_ERROR
                    session.commit()
                    return (
                        False, cwd,
                        "Copying failed with code {}".format(copied),
                    )
            else:
                # No archive and no extracted dir: nothing to execute
                repository.processed |= consts.R_UNAVAILABLE_FILES
                session.add(repository)
                session.commit()
                return (
                    False, cwd,
                    "Failed to find repository"
                )
            # Success requires EXECUTION_DIR to contain exactly the expected
            # hash subdirectory; anything else means leftover content
            files = [sub for sub in cwd.glob("*")]
            sub_cwd = cwd / repository.hash_dir2
            if files == [sub_cwd]:
                cwd = sub_cwd
            else:
                return (
                    False, cwd,
                    "Execution dir is full"
                )

        except Exception as e:
            repository.processed |= consts.R_COMPRESS_ERROR
            session.add(repository)
            session.commit()
            return (
                False, cwd,
                "Copy failed with exception {}".format(e),
            )

    # Guard against the extracted content not matching the recorded commit
    commit = repository.get_commit(cwd)
    if commit != repository.commit:
        repository.processed |= consts.R_COMMIT_MISMATCH
        session.add(repository)
        return (
            False, cwd,
            "Commit mismatch. Expected {}. Found {}".format(
                repository.commit, commit
            ),
        )
    return (
        True, cwd,
        "Repository set to {}".format(cwd)
    )
0
def apply(
    session, repository_id, status, script_name, execution_mode, with_execution, with_dependency,
    skip_if_error, skip_if_error_mode, skip_if_troublesome, try_to_discover_files,
    skip_env, skip_extract, dry_run, mode_rules, notebook_exec_mode,
    count, interval, reverse, check
):
    """Execute repositories"""
    mode_def = None if execution_mode == -1 else EXECUTION_MODE[execution_mode]

    filters = [
        Notebook.language == "python",
        Notebook.language_version != "unknown",
        func.length(Notebook.language_version) > 3,
        Repository.processed.op('&')(try_to_discover_files) == 0,
        Repository.processed.op('&')(consts.R_FAILED_TO_CLONE) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(skip_if_troublesome) == 0,
        Repository.id == repository_id,
    ]

    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[-1]
        ]

    filters += EXECUTION_RULES[with_execution]
    filters += DEPENDENCY_RULES[with_dependency]

    if mode_def is None:
        filters += mode_rules(
            with_execution, with_dependency, skip_if_error_mode
        )
    else:
        filters.append(
            Notebook.processed.op('&')(
                mode_def.processed * skip_if_error_mode
            ) == 0
        )

    query = (
        session.query(Notebook, Repository)
        .join(Repository)
        .filter(*filters)
    )
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count
            + Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.desc()
        )
    else:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count
            + Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.asc()
        )

    moment = datetime.now().strftime("%Y%m%dT%H%M%S")
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)
    outf = str(config.LOGS_DIR / ("sub-{}-{}.out".format(script_name, moment)))
    errf = str(config.LOGS_DIR / ("sub-{}-{}.err".format(script_name, moment)))

    with open(outf, "wb") as out, open(errf, "wb") as err:

        group = groupby(
            query, lambda x: (
                x[0].language_version[:3], notebook_exec_mode(mode_def, *x)
            )
        )
        last = None
        for (version, mode), query_iter in group:
            status.report()
            vnum = version_string_to_list(version)
            envs = config.VERSIONS if mode.anaconda else config.RAW_VERSIONS
            env = best_match(vnum, envs)
            group = groupby(
                query_iter,
                lambda x: (x[1])
            )
            for repository, notebook_iter in group:
                if check_exit(check):
                    vprint(0, "Found .exit file. Exiting")
                    return
                current = (env, repository) if mode.dependencies else env
                if last != current:
                    prepared = prepare_environment(
                        session, env, mode, version, notebook_iter,
                        mode_def, skip_env, notebook_exec_mode, dry_run, out, err
                    )
                    if not prepared:
                        continue
                last = None if mode.dependencies else current
                result = execute_repository(
                    status, session, repository, notebook_iter,
                    mode, env, skip_extract, notebook_exec_mode, dry_run, out, err
                )
                vprint(2, result)
                session.commit()