Example #1
0
def apply(session, status, skip_if_error, count, interval, reverse, check):
    """Extract markdown features from markdown cells still pending processing

    A cell is selected when its ``processed`` bitmask has neither
    ``consts.C_PROCESS_OK`` nor any bit of ``skip_if_error`` set and its
    ``cell_type`` is ``'markdown'``.

    Arguments:
    session -- database session used to query cells and commit results
    status -- progress object; ``report()`` is called for every cell and
              ``count`` is incremented after each processed cell
    skip_if_error -- bitmask of ``Cell.processed`` flags that exclude a cell
    count -- when truthy, only print the number of selected cells and return
    interval -- optional pair ``(first, last)`` bounding ``Cell.repository_id``
                (both ends inclusive)
    reverse -- when truthy, visit repositories in descending id order
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    filters = [
        Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
        Cell.processed.op('&')(skip_if_error) == 0,
        Cell.cell_type == 'markdown',
    ]
    if interval:
        filters += [
            Cell.repository_id >= interval[0],
            Cell.repository_id <= interval[1],
        ]

    query = session.query(Cell).filter(*filters)

    if count:
        print(query.count())
        return

    # Only the repository direction depends on ``reverse``; notebooks and
    # cells inside a repository are always visited in ascending order
    if reverse:
        repository_order = Cell.repository_id.desc()
    else:
        repository_order = Cell.repository_id.asc()
    query = query.order_by(
        repository_order,
        Cell.notebook_id.asc(),
        Cell.index.asc(),
    )

    repository_id = None
    notebook_id = None
    for cell in query:
        if check_exit(check):
            # Commits otherwise happen only when the repository changes, so
            # flush the work done on the current repository before leaving
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()
        if repository_id != cell.repository_id:
            # Repository boundary: persist everything done for the previous one
            session.commit()
            repository_id = cell.repository_id
            vprint(0, 'Processing repository: {}'.format(repository_id))
        if notebook_id != cell.notebook_id:
            notebook_id = cell.notebook_id
            vprint(1, 'Processing notebook: {}'.format(notebook_id))
        vprint(2, 'Processing cell: {}/[{}]'.format(cell.id, cell.index))
        result = process_markdown_cell(
            session, repository_id, notebook_id, cell, skip_if_error
        )
        vprint(2, result)
        status.count += 1
    session.commit()
Example #2
0
def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories that are not yet flagged R_COMPRESS_OK

    Arguments:
    session -- database session used to query repositories and commit
    status -- progress object; ``report()`` is called for every repository
              and ``count`` is incremented after each one
    keep -- when falsy, the repository directory is removed once a zip exists
    count -- when truthy, only print the number of selected repositories
    interval -- optional pair ``(first, last)`` bounding ``Repository.id``
                (both ends inclusive)
    reverse -- when truthy, visit repositories in descending id order
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    # Select repositories whose processed bitmask lacks R_COMPRESS_OK
    filters = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = session.query(Repository).filter(*filters)
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())

    for repository in query:
        # No commit is needed here: every iteration ends with its own commit
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                # Flag repositories whose checkout no longer matches the
                # commit recorded in the database
                if repository.path.exists():
                    commit = repository.get_commit()
                    if commit != repository.commit:
                        repository.processed |= consts.R_COMMIT_MISMATCH

                # Set the error flag up front; both branches below clear it
                # again, so it stays set when an exception interrupts this
                # block (or when neither branch runs)
                repository.processed |= consts.R_COMPRESS_ERROR
                # compress() is only attempted when no zip exists yet
                if repository.zip_path.exists() or repository.compress():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not keep:
                        shutil.rmtree(str(repository.path), ignore_errors=True)
                elif not repository.zip_path.exists():
                    # compress() returned a falsy value and produced no zip
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")
                # Success is decided solely by the zip being present
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                # Best effort: report and carry on; the flags set so far are
                # still stored by the commit below
                vprint(1, "Failed: {}".format(err))
        session.add(repository)
        status.count += 1
        session.commit()
def apply(
    session, status, skip_if_error,
    count, interval, reverse, check
):
    """Run process_notebook over notebooks not yet flagged N_AGGREGATE_OK

    Notebooks carrying any bit of ``skip_if_error`` or the
    ``consts.N_GENERIC_LOAD_ERROR`` flag are left out. With ``count`` set,
    only the number of selected notebooks is printed. ``interval`` bounds
    ``Notebook.repository_id`` inclusively and ``reverse`` flips the order.
    """
    def flag_clear(mask):
        # SQL condition: no bit of ``mask`` is set in Notebook.processed
        return Notebook.processed.op("&")(mask) == 0

    conditions = [
        flag_clear(consts.N_AGGREGATE_OK),
        flag_clear(skip_if_error),
        flag_clear(consts.N_GENERIC_LOAD_ERROR),
    ]
    if interval:
        conditions.append(Notebook.repository_id >= interval[0])
        conditions.append(Notebook.repository_id <= interval[1])

    pending = session.query(Notebook).filter(*conditions)

    if count:
        print(pending.count())
        return

    if reverse:
        ordering = (Notebook.repository_id.desc(), Notebook.id.desc())
    else:
        ordering = (Notebook.repository_id.asc(), Notebook.id.asc())
    pending = pending.order_by(*ordering)

    # Threaded through load_repository so it can track the active repository
    current_repository_id = None

    for notebook in pending:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        current_repository_id = load_repository(
            session, notebook, current_repository_id
        )

        vprint(1, 'Processing notebook: {}'.format(notebook))
        outcome = process_notebook(session, notebook, skip_if_error)
        vprint(1, outcome)
        status.count += 1
    session.commit()
Example #4
0
def apply(
    session, status,
    skip_if_error,
    count, interval, reverse, check
):
    """Run process_repository over compressed repositories pending extraction

    A repository is selected when ``consts.R_COMPRESS_OK`` is set while
    ``consts.R_EXTRACTED_FILES`` and every bit of ``skip_if_error`` are clear.
    With ``count`` set, only the number of selected repositories is printed.
    ``interval`` bounds ``Repository.id`` inclusively; ``reverse`` flips order.
    """
    def flag_clear(mask):
        # SQL condition: no bit of ``mask`` is set in Repository.processed
        return Repository.processed.op('&')(mask) == 0

    conditions = [
        flag_clear(consts.R_EXTRACTED_FILES),
        flag_clear(skip_if_error),
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,  # Compressed
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])

    pending = session.query(Repository).filter(*conditions)

    if count:
        print(pending.count())
        return

    if reverse:
        pending = pending.order_by(Repository.id.desc())
    else:
        pending = pending.order_by(Repository.id.asc())

    for repository in pending:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            outcome = process_repository(session, repository, skip_if_error)
        vprint(1, outcome)
        status.count += 1
        # Persist after every repository
        session.commit()
Example #5
0
def apply(session, status, count, interval, reverse, check):
    """Run process_cell_module over cell modules lacking a local_possibility

    With ``count`` set, only the number of selected modules is printed.
    ``interval`` bounds ``CellModule.repository_id`` inclusively and
    ``reverse`` flips the ordering on repository id and module id.
    """
    conditions = [CellModule.local_possibility.is_(None)]
    if interval:
        conditions.append(CellModule.repository_id >= interval[0])
        conditions.append(CellModule.repository_id <= interval[1])

    pending = session.query(CellModule).filter(*conditions)

    if count:
        print(pending.count())
        return

    if reverse:
        ordering = (CellModule.repository_id.desc(), CellModule.id.desc())
    else:
        ordering = (CellModule.repository_id.asc(), CellModule.id.asc())
    pending = pending.order_by(*ordering)

    # State handed to and returned by load_repository on every iteration
    skip_current = False
    current_repository_id = None
    current_archives = None

    for cell_module in pending:
        if check_exit(check):
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        skip_current, current_repository_id, current_archives = load_repository(
            session, cell_module, skip_current, current_repository_id,
            current_archives)
        if skip_current:
            # Skipped modules are not counted in status.count
            continue

        vprint(1, 'Processing module: {}'.format(cell_module))
        outcome = process_cell_module(session, cell_module, current_archives)
        vprint(1, outcome)
        status.count += 1
    session.commit()
Example #6
0
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks/cells from repositories not yet extracted

    Arguments:
    session -- database session used to query repositories and commit
    status -- progress object; ``report()`` is called for every repository
              and ``count`` is incremented after each one
    selected_repositories -- ``True`` to consider every repository (optionally
                             bounded by ``interval``) or a sequence of
                             repository ids, handled in batches of 30; a
                             falsy value selects nothing
    skip_if_error -- bitmask of ``Repository.processed`` flags that exclude
                     a repository
    count -- when truthy, only print the number of selected repositories
    interval -- optional pair ``(first, last)`` bounding ``Repository.id``
                (inclusive); only used when ``selected_repositories`` is True
    reverse -- when truthy, visit repositories in descending id order
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    # Running total for ``count`` mode; stays None until a batch is counted
    # so that an empty selection still prints nothing
    total = None
    while selected_repositories:
        filters = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0, # no extraction
            Repository.processed.op("&")(skip_if_error) == 0, # no failure
        ]
        if selected_repositories is not True:
            # Consume the explicit id list one batch of 30 at a time
            filters += [
                Repository.id.in_(selected_repositories[:30])
            ]
            selected_repositories = selected_repositories[30:]
        else:
            # Single pass over everything: make the loop end after this round
            selected_repositories = False
            if interval:
                filters += [
                    Repository.id >= interval[0],
                    Repository.id <= interval[1],
                ]

        query = session.query(Repository).filter(*filters)
        if count:
            # Accumulate instead of returning, otherwise a selection longer
            # than one batch would report only the first batch
            total = (total or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(
                Repository.id.desc()
            )
        else:
            query = query.order_by(
                Repository.id.asc()
            )

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                result = process_repository(session, repository, skip_if_error)
                vprint(0, result)
            status.count += 1
            session.commit()

    if total is not None:
        print(total)
def apply(session, status, selected_repositories, processed, no, count,
          interval, reverse, check):
    """Unzip repositories whose processed flags match the given masks

    Arguments:
    session -- database session used to query repositories and commit
    status -- progress object; ``report()`` is called for every repository
              and ``count`` is incremented after each one
    selected_repositories -- ``True`` to consider every repository (optionally
                             bounded by ``interval``) or a sequence of
                             repository ids, handled in batches of 30; a
                             falsy value selects nothing
    processed -- bitmask whose bits must all be set in ``Repository.processed``
    no -- bitmask whose bits must all be clear in ``Repository.processed``
    count -- when truthy, only print the number of selected repositories
    interval -- optional pair ``(first, last)`` bounding ``Repository.id``
                (inclusive); only used when ``selected_repositories`` is True
    reverse -- when truthy, visit repositories in descending id order
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    # Running total for ``count`` mode; stays None until a batch is counted
    # so that an empty selection still prints nothing
    total = None
    while selected_repositories:
        filters = [
            Repository.processed.op("&")(processed) ==
            processed,  # every required flag is set
            Repository.processed.op("&")(no) == 0,  # no excluded flag is set
        ]
        if selected_repositories is not True:
            # Consume the explicit id list one batch of 30 at a time
            filters += [Repository.id.in_(selected_repositories[:30])]
            selected_repositories = selected_repositories[30:]
        else:
            # Single pass over everything: make the loop end after this round
            selected_repositories = False
            if interval:
                filters += [
                    Repository.id >= interval[0],
                    Repository.id <= interval[1],
                ]

        query = session.query(Repository).filter(*filters)
        if count:
            # Accumulate instead of returning, otherwise a selection longer
            # than one batch would report only the first batch
            total = (total or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                result = unzip_repository(session, repository)
                vprint(1, result)
            status.count += 1
            session.commit()

    if total is not None:
        print(total)
def apply(
    session, repository_id, status, script_name, execution_mode, with_execution, with_dependency,
    skip_if_error, skip_if_error_mode, skip_if_troublesome, try_to_discover_files,
    skip_env, skip_extract, dry_run, mode_rules, notebook_exec_mode,
    count, interval, reverse, check
):
    """Execute the notebooks of a repository

    Selects Python notebooks (joined with their repository) for
    ``repository_id``, groups them by language version / execution mode and
    then by repository, prepares an environment when needed and calls
    ``execute_repository`` for each repository group.

    Arguments:
    session -- database session used to query and commit
    repository_id -- only notebooks of this repository are selected
    status -- progress object; ``report()`` is called per version/mode group
    script_name -- used to name the ``sub-<script>-<moment>.out/.err`` logs
    execution_mode -- index into ``EXECUTION_MODE``; ``-1`` means no fixed mode
    with_execution -- key into ``EXECUTION_RULES`` (extra filters)
    with_dependency -- key into ``DEPENDENCY_RULES`` (extra filters)
    skip_if_error, skip_if_troublesome, try_to_discover_files -- bitmasks of
        ``Repository.processed`` flags that exclude a repository
    skip_if_error_mode -- multiplied with ``mode_def.processed`` to build a
        ``Notebook.processed`` mask, or forwarded to ``mode_rules``
        (presumably 0/1 to toggle the check -- TODO confirm)
    skip_env, skip_extract, dry_run -- forwarded to the helper functions
    mode_rules -- callable returning extra filters when no mode is fixed
    notebook_exec_mode -- callable invoked as
        ``notebook_exec_mode(mode_def, notebook, repository)`` to pick a mode
    count -- when truthy, only print the number of selected rows
    interval -- optional sequence bounding ``Repository.id`` by its first
        and last items (inclusive)
    reverse -- when truthy, order repositories by descending id
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    # -1 means "no fixed mode": the mode is then decided per notebook
    mode_def = None if execution_mode == -1 else EXECUTION_MODE[execution_mode]

    filters = [
        Notebook.language == "python",
        Notebook.language_version != "unknown",
        # The first three characters of the version are used as group key below
        func.length(Notebook.language_version) > 3,
        Repository.processed.op('&')(try_to_discover_files) == 0,
        Repository.processed.op('&')(consts.R_FAILED_TO_CLONE) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(skip_if_troublesome) == 0,
        Repository.id == repository_id,
    ]

    if interval:
        # Upper bound is the LAST item of interval (not index 1)
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[-1]
        ]

    filters += EXECUTION_RULES[with_execution]
    filters += DEPENDENCY_RULES[with_dependency]

    if mode_def is None:
        filters += mode_rules(
            with_execution, with_dependency, skip_if_error_mode
        )
    else:
        # Exclude notebooks whose processed mask shares a bit with
        # mode_def.processed * skip_if_error_mode
        filters.append(
            Notebook.processed.op('&')(
                mode_def.processed * skip_if_error_mode
            ) == 0
        )

    # Each row is a (Notebook, Repository) pair
    query = (
        session.query(Notebook, Repository)
        .join(Repository)
        .filter(*filters)
    )
    if count:
        print(query.count())
        return

    # Order by: whether the summed setup/requirements/pipfile counts are
    # positive, then language version, then repository id (direction depends
    # on ``reverse``)
    if reverse:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count
            + Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.desc()
        )
    else:
        query = query.order_by(
            (Repository.setups_count + Repository.requirements_count
            + Repository.pipfile_locks_count + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.asc()
        )

    # One pair of log files per invocation, stamped with the start time
    moment = datetime.now().strftime("%Y%m%dT%H%M%S")
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)
    outf = str(config.LOGS_DIR / ("sub-{}-{}.out".format(script_name, moment)))
    errf = str(config.LOGS_DIR / ("sub-{}-{}.err".format(script_name, moment)))

    with open(outf, "wb") as out, open(errf, "wb") as err:

        # Outer grouping: (first three characters of the language version,
        # execution mode chosen for the notebook/repository pair)
        group = groupby(
            query, lambda x: (
                x[0].language_version[:3], notebook_exec_mode(mode_def, *x)
            )
        )
        # Last environment key that was prepared; None forces a preparation
        last = None
        for (version, mode), query_iter in group:
            status.report()
            vnum = version_string_to_list(version)
            envs = config.VERSIONS if mode.anaconda else config.RAW_VERSIONS
            env = best_match(vnum, envs)
            # Inner grouping by repository. Rebinding ``group`` is harmless:
            # the outer ``for`` already holds its own iterator
            group = groupby(
                query_iter,
                lambda x: (x[1])
            )
            for repository, notebook_iter in group:
                if check_exit(check):
                    vprint(0, "Found .exit file. Exiting")
                    return
                # With dependencies the key includes the repository;
                # otherwise it is the environment alone
                current = (env, repository) if mode.dependencies else env
                if last != current:
                    prepared = prepare_environment(
                        session, env, mode, version, notebook_iter,
                        mode_def, skip_env, notebook_exec_mode, dry_run, out, err
                    )
                    if not prepared:
                        # ``last`` is left untouched, so the next repository
                        # tries to prepare the environment again
                        continue
                # With dependencies never reuse: reset so the next repository
                # always goes through prepare_environment
                last = None if mode.dependencies else current
                result = execute_repository(
                    status, session, repository, notebook_iter,
                    mode, env, skip_extract, notebook_exec_mode, dry_run, out, err
                )
                vprint(2, result)
                session.commit()
Example #9
0
def apply(session, status, dispatches, selected_notebooks, skip_if_error,
          skip_if_syntaxerror, skip_if_timeout, count, interval, reverse,
          check):
    """Extract code cell features from Python code cells pending processing

    Arguments:
    session -- database session used to query cells and commit results
    status -- progress object; ``report()`` is called for every cell and
              ``count`` is incremented after each processed cell
    dispatches -- forwarded to ``load_notebook``
    selected_notebooks -- ``True`` to consider every notebook (optionally
                          bounded by ``interval``) or a sequence of notebook
                          ids, handled in batches of 30; a falsy value
                          selects nothing
    skip_if_error, skip_if_syntaxerror, skip_if_timeout -- bitmasks of
        ``Cell.processed`` flags that exclude a cell; also forwarded to
        ``process_code_cell``
    count -- when truthy, only print the number of selected cells
    interval -- optional pair ``(first, last)`` bounding ``Cell.repository_id``
                (inclusive); only used when ``selected_notebooks`` is True
    reverse -- when truthy, visit repositories in descending id order
    check -- forwarded to ``check_exit``; a truthy answer stops the run
    """
    # Running total for ``count`` mode; stays None until a batch is counted
    # so that an empty selection still prints nothing
    total = None
    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(
                consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            # Consume the explicit id list one batch of 30 at a time
            filters += [Cell.notebook_id.in_(selected_notebooks[:30])]
            selected_notebooks = selected_notebooks[30:]
        else:
            # Single pass over everything: make the loop end after this round
            selected_notebooks = False
            if interval:
                filters += [
                    Cell.repository_id >= interval[0],
                    Cell.repository_id <= interval[1],
                ]

        query = (session.query(Cell).filter(*filters))

        if count:
            # Accumulate instead of returning, otherwise a selection longer
            # than one batch would report only the first batch
            total = (total or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )

        # Per-batch state threaded through load_repository / load_notebook
        skip_repo = False
        repository_id = None
        repository = None
        archives = None

        skip_notebook = False
        notebook_id = None
        checker = None

        for cell in query:
            if check_exit(check):
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()

            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository,
                    archives)
                if skip_repo:
                    continue

                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository, skip_repo,
                    skip_notebook, notebook_id, archives, checker)
                if skip_repo or skip_notebook:
                    continue

                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session,
                    repository_id,
                    notebook_id,
                    cell,
                    checker,
                    skip_if_error,
                    skip_if_syntaxerror,
                    skip_if_timeout,
                )
                vprint(2, result)
            # Only cells that were actually processed (not skipped) are counted
            status.count += 1
        session.commit()

    if total is not None:
        print(total)