def apply(session, status, skip_if_error, count, interval, reverse, check):
    """Extract markdown features from markdown cells not yet processed.

    Selects ``Cell`` rows with ``cell_type == 'markdown'`` whose ``processed``
    bitmask has neither ``consts.C_PROCESS_OK`` nor any bit of
    ``skip_if_error`` set, and hands each one to ``process_markdown_cell``.

    Args:
        session: database session; used for ``query`` and ``commit``.
        status: object whose ``report()`` is called once per cell and whose
            ``count`` attribute is incremented after each processed cell.
        skip_if_error: bitmask; cells with any of these bits set in
            ``processed`` are excluded.
        count: when truthy, only print the number of matching cells and
            return without processing anything.
        interval: optional two-item sequence with the inclusive lower and
            upper bound of ``Cell.repository_id``.
        reverse: when truthy, visit repositories in descending id order
            (notebooks and cell indexes stay ascending).
        check: forwarded to ``check_exit`` to detect a stop request.

    Returns:
        None.
    """
    filters = [
        Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
        Cell.processed.op('&')(skip_if_error) == 0,
        Cell.cell_type == 'markdown',
    ]
    if interval:
        filters += [
            Cell.repository_id >= interval[0],
            Cell.repository_id <= interval[1],
        ]

    query = session.query(Cell).filter(*filters)

    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(
            Cell.repository_id.desc(),
            Cell.notebook_id.asc(),
            Cell.index.asc(),
        )
    else:
        query = query.order_by(
            Cell.repository_id.asc(),
            Cell.notebook_id.asc(),
            Cell.index.asc(),
        )

    repository_id = None
    notebook_id = None

    for cell in query:
        if check_exit(check):
            # Commits only happen at repository boundaries, so flush the
            # cells handled since the last boundary before stopping.
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        if repository_id != cell.repository_id:
            # Entering a new repository: persist the previous one first.
            session.commit()
            repository_id = cell.repository_id
            vprint(0, 'Processing repository: {}'.format(repository_id))

        if notebook_id != cell.notebook_id:
            notebook_id = cell.notebook_id
            vprint(1, 'Processing notebook: {}'.format(notebook_id))

        vprint(2, 'Processing cell: {}/[{}]'.format(cell.id, cell.index))
        result = process_markdown_cell(
            session, repository_id, notebook_id, cell, skip_if_error
        )
        vprint(2, result)
        status.count += 1

    # Persist the cells of the last repository visited.
    session.commit()
def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories

    Visits every ``Repository`` whose ``processed`` bitmask lacks
    ``consts.R_COMPRESS_OK`` and tries to produce its zip archive, updating
    the ``processed`` flags according to the outcome.

    Args:
        session: database session; used for ``query``, ``add`` and ``commit``.
        status: object whose ``report()`` is called once per repository and
            whose ``count`` attribute is incremented after each one.
        keep: when falsy, the repository directory is removed after a
            successful compression (or when the zip already exists).
        count: when truthy, only print the number of matching repositories
            and return.
        interval: optional two-item sequence with the inclusive lower and
            upper bound of ``Repository.id``.
        reverse: when truthy, visit repositories in descending id order.
        check: forwarded to ``check_exit`` to detect a stop request.
    """
    filters = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[1],
        ]
    query = session.query(Repository).filter(*filters)
    if count:
        print(query.count())
        return

    if reverse:
        query = query.order_by(Repository.id.desc())
    else:
        query = query.order_by(Repository.id.asc())

    for repository in query:
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                if repository.path.exists():
                    # Flag repositories whose checked-out commit differs
                    # from the one recorded in the database.
                    commit = repository.get_commit()
                    if commit != repository.commit:
                        repository.processed |= consts.R_COMMIT_MISMATCH

                # Mark as failed up front; the flag is cleared below on the
                # paths that do not raise, so it survives only if an
                # exception interrupts the attempt.
                repository.processed |= consts.R_COMPRESS_ERROR
                if repository.zip_path.exists() or repository.compress():
                    # Subtraction is guarded by the bit test so it acts as
                    # a bit clear.
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not keep:
                        shutil.rmtree(str(repository.path), ignore_errors=True)
                elif not repository.zip_path.exists():
                    # compress() returned a falsy value and no zip exists.
                    # NOTE(review): the error flag is cleared here as well;
                    # confirm that is intended for a failed compression.
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                # Best effort: report and move on; the flags set so far are
                # still persisted below.
                vprint(1, "Failed: {}".format(err))

        session.add(repository)
        status.count += 1
        session.commit()
def apply(
    session, status, skip_if_error, count,
    interval, reverse, check
):
    """Run ``process_notebook`` over notebooks still pending aggregation.

    A notebook is selected when its ``processed`` bitmask has none of
    ``consts.N_AGGREGATE_OK``, ``skip_if_error`` and
    ``consts.N_GENERIC_LOAD_ERROR`` set. ``interval`` optionally restricts
    ``Notebook.repository_id`` to an inclusive range, ``count`` switches to
    printing the number of matches only, and ``reverse`` flips the ordering
    of both repository id and notebook id to descending.
    """
    conditions = [
        Notebook.processed.op("&")(consts.N_AGGREGATE_OK) == 0,
        Notebook.processed.op("&")(skip_if_error) == 0,
        Notebook.processed.op("&")(consts.N_GENERIC_LOAD_ERROR) == 0,
    ]
    if interval:
        conditions.append(Notebook.repository_id >= interval[0])
        conditions.append(Notebook.repository_id <= interval[1])

    notebooks = session.query(Notebook).filter(*conditions)

    if count:
        print(notebooks.count())
        return

    if reverse:
        ordering = (Notebook.repository_id.desc(), Notebook.id.desc())
    else:
        ordering = (Notebook.repository_id.asc(), Notebook.id.asc())
    notebooks = notebooks.order_by(*ordering)

    current_repository_id = None
    for notebook in notebooks:
        stop_requested = check_exit(check)
        if stop_requested:
            # Keep what has been done so far before leaving.
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return

        status.report()
        current_repository_id = load_repository(
            session, notebook, current_repository_id
        )

        vprint(1, 'Processing notebook: {}'.format(notebook))
        outcome = process_notebook(session, notebook, skip_if_error)
        vprint(1, outcome)
        status.count += 1

    session.commit()
def apply(
    session, status, skip_if_error, count,
    interval, reverse, check
):
    """Run ``process_repository`` over compressed repositories.

    A repository is selected when its ``processed`` bitmask has
    ``consts.R_COMPRESS_OK`` set and has neither ``consts.R_EXTRACTED_FILES``
    nor any bit of ``skip_if_error`` set. ``interval`` optionally restricts
    ``Repository.id`` to an inclusive range, ``count`` switches to printing
    the number of matches only, and ``reverse`` visits ids in descending
    order.
    """
    conditions = [
        Repository.processed.op('&')(consts.R_EXTRACTED_FILES) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        # Only repositories that were already compressed
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])

    repositories = session.query(Repository).filter(*conditions)

    if count:
        print(repositories.count())
        return

    ordering = Repository.id.desc() if reverse else Repository.id.asc()
    repositories = repositories.order_by(ordering)

    for repository in repositories:
        stop_requested = check_exit(check)
        if stop_requested:
            # Keep what has been done so far before leaving.
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return

        status.report()
        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            outcome = process_repository(session, repository, skip_if_error)
            vprint(1, outcome)
        status.count += 1

    session.commit()
def apply(session, status, count, interval, reverse, check):
    """Run ``process_cell_module`` over cell modules lacking a local possibility.

    Selects ``CellModule`` rows whose ``local_possibility`` is NULL.
    ``interval`` optionally restricts ``CellModule.repository_id`` to an
    inclusive range, ``count`` switches to printing the number of matches
    only, and ``reverse`` flips the ordering of both repository id and
    module id to descending. Modules for which ``load_repository`` reports
    ``skip_repo`` are passed over and not counted in ``status.count``.
    """
    conditions = [CellModule.local_possibility.is_(None)]
    if interval:
        conditions.append(CellModule.repository_id >= interval[0])
        conditions.append(CellModule.repository_id <= interval[1])

    modules = session.query(CellModule).filter(*conditions)

    if count:
        print(modules.count())
        return

    if reverse:
        ordering = (CellModule.repository_id.desc(), CellModule.id.desc())
    else:
        ordering = (CellModule.repository_id.asc(), CellModule.id.asc())
    modules = modules.order_by(*ordering)

    # State threaded through load_repository from one module to the next.
    skip_repo, repository_id, archives = False, None, None

    for cell_module in modules:
        stop_requested = check_exit(check)
        if stop_requested:
            # Keep what has been done so far before leaving.
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return

        status.report()
        skip_repo, repository_id, archives = load_repository(
            session, cell_module, skip_repo, repository_id, archives
        )
        if not skip_repo:
            vprint(1, 'Processing module: {}'.format(cell_module))
            outcome = process_cell_module(session, cell_module, archives)
            vprint(1, outcome)
            status.count += 1

    session.commit()
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks/cells from repositories not yet extracted.

    Selects ``Repository`` rows whose ``processed`` bitmask has neither
    ``consts.R_N_EXTRACTION`` nor any bit of ``skip_if_error`` set, and hands
    each one to ``process_repository``.

    Args:
        session: database session; used for ``query`` and ``commit``.
        status: object whose ``report()`` is called once per repository and
            whose ``count`` attribute is incremented after each one.
        selected_repositories: ``True`` to consider every repository, or a
            sliceable sequence of repository ids. Ids are queried in batches
            of 30. A falsy value selects nothing.
        skip_if_error: bitmask; repositories with any of these bits set in
            ``processed`` are excluded.
        count: when truthy, only print the number of matching repositories
            (summed over all batches) and return without processing.
        interval: optional two-item sequence with the inclusive lower and
            upper bound of ``Repository.id``.
        reverse: when truthy, visit ids in descending order (within each
            batch).
        check: forwarded to ``check_exit`` to detect a stop request.

    Returns:
        None.
    """
    # Number of ids placed in a single IN (...) clause.
    batch_size = 30

    total = 0
    counted = False

    while selected_repositories:
        filters = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0,  # no extraction
            Repository.processed.op("&")(skip_if_error) == 0,  # no failure
        ]
        if selected_repositories is not True:
            filters += [
                Repository.id.in_(selected_repositories[:batch_size])
            ]
            selected_repositories = selected_repositories[batch_size:]
        else:
            # No explicit selection: a single unrestricted pass.
            selected_repositories = False
        if interval:
            filters += [
                Repository.id >= interval[0],
                Repository.id <= interval[1],
            ]

        query = session.query(Repository).filter(*filters)

        if count:
            # Accumulate over every batch; returning here would report only
            # the first batch of an explicit selection.
            total += query.count()
            counted = True
            continue

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                result = process_repository(session, repository, skip_if_error)
                vprint(0, result)
            status.count += 1
            session.commit()

    if counted:
        print(total)
def apply(session, status, selected_repositories, processed, no,
          count, interval, reverse, check):
    """Unzip repositories selected by their ``processed`` flags.

    Selects ``Repository`` rows whose ``processed`` bitmask has every bit of
    ``processed`` set and no bit of ``no`` set, and hands each one to
    ``unzip_repository``.

    Args:
        session: database session; used for ``query`` and ``commit``.
        status: object whose ``report()`` is called once per repository and
            whose ``count`` attribute is incremented after each one.
        selected_repositories: ``True`` to consider every repository, or a
            sliceable sequence of repository ids. Ids are queried in batches
            of 30. A falsy value selects nothing.
        processed: bitmask of flags that must all be set.
        no: bitmask of flags that must all be unset.
        count: when truthy, only print the number of matching repositories
            (summed over all batches) and return without processing.
        interval: optional two-item sequence with the inclusive lower and
            upper bound of ``Repository.id``.
        reverse: when truthy, visit ids in descending order (within each
            batch).
        check: forwarded to ``check_exit`` to detect a stop request.

    Returns:
        None.
    """
    # Number of ids placed in a single IN (...) clause.
    batch_size = 30

    total = 0
    counted = False

    while selected_repositories:
        filters = [
            Repository.processed.op("&")(processed) == processed,  # all required flags set
            Repository.processed.op("&")(no) == 0,  # no excluded flag set
        ]
        if selected_repositories is not True:
            filters += [Repository.id.in_(selected_repositories[:batch_size])]
            selected_repositories = selected_repositories[batch_size:]
        else:
            # No explicit selection: a single unrestricted pass.
            selected_repositories = False
        if interval:
            filters += [
                Repository.id >= interval[0],
                Repository.id <= interval[1],
            ]

        query = session.query(Repository).filter(*filters)

        if count:
            # Accumulate over every batch; returning here would report only
            # the first batch of an explicit selection.
            total += query.count()
            counted = True
            continue

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                result = unzip_repository(session, repository)
                vprint(1, result)
            status.count += 1
            session.commit()

    if counted:
        print(total)
def apply(
    session, repository_id, status, script_name, execution_mode, with_execution, with_dependency,
    skip_if_error, skip_if_error_mode, skip_if_troublesome, try_to_discover_files,
    skip_env, skip_extract, dry_run, mode_rules, notebook_exec_mode,
    count, interval, reverse, check
):
    """Execute repositories

    Selects the Python notebooks of repository ``repository_id`` that pass
    the flag filters below, groups them by language version / execution mode
    and then by repository, prepares an environment for each group via
    ``prepare_environment`` and runs ``execute_repository`` on it.

    Output of the sub-steps is written to two log files created under
    ``config.LOGS_DIR``: ``sub-<script_name>-<timestamp>.out`` and ``.err``.

    Args:
        session: database session; used for ``query`` and ``commit``.
        repository_id: id the ``Repository.id`` filter is pinned to.
        status: object whose ``report()`` is called once per
            (version, mode) group; also forwarded to ``execute_repository``.
        script_name: used in the log file names.
        execution_mode: index into ``EXECUTION_MODE``; ``-1`` means no fixed
            mode, in which case ``mode_rules`` supplies the filters.
        with_execution: key into ``EXECUTION_RULES``; also passed to
            ``mode_rules``.
        with_dependency: key into ``DEPENDENCY_RULES``; also passed to
            ``mode_rules``.
        skip_if_error: bitmask; repositories with any of these bits set in
            ``processed`` are excluded.
        skip_if_error_mode: passed to ``mode_rules``, or multiplied by the
            fixed mode's ``processed`` value to build the notebook filter.
        skip_if_troublesome: bitmask tested against ``Repository.processed``.
        try_to_discover_files: bitmask tested against ``Repository.processed``.
        skip_env: forwarded to ``prepare_environment``.
        skip_extract: forwarded to ``execute_repository``.
        dry_run: forwarded to both ``prepare_environment`` and
            ``execute_repository``.
        mode_rules: callable returning extra filters when no fixed mode is
            given.
        notebook_exec_mode: callable invoked as
            ``notebook_exec_mode(mode_def, notebook, repository)`` to pick the
            mode of each row; also forwarded to the helpers.
        count: when truthy, only print the number of matching rows and
            return.
        interval: optional sequence giving the inclusive bounds of
            ``Repository.id`` (first and last items are used).
        reverse: when truthy, the final ``Repository.id`` sort key is
            descending.
        check: forwarded to ``check_exit`` to detect a stop request.
    """
    # -1 selects "no fixed mode"; any other value indexes EXECUTION_MODE.
    mode_def = None if execution_mode == -1 else EXECUTION_MODE[execution_mode]

    filters = [
        Notebook.language == "python",
        Notebook.language_version != "unknown",
        # The first three characters of the version are used as a grouping
        # key further down.
        func.length(Notebook.language_version) > 3,
        Repository.processed.op('&')(try_to_discover_files) == 0,
        Repository.processed.op('&')(consts.R_FAILED_TO_CLONE) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(skip_if_troublesome) == 0,
        Repository.id == repository_id,
    ]

    if interval:
        # NOTE(review): upper bound is interval[-1] here; equivalent to
        # interval[1] only for two-item intervals.
        filters += [
            Repository.id >= interval[0],
            Repository.id <= interval[-1]
        ]

    filters += EXECUTION_RULES[with_execution]
    filters += DEPENDENCY_RULES[with_dependency]

    if mode_def is None:
        filters += mode_rules(
            with_execution, with_dependency, skip_if_error_mode
        )
    else:
        filters.append(
            Notebook.processed.op('&')(
                mode_def.processed * skip_if_error_mode
            ) == 0
        )

    query = (
        session.query(Notebook, Repository)
        .join(Repository)
        .filter(*filters)
    )

    if count:
        print(query.count())
        return

    # First sort key is the boolean "repository has at least one
    # setup/requirements/Pipfile entry"; then language version; then id.
    if reverse:
        query = query.order_by(
            (Repository.setups_count
             + Repository.requirements_count
             + Repository.pipfile_locks_count
             + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.desc()
        )
    else:
        query = query.order_by(
            (Repository.setups_count
             + Repository.requirements_count
             + Repository.pipfile_locks_count
             + Repository.pipfiles_count) > 0,
            Notebook.language_version.asc(),
            Repository.id.asc()
        )

    # Timestamped log files for the output of the sub-steps.
    moment = datetime.now().strftime("%Y%m%dT%H%M%S")
    config.LOGS_DIR.mkdir(parents=True, exist_ok=True)
    outf = str(config.LOGS_DIR / ("sub-{}-{}.out".format(script_name, moment)))
    errf = str(config.LOGS_DIR / ("sub-{}-{}.err".format(script_name, moment)))

    with open(outf, "wb") as out, open(errf, "wb") as err:
        # Each row x is a (Notebook, Repository) pair; group by the first
        # three characters of the language version and the execution mode.
        # NOTE(review): assuming this is itertools.groupby, only consecutive
        # rows with equal keys are merged, so grouping relies on the
        # ordering applied above.
        group = groupby(
            query, lambda x: (
                x[0].language_version[:3],
                notebook_exec_mode(mode_def, *x)
            )
        )
        # Last environment key prepared; used to skip repeated preparation.
        last = None
        for (version, mode), query_iter in group:
            status.report()
            vnum = version_string_to_list(version)
            envs = config.VERSIONS if mode.anaconda else config.RAW_VERSIONS
            env = best_match(vnum, envs)
            # Rebinding `group` does not affect the outer loop, which
            # already holds its own iterator.
            group = groupby(
                query_iter,
                lambda x: (x[1])
            )
            for repository, notebook_iter in group:
                if check_exit(check):
                    vprint(0, "Found .exit file. Exiting")
                    return
                # With dependencies the environment is specific to the
                # repository; otherwise it is shared per env.
                current = (env, repository) if mode.dependencies else env
                if last != current:
                    # NOTE(review): notebook_iter is passed here and to
                    # execute_repository below; if it is a single-pass
                    # groupby sub-iterator, anything consumed here is not
                    # seen again - confirm against prepare_environment.
                    prepared = prepare_environment(
                        session, env, mode, version, notebook_iter,
                        mode_def, skip_env, notebook_exec_mode, dry_run, out, err
                    )
                    if not prepared:
                        continue
                # Dependency environments are never reused: forget them so
                # the next repository prepares again.
                last = None if mode.dependencies else current
                result = execute_repository(
                    status, session, repository, notebook_iter,
                    mode, env, skip_extract, notebook_exec_mode, dry_run, out, err
                )
                vprint(2, result)
                session.commit()
def apply(session, status, dispatches, selected_notebooks,
          skip_if_error, skip_if_syntaxerror, skip_if_timeout,
          count, interval, reverse, check):
    """Extract code cell features from Python code cells not yet processed.

    Selects ``Cell`` rows with ``cell_type == 'code'`` and ``python`` true
    whose ``processed`` bitmask has none of ``consts.C_PROCESS_OK``,
    ``consts.C_UNKNOWN_VERSION`` and the three ``skip_if_*`` masks set, and
    hands each one to ``process_code_cell``.

    Args:
        session: database session; used for ``query`` and ``commit``.
        status: object whose ``report()`` is called once per cell and whose
            ``count`` attribute is incremented after each processed cell.
        dispatches: forwarded to ``load_notebook``.
        selected_notebooks: ``True`` to consider every notebook, or a
            sliceable sequence of notebook ids. Ids are queried in batches
            of 30. A falsy value selects nothing.
        skip_if_error: bitmask; cells with any of these bits set are
            excluded. Also forwarded to ``process_code_cell``.
        skip_if_syntaxerror: same, for a second mask.
        skip_if_timeout: same, for a third mask.
        count: when truthy, only print the number of matching cells (summed
            over all batches) and return without processing.
        interval: optional two-item sequence with the inclusive lower and
            upper bound of ``Cell.repository_id``.
        reverse: when truthy, visit repositories in descending id order
            (notebooks and cell indexes stay ascending).
        check: forwarded to ``check_exit`` to detect a stop request.

    Returns:
        None.
    """
    # Number of ids placed in a single IN (...) clause.
    batch_size = 30

    total = 0
    counted = False

    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(
                consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            filters += [Cell.notebook_id.in_(selected_notebooks[:batch_size])]
            selected_notebooks = selected_notebooks[batch_size:]
        else:
            # No explicit selection: a single unrestricted pass.
            selected_notebooks = False
        if interval:
            filters += [
                Cell.repository_id >= interval[0],
                Cell.repository_id <= interval[1],
            ]

        query = session.query(Cell).filter(*filters)

        if count:
            # Accumulate over every batch; returning here would report only
            # the first batch of an explicit selection.
            total += query.count()
            counted = True
            continue

        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )

        # State threaded through load_repository / load_notebook from one
        # cell to the next.
        skip_repo = False
        repository_id = None
        repository = None
        archives = None

        skip_notebook = False
        notebook_id = None
        checker = None

        for cell in query:
            if check_exit(check):
                # Keep what has been done so far before leaving.
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()

            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository, archives
                )
                if skip_repo:
                    continue

                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository,
                    skip_repo, skip_notebook, notebook_id, archives, checker
                )
                if skip_repo or skip_notebook:
                    continue

                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session, repository_id, notebook_id, cell, checker,
                    skip_if_error, skip_if_syntaxerror, skip_if_timeout,
                )
                vprint(2, result)
            status.count += 1

        session.commit()

    if counted:
        print(total)