Example #1
0
def apply(session, status, keep, count, interval, reverse, check):
    """Compress repositories that are not yet flagged as compressed.

    Selects every ``Repository`` whose ``processed`` bitmask lacks
    ``consts.R_COMPRESS_OK``, optionally restricted to the inclusive id
    range ``interval[0]``..``interval[1]``.

    When ``count`` is truthy, only the number of matching rows is printed.
    Otherwise repositories are visited by ascending id (descending when
    ``reverse`` is truthy); each one is compressed, its flags are updated and
    the session is committed before the next repository is handled. The loop
    stops early when ``check_exit(check)`` reports an exit request.

    Unless ``keep`` is truthy, the uncompressed directory is removed once an
    archive is available.
    """
    conditions = [
        Repository.processed.op('&')(consts.R_COMPRESS_OK) == 0,
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])
    pending = session.query(Repository).filter(*conditions)

    if count:
        print(pending.count())
        return

    ordering = Repository.id.desc() if reverse else Repository.id.asc()
    pending = pending.order_by(ordering)

    for repository in pending:
        if check_exit(check):
            vprint(0, "Found .exit file. Exiting")
            return
        status.report()
        vprint(0, "Compressing {}".format(repository))
        vprint(1, "Into {}".format(repository.zip_path))
        with mount_basedir():
            try:
                # Record a mismatch between the checkout and the stored commit.
                if (repository.path.exists()
                        and repository.get_commit() != repository.commit):
                    repository.processed |= consts.R_COMMIT_MISMATCH

                # Assume failure up front so the flag survives if an
                # exception interrupts the compression below.
                repository.processed |= consts.R_COMPRESS_ERROR
                if repository.zip_path.exists() or repository.compress():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not keep:
                        shutil.rmtree(str(repository.path), ignore_errors=True)
                elif not repository.zip_path.exists():
                    if repository.processed & consts.R_COMPRESS_ERROR:
                        repository.processed -= consts.R_COMPRESS_ERROR
                    if not repository.path.exists():
                        repository.processed |= consts.R_UNAVAILABLE_FILES
                    vprint(1, "failed")

                # Whatever happened above, an existing archive counts as done.
                if repository.zip_path.exists():
                    vprint(1, "ok")
                    repository.processed |= consts.R_COMPRESS_OK
            except Exception as err:
                vprint(1, "Failed: {}".format(err))
        session.add(repository)
        status.count += 1
        session.commit()
Example #2
0
def apply(
    session, status,
    skip_if_error,
    count, interval, reverse, check
):
    """Run ``process_repository`` on compressed repositories awaiting extraction.

    A repository is selected when its ``processed`` bitmask has neither
    ``consts.R_EXTRACTED_FILES`` nor any bit of ``skip_if_error`` set, and
    does have ``consts.R_COMPRESS_OK`` set. ``interval``, when given, limits
    the selection to the inclusive id range ``interval[0]``..``interval[1]``.

    With a truthy ``count`` the number of matches is printed and nothing is
    processed. Otherwise repositories are handled by ascending id (descending
    when ``reverse`` is truthy), committing after each one, until the query is
    exhausted or ``check_exit(check)`` requests a stop.
    """
    conditions = [
        Repository.processed.op('&')(consts.R_EXTRACTED_FILES) == 0,
        Repository.processed.op('&')(skip_if_error) == 0,
        Repository.processed.op('&')(consts.R_COMPRESS_OK) != 0,  # Compressed
    ]
    if interval:
        conditions.append(Repository.id >= interval[0])
        conditions.append(Repository.id <= interval[1])

    candidates = session.query(Repository).filter(*conditions)

    if count:
        print(candidates.count())
        return

    ordering = Repository.id.desc() if reverse else Repository.id.asc()
    candidates = candidates.order_by(ordering)

    for repository in candidates:
        if check_exit(check):
            # Flush whatever is pending before honouring the exit request.
            session.commit()
            vprint(0, 'Found .exit file. Exiting')
            return
        status.report()

        vprint(0, 'Processing repository: {}'.format(repository))
        with mount_basedir():
            outcome = process_repository(session, repository, skip_if_error)
        vprint(1, outcome)
        status.count += 1
        session.commit()
Example #3
0
def github_crawler(github_url):
    """Load the repository at ``github_url`` and run the pipeline stages on it.

    The repository is loaded through
    ``load_repository.load_repository_from_url`` and then handed, in order, to
    the ``s1``..``s7`` stage modules followed by the ``p0``..``p2`` modules.
    Stages ``s1`` and ``s2`` are restricted to the loaded repository id; the
    remaining stages receive no id selection here except ``s7``, which is
    passed the id directly.

    Args:
        github_url: URL of the repository to load.

    Returns:
        The database id of the loaded repository.
    """
    script_name = None
    count = None
    interval = None
    reverse = None
    # Must be a collection of words: ``set('all')`` would split the string
    # into the characters {'a', 'l'} instead of producing {'all'}.
    check = ['all']
    # NOTE(review): this is the *string* 'False', which is truthy. If the
    # compress stage tests it with ``not keep`` the uncompressed directory is
    # kept. Value left unchanged; confirm whether a boolean was intended.
    keep_uncompressed = 'False'
    dispatches = set()
    skip_env = False
    skip_extract = 0
    dry_run = 0
    status = StatusLogger(script_name)
    status.report()

    with connect() as session, mount_basedir(), savepid():
        repository = load_repository.load_repository_from_url(
            session, github_url)
        # Each stage gets its own fresh list / set so no stage can observe
        # another stage's mutations.
        s1_notebooks_and_cells.apply(
            SafeSession(session, interrupted=consts.N_STOPPED), status,
            [repository.id], consts.R_N_ERROR, count, interval, reverse,
            set(check))
        s2_requirement_files.apply(session, status, [repository.id],
                                   consts.R_REQUIREMENTS_ERROR, count,
                                   interval, reverse, set(check))
        s3_compress.apply(session, status, keep_uncompressed, count, interval,
                          reverse, set(check))
        s4_markdown_features.apply(session, status, consts.C_PROCESS_ERROR,
                                   count, interval, reverse, set(check))
        s5_extract_files.apply(session, status, consts.R_COMPRESS_ERROR, count,
                               interval, reverse, set(check))
        s6_cell_features.apply(SafeSession(session), status, dispatches, True,
                               consts.C_PROCESS_ERROR, consts.C_SYNTAX_ERROR,
                               consts.C_TIMEOUT, count, interval, reverse,
                               set(check))
        s7_execute_repositories.apply(
            session, repository.id, status, script_name, config.EXECUTION_MODE,
            config.WITH_EXECUTION, config.WITH_DEPENDENCY,
            consts.R_COMPRESS_ERROR, 3, consts.R_TROUBLESOME,
            consts.R_UNAVAILABLE_FILES, skip_env, skip_extract, dry_run,
            mode_rules, s7_execute_repositories.notebook_exec_mode, count,
            interval, reverse, set(check))
        p0_local_possibility.apply(session, status, count, interval, reverse,
                                   set(check))
        p1_notebook_aggregate.apply(session, status, consts.N_AGGREGATE_ERROR,
                                    count, interval, reverse, set(check))
        p2_sha1_exercises.apply(session, status, count, interval, reverse,
                                set(check))
        return repository.id
Example #4
0
def apply(
    session, status, selected_repositories, skip_if_error,
    count, interval, reverse, check
):
    """Extract notebooks/cells from repositories not yet processed.

    Args:
        session: database session used for querying and committing.
        status: progress object; ``report()`` is called and ``count`` is
            incremented once per processed repository.
        selected_repositories: ``True`` to consider every repository
            (optionally limited by ``interval``), or a sequence of repository
            ids, which is consumed in batches of 30 ids per query. A falsy
            value makes the function a no-op.
        skip_if_error: bitmask; repositories with any of these bits set in
            ``processed`` are skipped. Also forwarded to
            ``process_repository``.
        count: when truthy, print the number of matching repositories instead
            of processing them.
        interval: optional inclusive ``(first_id, last_id)`` pair, only used
            when ``selected_repositories`` is ``True``.
        reverse: process by descending id when truthy, ascending otherwise.
        check: forwarded to ``check_exit`` to detect an exit request.
    """
    batch_size = 30
    # Running total for count mode. Stays None until a batch was counted so
    # that an empty selection prints nothing, as before.
    matched = None

    while selected_repositories:
        filters = [
            Repository.processed.op("&")(consts.R_N_EXTRACTION) == 0,  # no extraction
            Repository.processed.op("&")(skip_if_error) == 0,  # no failure
        ]
        if selected_repositories is not True:
            filters += [
                Repository.id.in_(selected_repositories[:batch_size])
            ]
            selected_repositories = selected_repositories[batch_size:]
        else:
            # Single pass over the whole table: make the loop end afterwards.
            selected_repositories = False
            if interval:
                filters += [
                    Repository.id >= interval[0],
                    Repository.id <= interval[1],
                ]

        query = session.query(Repository).filter(*filters)
        if count:
            # Accumulate over every batch; returning here would report only
            # the first 30 selected ids.
            matched = (matched or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Extracting notebooks/cells from {}".format(repository))
            with mount_basedir():
                result = process_repository(session, repository, skip_if_error)
                vprint(0, result)
            status.count += 1
            session.commit()

    if matched is not None:
        print(matched)
Example #5
0
def main():
    """Parse the command line and load the requested repository.

    Stores the chosen verbosity in ``config.VERBOSE`` and then calls
    ``load_repository_from_url`` with the URL, branch, commit and
    clone-existing options inside a database session.
    """
    parser = argparse.ArgumentParser(description="Load Repository by URL")
    add_option = parser.add_argument
    add_option("url", type=str, help="repository URL")
    add_option(
        "-v", "--verbose",
        type=int,
        default=config.VERBOSE,
        help="increase output verbosity",
    )
    add_option("-b", "--branch", type=str, help="specific branch")
    add_option("-c", "--commit", type=str, help="specific commit")
    add_option(
        "-e", "--clone-existing",
        action='store_true',
        help="clone even if repository exists",
    )

    options = parser.parse_args()
    config.VERBOSE = options.verbose
    with connect() as session, mount_basedir(), savepid():
        load_repository_from_url(
            session,
            options.url,
            options.branch,
            options.commit,
            options.clone_existing,
        )
def apply(session, status, selected_repositories, processed, no, count,
          interval, reverse, check):
    """Unzip the repositories that match the given ``processed`` bitmasks.

    Args:
        session: database session used for querying and committing.
        status: progress object; ``report()`` is called and ``count`` is
            incremented once per handled repository.
        selected_repositories: ``True`` to consider every repository
            (optionally limited by ``interval``), or a sequence of repository
            ids, which is consumed in batches of 30 ids per query. A falsy
            value makes the function a no-op.
        processed: bitmask whose bits must all be set in
            ``Repository.processed``.
        no: bitmask whose bits must all be clear in ``Repository.processed``.
        count: when truthy, print the number of matching repositories instead
            of unzipping them.
        interval: optional inclusive ``(first_id, last_id)`` pair, only used
            when ``selected_repositories`` is ``True``.
        reverse: handle by descending id when truthy, ascending otherwise.
        check: forwarded to ``check_exit`` to detect an exit request.
    """
    batch_size = 30
    # Running total for count mode. Stays None until a batch was counted so
    # that an empty selection prints nothing, as before.
    matched = None

    while selected_repositories:
        filters = [
            Repository.processed.op("&")(processed) ==
            processed,  # every required bit is set
            Repository.processed.op("&")(no) == 0,  # no excluded bit is set
        ]
        if selected_repositories is not True:
            filters += [Repository.id.in_(selected_repositories[:batch_size])]
            selected_repositories = selected_repositories[batch_size:]
        else:
            # Single pass over the whole table: make the loop end afterwards.
            selected_repositories = False
            if interval:
                filters += [
                    Repository.id >= interval[0],
                    Repository.id <= interval[1],
                ]

        query = session.query(Repository).filter(*filters)
        if count:
            # Accumulate over every batch; returning here would report only
            # the first 30 selected ids.
            matched = (matched or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(Repository.id.desc())
        else:
            query = query.order_by(Repository.id.asc())

        for repository in query:
            if check_exit(check):
                vprint(0, "Found .exit file. Exiting")
                return
            status.report()
            vprint(0, "Unzipping {}".format(repository))
            with mount_basedir():
                result = unzip_repository(session, repository)
                vprint(1, result)
            status.count += 1
            session.commit()

    if matched is not None:
        print(matched)
Example #7
0
def get_notebook(repository_id, notebook_id):
    """Convert a stored notebook to RDF and save it next to the repository.

    Looks up the repository and the notebook record, reads the notebook file
    (from the repository's directory under ``config.EXECUTION_DIR`` when both
    the repository path and that directory exist, otherwise from the
    repository path), converts it with ``nb2rdf.NBToRDFConverter`` and writes
    the result to ``<repository.path>/<notebook name>.ttl``.

    Args:
        repository_id: id of the repository that owns the notebook.
        notebook_id: id of the notebook to convert.

    Returns:
        A ``(rdf_text, notebook_name)`` tuple. ``rdf_text`` is the empty
        string when the notebook file could not be opened, and holds the
        converted RDF when only writing the ``.ttl`` file failed.

    Raises:
        AttributeError: if no repository or notebook row matches the given
            ids (the lookups yield ``None``).
    """
    with connect() as session:
        nbconvert_rdf = ''
        repository = (
            session.query(Repository)
            .filter(Repository.id == repository_id)
            .first()
        )
        notebook_record = (
            session.query(Notebook)
            .filter(
                Notebook.id == notebook_id,
                Notebook.repository_id == repository_id,
            )
            .first()
        )
        name = notebook_record.name

        with mount_basedir():
            # Default to the repository checkout so the path is always bound;
            # a missing directory then surfaces as an OSError from open()
            # below and takes the regular failure path.
            notebook_path = repository.path
            if repository.path.exists():
                execution_path = (config.EXECUTION_DIR / repository.hash_dir2)
                if os.path.exists(execution_path):
                    notebook_path = execution_path

            try:
                with open(str(notebook_path / name)) as ofile:
                    notebook = ofile.read()
                nbtordfconverter = nb2rdf.NBToRDFConverter()
                notebook_json = nbformat.reads(notebook, as_version=4)
                nbconvert_rdf = nbtordfconverter.convert_to_rdf(
                    name, notebook_json)
                output_file_extension = 'ttl'
                output_file = os.path.join(
                    repository.path, name + "." + output_file_extension)
                # Context manager guarantees the output is flushed and closed.
                with open(output_file, 'w') as rdf_file:
                    rdf_file.write(str(nbconvert_rdf))
                return str(nbconvert_rdf), name
            except OSError as e:
                vprint(3, "Failed to open notebook {}".format(e))
                return str(nbconvert_rdf), name
Example #8
0
def apply(session, status, dispatches, selected_notebooks, skip_if_error,
          skip_if_syntaxerror, skip_if_timeout, count, interval, reverse,
          check):
    """Extract code cell features.

    Selects Python code cells whose ``processed`` bitmask has none of
    ``consts.C_PROCESS_OK``, ``skip_if_error``, ``skip_if_syntaxerror``,
    ``skip_if_timeout`` or ``consts.C_UNKNOWN_VERSION`` set, and runs
    ``process_code_cell`` on each of them.

    Args:
        session: database session used for querying and committing.
        status: progress object; ``report()`` is called per visited cell and
            ``count`` is incremented per processed cell.
        dispatches: forwarded to ``load_notebook``.
        selected_notebooks: ``True`` to consider every notebook (optionally
            limited by ``interval`` on the repository id), or a sequence of
            notebook ids, which is consumed in batches of 30 ids per query.
            A falsy value makes the function a no-op.
        skip_if_error: bitmask of cell flags that exclude a cell.
        skip_if_syntaxerror: bitmask of cell flags that exclude a cell.
        skip_if_timeout: bitmask of cell flags that exclude a cell.
        count: when truthy, print the number of matching cells instead of
            processing them.
        interval: optional inclusive ``(first_id, last_id)`` pair of
            repository ids, only used when ``selected_notebooks`` is ``True``.
        reverse: order repositories by descending id when truthy; notebooks
            and cell indexes are always ascending.
        check: forwarded to ``check_exit`` to detect an exit request.
    """
    batch_size = 30
    # Running total for count mode. Stays None until a batch was counted so
    # that an empty selection prints nothing, as before.
    matched = None

    while selected_notebooks:
        filters = [
            Cell.processed.op('&')(consts.C_PROCESS_OK) == 0,
            Cell.processed.op('&')(skip_if_error) == 0,
            Cell.processed.op('&')(skip_if_syntaxerror) == 0,
            Cell.processed.op('&')(skip_if_timeout) == 0,
            Cell.processed.op('&')(
                consts.C_UNKNOWN_VERSION) == 0,  # known version
            Cell.cell_type == 'code',
            Cell.python.is_(True),
        ]
        if selected_notebooks is not True:
            filters += [Cell.notebook_id.in_(selected_notebooks[:batch_size])]
            selected_notebooks = selected_notebooks[batch_size:]
        else:
            # Single pass over the whole table: make the loop end afterwards.
            selected_notebooks = False
            if interval:
                filters += [
                    Cell.repository_id >= interval[0],
                    Cell.repository_id <= interval[1],
                ]

        query = (session.query(Cell).filter(*filters))

        if count:
            # Accumulate over every batch; returning here would report only
            # the first 30 selected notebooks.
            matched = (matched or 0) + query.count()
            continue

        if reverse:
            query = query.order_by(
                Cell.repository_id.desc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )
        else:
            query = query.order_by(
                Cell.repository_id.asc(),
                Cell.notebook_id.asc(),
                Cell.index.asc(),
            )

        # State threaded through load_repository / load_notebook from one
        # cell to the next; reset at the start of every batch.
        skip_repo = False
        repository_id = None
        repository = None
        archives = None

        skip_notebook = False
        notebook_id = None
        checker = None

        for cell in query:
            if check_exit(check):
                session.commit()
                vprint(0, 'Found .exit file. Exiting')
                return
            status.report()

            with mount_basedir():
                skip_repo, repository_id, repository, archives = load_repository(
                    session, cell, skip_repo, repository_id, repository,
                    archives)
                if skip_repo:
                    continue

                skip_repo, skip_notebook, notebook_id, archives, checker = load_notebook(
                    session, cell, dispatches, repository, skip_repo,
                    skip_notebook, notebook_id, archives, checker)
                if skip_repo or skip_notebook:
                    continue

                vprint(2, 'Processing cell: {}'.format(cell))
                result = process_code_cell(
                    session,
                    repository_id,
                    notebook_id,
                    cell,
                    checker,
                    skip_if_error,
                    skip_if_syntaxerror,
                    skip_if_timeout,
                )
                vprint(2, result)
            status.count += 1
        # Commit once per batch rather than once per cell.
        session.commit()

    if matched is not None:
        print(matched)