Example #1
def download_top_packages(
        directory: Path,
        days: Days = 365,
        workers: int = 24,
        limit: slice = slice(None),
) -> Generator[Path, None, None]:
    directory.mkdir(exist_ok=True)
    if not (directory / "info.json").exists():
        write_config(directory / "info.json", [])

    packages = get_top_packages(days)[limit]
    packages = filter_already_downloaded(directory, packages)
    caches = []
    # FIX-ME(low): get rid of try/finally and make sure
    # all exceptions are suppressed in get_package
    try:
        # FIX-ME(low): use reiz.utilities.get_executor
        with ThreadPoolExecutor(max_workers=workers) as executor:
            bound_downloader = partial(get_package, directory=directory)
            for package, package_directory in executor.map(
                    bound_downloader, packages):
                if package_directory is not None:
                    caches.append(package)
    finally:
        write_config(
            directory / "info.json",
            read_config(directory / "info.json") + caches,
        )
    logger.info("fetched %d projects", len(caches))
Example #2
def collect_tests(queries=QUERIES_PATH, *, log_skip=True):
    for query in queries.glob("**/*.reizql"):
        test_case = TestItem.from_test_path(query)
        if test_case.skip:
            if log_skip:
                logger.info("%r skipped", test_case.name)
            continue
        yield test_case
Example #3
def worker(query, api):
    try:
        matches = len(post_request(api, query))
    except QueryError as exc:
        logger.error("Query %r failed with %s!", exc.query, exc.reason)
        return False
    else:
        logger.info("Query succeed with %d matches!", matches)
        return True
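
The except block above relies on exc.query and exc.reason; a minimal QueryError carrying those attributes might be shaped like this (hypothetical sketch, not the actual definition raised by post_request):

class QueryError(Exception):
    # Hypothetical: bundles the failing query and a human-readable reason,
    # matching the attributes read in the handler above.
    def __init__(self, query, reason):
        super().__init__(reason)
        self.query = query
        self.reason = reason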
Example #4
def run_query(reiz_ql, stats=False, limit=DEFAULT_LIMIT):
    tree = parse_query(reiz_ql)
    logger.info("ReizQL Tree: %r", tree)

    selection = compile_edgeql(tree)
    if stats:
        selection = EdgeQLSelect(EdgeQLCall("count", [selection]))
    else:
        selection.limit = limit
        if tree.positional:
            selection.selections.extend((
                EdgeQLSelector("lineno"),
                EdgeQLSelector("col_offset"),
                EdgeQLSelector("end_lineno"),
                EdgeQLSelector("end_col_offset"),
                EdgeQLSelector("_module", [EdgeQLSelector("filename")]),
            ))
        elif tree.name == "Module":
            selection.selections.append(EdgeQLSelector("filename"))
        else:
            raise Exception(f"Unexpected root matcher: {tree.name}")

    query = as_edgeql(selection)
    logger.info("EdgeQL query: %r", query)

    results = []
    with connect(**get_db_settings()) as conn:
        if stats:
            return conn.query_one(query)

        query_set = conn.query(query)

        for result in query_set:
            loc_data = {}
            if tree.positional:
                loc_data.update({
                    "filename": result._module.filename,
                    "lineno": result.lineno,
                    "col_offset": result.col_offset,
                    "end_lineno": result.end_lineno,
                    "end_col_offset": result.end_col_offset,
                })
            elif tree.name == "Module":
                loc_data.update({"filename": result.filename})

            try:
                source = fetch(**loc_data)
            except Exception:
                source = None

            results.append({
                "source": source,
                "filename": loc_data["filename"],
            })

    return results
Example #5
def run_tests(allow_fail):
    fail = False
    with get_new_connection() as connection:
        for test_case in collect_tests():
            try:
                test_case.execute(connection)
            except ExpectationFailed:
                logger.info("%r failed", test_case.name)
            except Exception:
                logger.exception("%r terminated", test_case.name)
            else:
                logger.info("%r succeed", test_case.name)
                continue

            if test_case.name not in allow_fail:
                fail = True
    return fail
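
run_tests returns True only when a test outside allow_fail fails, so a command-line entry point could turn that flag directly into an exit status (hypothetical wiring, not part of the shown source):

if __name__ == "__main__":
    # Hypothetical: exit with a non-zero status when any non-allowed
    # test case failed.
    raise SystemExit(run_tests(allow_fail=frozenset()))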
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument("--workers", type=int, default=12)
    parser.add_argument("--api", default="https://api.tree.science")
    parser.add_argument("--iterations", type=int, default=100)
    options = parser.parse_args()
    logger.info(
        "Starting %d threads for running total of %d queries",
        options.workers,
        options.iterations,
    )
    with ThreadPoolExecutor(max_workers=options.workers) as executor:
        results = []
        dataset = random.choices(QUERIES, k=options.iterations)
        for result in executor.map(partial(worker, api=options.api), dataset):
            results.append(result)
            logger.info("Status: %d/%d", results.count(True),
                        results.count(False))
Example #7
def fetch(projects, checkout_directory, workers, force):
    with ProcessPoolExecutor(max_workers=workers) as executor:
        tasks = [
            executor.submit(
                checkout_sampling_data,
                checkout_directory,
                project,
                force,
            ) for project in projects
        ]
        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info(
                    "%r has been checked at %s revision",
                    project.name,
                    project.git_revision,
                )
                yield project
Example #8
def insert(clean_dir, workers, **db_opts):
    cache = read_config(clean_dir / "info.json")
    random.shuffle(cache)
    connector = partial(connect, **db_opts)
    bound_inserter = partial(insert_project, connector)

    stats = []
    sync_cache(connector)
    try:
        with get_executor(workers) as executor:
            for project_path, project_stats in executor.map(
                    bound_inserter, map(clean_dir.joinpath, cache)):
                stats.append(project_stats)
                logger.info("%s inserted, stats: %r", project_path.name,
                            project_stats)
    finally:
        total_stats = sum(stats)
        logger.info("total stats: %r", total_stats)
Example #9
def get_pypi_dataset(data_file, workers=4, limit=500):
    response = json_request(PYPI_DATSET_URL)
    projects = []

    with ThreadPoolExecutor(max_workers=workers) as executor:
        tasks = [
            executor.submit(get_sampling_data, **package)
            for package in response["rows"]
        ]

        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info("Adding %s to the dataset", project.name)
                projects.append(project)

            if len(projects) >= limit:
                break

        for task in tasks:
            if not task.done():
                task.cancel()
Example #10
def _execute_tasks(tasks, projects, create_tasks, global_ctx):
    global_stats = Statistics()
    while tasks:
        done, _ = futures.wait(tasks, return_when=futures.FIRST_COMPLETED)
        total_completed = len(done)

        for task in done:
            project, stats = tasks.pop(task), task.result()
            global_stats.update(stats)
            if stats[Insertion.INSERTED] == 0:
                projects.remove(project)
            logger.info("%s: %r", project.name, stats)

        if global_ctx.apply_constraints(global_stats):
            for task in tasks:
                task.cancel()
            break

        projects.rotate(total_completed)
        tasks.update(create_tasks(total_completed, tasks.values()))
    return global_stats
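
Within this loop, Statistics only needs Counter-like update() and item access keyed by Insertion; a minimal stand-in consistent with the usage above could be (hypothetical sketch; enum members other than INSERTED are assumptions):

from collections import Counter
from enum import Enum, auto


class Insertion(Enum):
    INSERTED = auto()
    CACHED = auto()  # assumed member
    FAILED = auto()  # assumed member


class Statistics(Counter):
    # Hypothetical: a Counter keyed by Insertion, so that
    # stats[Insertion.INSERTED] and global_stats.update(stats)
    # behave as used in _execute_tasks above.
    pass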
Example #11
def clean(dirty_dir: Path, clean_dir: Path, workers: int) -> None:
    cache = read_config(clean_dir / "info.json")
    projects = read_config(dirty_dir / "info.json")
    project_paths = {}
    for directory in dirty_dir.iterdir():
        project_name, *version = directory.name.rsplit("-", 1)
        if directory.is_file() or project_name not in projects:
            continue

        project_paths[project_name] = directory

    results = []
    # FIX-ME(low): use reiz.utilities.get_executor
    with ProcessPoolExecutor(max_workers=workers) as executor:
        bound_extractor = partial(extract, clean_dir=clean_dir)
        for project_name, destination_dir in executor.map(
                bound_extractor,
                filter(
                    lambda item: item[0] not in cache,
                    project_paths.items(),
                ),
        ):
            if destination_dir is None:
                logger.debug("extraction failed for project %r", project_name)
            else:
                logger.debug(
                    "project %r successfully extracted to %s",
                    project_name,
                    destination_dir,
                )
                results.append(project_name)
    cache.extend(results)
    logger.info(
        "cleaned %d packages (all-time: %d/%d)",
        len(results),
        len(cache),
        len(projects),
    )
    write_config(clean_dir / "info.json", cache)
Example #12
def insert_project(connector, directory):
    inserted, cached, failed = 0, 0, 0
    with connector() as connection:
        for file in directory.glob("**/*.py"):
            filename = str(file)
            if filename in FILE_CACHE:
                cached += 1
                continue

            try:
                insert_file(connection, file)
            except ArithmeticError:
                failed += 1
                logger.info(
                    "%s couldn't inserted due to an edgedb related failure",
                    file,
                )
            except Exception:
                failed += 1
                logger.exception("%s couldn't be inserted", file)
            else:
                inserted += 1
                logger.info("%s successfully inserted", file)
    return directory, Stats(cached=cached, failed=failed, inserted=inserted)
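
Example #8 aggregates these per-project results with sum(stats), which starts from the integer 0; a Stats type compatible with both that call and the constructor used here could be sketched as follows (hypothetical, not the project's actual class):

from dataclasses import dataclass


@dataclass
class Stats:
    cached: int = 0
    failed: int = 0
    inserted: int = 0

    def __add__(self, other):
        if isinstance(other, int):
            # Hypothetical: lets sum() start from its default of 0.
            return self
        return Stats(
            cached=self.cached + other.cached,
            failed=self.failed + other.failed,
            inserted=self.inserted + other.inserted,
        )

    __radd__ = __add__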
Example #13
        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info("Adding %s to the dataset", project.name)
                projects.append(project)

            if len(projects) >= limit:
                break

        for task in tasks:
            if not task.done():
                task.cancel()

    logger.info(
        "%d repositories have been added to the %s",
        len(projects),
        str(data_file),
    )
    dump_dataset(data_file, projects)


def main():
    parser = ArgumentParser()
    parser.add_argument("data_file", type=Path)
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--limit", type=int, default=500)
    options = parser.parse_args()
    get_pypi_dataset(**vars(options))


if __name__ == "__main__":
    main()
Example #14
def drop_and_load_db(schema, reboot_server=True):
    if reboot_server:
        drop_all_connection(config.database.cluster)
        logger.info("Successfully rebooted...")

    with get_new_connection(database="edgedb") as connection:
        with suppress(InvalidReferenceError):
            connection.execute(f"DROP DATABASE {config.database.database}")
        logger.info("Creating the database %s...", config.database.database)
        connection.execute(f"CREATE DATABASE {config.database.database}")
        logger.info("Database created...")

    with get_new_connection() as connection:
        with open(schema) as stream:
            content = stream.read()

        logger.info("Executing schema on %s...", connection.dbname)
        connection.execute(content)
        logger.info("Starting migration...")
        connection.execute("POPULATE MIGRATION")
        logger.info("Committing the schema...")
        connection.execute("COMMIT MIGRATION")

    logger.info("Successfully resetted!")
Example #15
def drop_all_connection(cluster):
    logger.info("Stopping the server...")
    subprocess.run(SERVER_MANAGER + ["stop", cluster])
    logger.info("Re-starting the server...")
    subprocess.check_call(SERVER_MANAGER + ["start", cluster])
Example #16
def create_db():
    if does_db_exist():
        logger.info("database exits, doing nothing...")
    else:
        subprocess.check_call(["/bin/bash", "scripts/regen_db.sh"],
                              cwd=SCRIPTS_DIR.parent)
Example #17
        update_filter = IR.filter(
            IR.attribute(None, "id"),
            IR.call("array_unpack",
                    [IR.cast("array<uuid>", IR.variable("ids"))]),
            "IN",
        )
        for base_type in Schema.module_annotated_types:
            update = IR.update(
                base_type.kind_name,
                filters=update_filter,
                assignments={"_module": module_select},
            )
            context.connection.query(IR.construct(update),
                                     ids=context.reference_pool)

    logger.info("%r has been inserted successfully", context.filename)
    context.cache()
    return Insertion.INSERTED


def insert_project(project, *, global_ctx):
    with global_ctx.pool.new_connection() as connection:
        project_ctx = global_ctx.new_child(project, connection)
        if not project_ctx.is_cached():
            apply_ast(project_ctx.as_ast(), project_ctx)
            project_ctx.cache()

        stats = Statistics()
        for file in project_ctx.path.glob("**/*.py"):
            if project_ctx.apply_constraints(stats):
                break
Example #18
    def expect(self, message, left, right, truth):
        if not truth:
            logger.info("(%s) %s: (left=%s, right=%s)", self.name, message,
                        left, right)
            raise ExpectationFailed
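
Within a test case, expect would be called with the comparison already evaluated; hypothetical usage (self.query and self.expected are assumptions, only the execute entry point appears in Example #5):

    def execute(self, connection):
        # Hypothetical: run the stored query and compare the match count
        # against the expected value; expect() logs the mismatch and raises
        # ExpectationFailed, which run_tests (Example #5) catches.
        results = connection.query(self.query)
        self.expect("match count", len(results), self.expected,
                    len(results) == self.expected)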