def download_top_packages(
    directory: Path,
    days: Days = 365,
    workers: int = 24,
    limit: slice = slice(None),
) -> Generator[Path, None, None]:
    directory.mkdir(exist_ok=True)
    if not (directory / "info.json").exists():
        write_config(directory / "info.json", [])

    packages = get_top_packages(days)[limit]
    packages = filter_already_downloaded(directory, packages)

    caches = []
    # FIX-ME(low): get rid of try/finally and make sure
    # all exceptions are suppressed in get_package
    try:
        # FIX-ME(low): use reiz.utilities.get_executor
        with ThreadPoolExecutor(max_workers=workers) as executor:
            bound_downloader = partial(get_package, directory=directory)
            for package, package_directory in executor.map(
                bound_downloader, packages
            ):
                if package_directory is not None:
                    caches.append(package)
    finally:
        write_config(
            directory / "info.json",
            read_config(directory / "info.json") + caches,
        )
        logger.info("fetched %d projects", len(caches))
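# A minimal usage sketch for the downloader above, assuming its helpers
# (get_top_packages, get_package, write_config, ...) live in the same module.
# The "packages/" directory, the 30-day window, and the slice bounds are
# illustrative assumptions, not values from the original code.
from pathlib import Path

# Fetch only the first 100 of the top packages from the last 30 days,
# keeping the default worker count.
download_top_packages(Path("packages"), days=30, limit=slice(0, 100))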
def collect_tests(queries=QUERIES_PATH, *, log_skip=True):
    for query in queries.glob("**/*.reizql"):
        test_case = TestItem.from_test_path(query)
        if test_case.skip:
            if log_skip:
                logger.info("%r skipped", test_case.name)
            continue
        yield test_case
def worker(query, api):
    try:
        matches = len(post_request(api, query))
    except QueryError as exc:
        logger.error("Query %r failed with %s!", exc.query, exc.reason)
        return False
    else:
        logger.info("Query succeeded with %d matches!", matches)
        return True
def run_query(reiz_ql, stats=False, limit=DEFAULT_LIMIT):
    tree = parse_query(reiz_ql)
    logger.info("ReizQL Tree: %r", tree)

    selection = compile_edgeql(tree)
    if stats:
        selection = EdgeQLSelect(EdgeQLCall("count", [selection]))
    else:
        selection.limit = limit

    if tree.positional:
        selection.selections.extend(
            (
                EdgeQLSelector("lineno"),
                EdgeQLSelector("col_offset"),
                EdgeQLSelector("end_lineno"),
                EdgeQLSelector("end_col_offset"),
                EdgeQLSelector("_module", [EdgeQLSelector("filename")]),
            )
        )
    elif tree.name == "Module":
        selection.selections.append(EdgeQLSelector("filename"))
    else:
        raise Exception(f"Unexpected root matcher: {tree.name}")

    query = as_edgeql(selection)
    logger.info("EdgeQL query: %r", query)

    results = []
    with connect(**get_db_settings()) as conn:
        if stats:
            return conn.query_one(query)

        query_set = conn.query(query)
        for result in query_set:
            loc_data = {}
            if tree.positional:
                loc_data.update(
                    {
                        "filename": result._module.filename,
                        "lineno": result.lineno,
                        "col_offset": result.col_offset,
                        "end_lineno": result.end_lineno,
                        "end_col_offset": result.end_col_offset,
                    }
                )
            elif tree.name == "Module":
                loc_data.update({"filename": result.filename})

            try:
                source = fetch(**loc_data)
            except Exception:
                source = None

            results.append(
                {
                    "source": source,
                    "filename": loc_data["filename"],
                }
            )
    return results
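# A hedged usage sketch for run_query(). The pattern string below is only an
# illustrative ReizQL-style matcher; the exact grammar is defined by
# parse_query(), not by this example.
sample_query = "Call(func=Name('print'))"
for match in run_query(sample_query, limit=5):
    print(match["filename"], match["source"] is not None)

# With stats=True the selection is wrapped in count(...) and a single number
# comes back instead of a result list.
print("total matches:", run_query(sample_query, stats=True))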
def run_tests(allow_fail):
    fail = False
    with get_new_connection() as connection:
        for test_case in collect_tests():
            try:
                test_case.execute(connection)
            except ExpectationFailed:
                logger.info("%r failed", test_case.name)
            except Exception:
                logger.exception("%r terminated", test_case.name)
            else:
                logger.info("%r succeeded", test_case.name)
                continue

            if test_case.name not in allow_fail:
                fail = True
    return fail
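# A hedged sketch of driving run_tests() from a script; the empty allow-fail
# set and the exit-code mapping are illustrative assumptions, not part of the
# original runner.
if __name__ == "__main__":
    import sys

    # run_tests() returns True when an unexpected failure occurred, which
    # maps naturally onto a non-zero exit status.
    sys.exit(run_tests(allow_fail=frozenset()))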
def main():
    parser = ArgumentParser()
    parser.add_argument("--workers", type=int, default=12)
    parser.add_argument("--api", default="https://api.tree.science")
    parser.add_argument("--iterations", type=int, default=100)
    options = parser.parse_args()

    logger.info(
        "Starting %d threads to run a total of %d queries",
        options.workers,
        options.iterations,
    )
    with ThreadPoolExecutor(max_workers=options.workers) as executor:
        results = []
        dataset = random.choices(QUERIES, k=options.iterations)
        for result in executor.map(partial(worker, api=options.api), dataset):
            results.append(result)

    logger.info("Status: %d/%d", results.count(True), results.count(False))
def fetch(projects, checkout_directory, workers, force):
    with ProcessPoolExecutor(max_workers=workers) as executor:
        tasks = [
            executor.submit(
                checkout_sampling_data,
                checkout_directory,
                project,
                force,
            )
            for project in projects
        ]
        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info(
                    "%r has been checked out at %s revision",
                    project.name,
                    project.git_revision,
                )
                yield project
def insert(clean_dir, workers, **db_opts):
    cache = read_config(clean_dir / "info.json")
    random.shuffle(cache)

    connector = partial(connect, **db_opts)
    bound_inserter = partial(insert_project, connector)

    stats = []
    sync_cache(connector)
    try:
        with get_executor(workers) as executor:
            for project_path, project_stats in executor.map(
                bound_inserter, map(clean_dir.joinpath, cache)
            ):
                stats.append(project_stats)
                logger.info(
                    "%s inserted, stats: %r", project_path.name, project_stats
                )
    finally:
        total_stats = sum(stats)
        logger.info("total stats: %r", total_stats)
def get_pypi_dataset(data_file, workers=4, limit=500):
    response = json_request(PYPI_DATSET_URL)

    projects = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        tasks = [
            executor.submit(get_sampling_data, **package)
            for package in response["rows"]
        ]
        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info("Adding %s to the dataset", project.name)
                projects.append(project)
            if len(projects) >= limit:
                break

        for task in tasks:
            if not task.done():
                task.cancel()
def _execute_tasks(tasks, projects, create_tasks, global_ctx):
    global_stats = Statistics()
    while tasks:
        done, _ = futures.wait(tasks, return_when=futures.FIRST_COMPLETED)
        total_completed = len(done)
        for task in done:
            project, stats = tasks.pop(task), task.result()
            global_stats.update(stats)
            if stats[Insertion.INSERTED] == 0:
                projects.remove(project)
            logger.info("%s: %r", project.name, stats)

        if global_ctx.apply_constraints(global_stats):
            for task in tasks:
                task.cancel()
            break

        projects.rotate(total_completed)
        tasks.update(create_tasks(total_completed, tasks.values()))
    return global_stats
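# _execute_tasks() above assumes a Statistics mapping with an update() that
# accumulates counters and an Insertion enum with an INSERTED member. The
# Counter-based sketch below is an assumption about that interface, not the
# project's real definition.
from collections import Counter
from enum import Enum, auto


class Insertion(Enum):
    INSERTED = auto()
    CACHED = auto()
    FAILED = auto()


class Statistics(Counter):
    """Counter keyed by Insertion members; update() merges per-task stats."""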
def clean(dirty_dir: Path, clean_dir: Path, workers: int) -> None:
    cache = read_config(clean_dir / "info.json")
    projects = read_config(dirty_dir / "info.json")

    project_paths = {}
    for directory in dirty_dir.iterdir():
        project_name, *version = directory.name.rsplit("-", 1)
        if directory.is_file() or project_name not in projects:
            continue
        project_paths[project_name] = directory

    results = []
    # FIX-ME(low): use reiz.utilities.get_executor
    with ProcessPoolExecutor(max_workers=workers) as executor:
        bound_extractor = partial(extract, clean_dir=clean_dir)
        for project_name, destination_dir in executor.map(
            bound_extractor,
            filter(
                lambda item: item[0] not in cache,
                project_paths.items(),
            ),
        ):
            if destination_dir is None:
                logger.debug("extraction failed for project %r", project_name)
            else:
                logger.debug(
                    "project %r successfully extracted to %s",
                    project_name,
                    destination_dir,
                )
                results.append(project_name)

    cache.extend(results)
    logger.info(
        "cleaned %d packages (all-time: %d/%d)",
        len(results),
        len(cache),
        len(projects),
    )
    write_config(clean_dir / "info.json", cache)
def insert_project(connector, directory):
    inserted, cached, failed = 0, 0, 0
    with connector() as connection:
        for file in directory.glob("**/*.py"):
            filename = str(file)
            if filename in FILE_CACHE:
                cached += 1
                continue

            try:
                insert_file(connection, file)
            except ArithmeticError:
                failed += 1
                logger.info(
                    "%s couldn't be inserted due to an edgedb related failure",
                    file,
                )
            except Exception:
                failed += 1
                logger.exception("%s couldn't be inserted", file)
            else:
                inserted += 1
                logger.info("%s successfully inserted", file)

    return directory, Stats(cached=cached, failed=failed, inserted=inserted)
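# The project's actual Stats type isn't shown in this section; the dataclass
# below is a hedged sketch of the minimal interface insert_project() and
# insert() rely on: keyword construction plus field-wise addition, including
# __radd__ so that sum(stats), which starts from 0, works in insert() above.
from dataclasses import dataclass


@dataclass
class Stats:
    cached: int = 0
    failed: int = 0
    inserted: int = 0

    def __add__(self, other):
        if not isinstance(other, Stats):
            return NotImplemented
        return Stats(
            cached=self.cached + other.cached,
            failed=self.failed + other.failed,
            inserted=self.inserted + other.inserted,
        )

    def __radd__(self, other):
        # sum() starts with the integer 0; treat it as an empty Stats.
        if other == 0:
            return self
        return NotImplemented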
        for task in futures.as_completed(tasks):
            if project := task.result():
                logger.info("Adding %s to the dataset", project.name)
                projects.append(project)
            if len(projects) >= limit:
                break

        for task in tasks:
            if not task.done():
                task.cancel()

    logger.info(
        "%d repositories have been added to %s",
        len(projects),
        str(data_file),
    )
    dump_dataset(data_file, projects)


def main():
    parser = ArgumentParser()
    parser.add_argument("data_file", type=Path)
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--limit", type=int, default=500)
    options = parser.parse_args()
    get_pypi_dataset(**vars(options))


if __name__ == "__main__":
    main()
def drop_and_load_db(schema, reboot_server=True):
    if reboot_server:
        drop_all_connection(config.database.cluster)
        logger.info("Successfully rebooted...")

    with get_new_connection(database="edgedb") as connection:
        with suppress(InvalidReferenceError):
            connection.execute(f"DROP DATABASE {config.database.database}")

        logger.info("Creating the database %s...", config.database.database)
        connection.execute(f"CREATE DATABASE {config.database.database}")
        logger.info("Database created...")

    with get_new_connection() as connection:
        with open(schema) as stream:
            content = stream.read()

        logger.info("Executing schema on %s...", connection.dbname)
        connection.execute(content)

        logger.info("Starting migration...")
        connection.execute("POPULATE MIGRATION")

        logger.info("Committing the schema...")
        connection.execute("COMMIT MIGRATION")

    logger.info("Successfully reset!")
def drop_all_connection(cluster):
    logger.info("Stopping the server...")
    subprocess.run(SERVER_MANAGER + ["stop", cluster])
    logger.info("Re-starting the server...")
    subprocess.check_call(SERVER_MANAGER + ["start", cluster])
def create_db():
    if does_db_exist():
        logger.info("database exists, doing nothing...")
    else:
        subprocess.check_call(
            ["/bin/bash", "scripts/regen_db.sh"], cwd=SCRIPTS_DIR.parent
        )
    update_filter = IR.filter(
        IR.attribute(None, "id"),
        IR.call(
            "array_unpack", [IR.cast("array<uuid>", IR.variable("ids"))]
        ),
        "IN",
    )
    for base_type in Schema.module_annotated_types:
        update = IR.update(
            base_type.kind_name,
            filters=update_filter,
            assignments={"_module": module_select},
        )
        context.connection.query(
            IR.construct(update), ids=context.reference_pool
        )

    logger.info("%r has been inserted successfully", context.filename)
    context.cache()
    return Insertion.INSERTED


def insert_project(project, *, global_ctx):
    with global_ctx.pool.new_connection() as connection:
        project_ctx = global_ctx.new_child(project, connection)
        if not project_ctx.is_cached():
            apply_ast(project_ctx.as_ast(), project_ctx)
            project_ctx.cache()

        stats = Statistics()
        for file in project_ctx.path.glob("**/*.py"):
            if project_ctx.apply_constraints(stats):
                break
def expect(self, message, left, right, truth):
    if not truth:
        logger.info(
            "(%s) %s: (left=%s, right=%s)", self.name, message, left, right
        )
        raise ExpectationFailed