def setup(self):
    """Clone the repo, connect to the DB, create working directories, etc."""
    self._connect_db()
    repo = self._get_git_repo()
    if self._current_commit is None:
        log.warn(f"Deleting and re-cloning repo in {self._local_repo_path}")
        try:
            shutil.rmtree(self._local_repo_path)
            repo = self._get_git_repo()
        except Exception as e:
            log.error(f"Failed to repair repository: {type(e)}: {e}")
            raise e

    # check out to _target_commit if set
    if self._target_commit and self._target_commit != self._current_commit_hash:
        log.info(f"Checking out commit {self._target_commit}...")
        try:
            commit = repo.get(self._target_commit)
            log.debug(f"target commit {commit}")
            # commit might not exist for a variety of reasons (need to fetch, DNE, corrupt, etc.)
            repo.checkout_tree(commit.tree)
            repo.head.set_target(commit.id)
        except Exception as e:
            raise e
        log.info(
            f"Repo at {self._local_repo_path} now at {self._current_commit_hash}")
    elif self._target_commit and self._target_commit == self._current_commit_hash:
        log.debug(
            f"Repo in {self._local_repo_path} is already at {self._target_commit}")
def _get_git_repo(self):
    repodir = self._local_repo_path
    if not (repodir / '.git').exists():
        repodir.mkdir(mode=0o770, parents=True, exist_ok=True)
        log.info(f"cloning repo to {repodir} ...")
        return git.clone_repository(self.repo_url, repodir.resolve())
    else:
        repopath = git.discover_repository(repodir.resolve())
        return git.Repository(repopath)
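# Usage sketch, not part of the original module: it assumes `git` above is the
# pygit2 package (clone_repository, discover_repository, and Repository are
# pygit2 calls), and `self` stands in for the collector object that owns
# _local_repo_path and repo_url.
#
#   repo = self._get_git_repo()
#   head_commit = repo[repo.head.target]   # pygit2.Commit currently checked out
#   log.debug(f"HEAD is at {head_commit.id}")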
def database_init(args):
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    odb = client.db(p.path[1:], username=p.username, password=p.password)

    if args.delete:
        log.warn(f"deleting all data ...")
        # deleting old data could take a while, so run the deletions asynchronously
        jobs = []
        db = odb.begin_async_execution()
        jobs.append(db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # keep only the jobs that have not finished yet (avoids mutating the
            # list while iterating over it)
            jobs = [j for j in jobs if j.status() != 'done']
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)

    # back to non-async
    db = odb
    log.info(f"Creating collections ...")
    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)

    graph = None
    if not db.has_graph(tree_models._graph_name):
        graph = db.create_graph(tree_models._graph_name)
    else:
        graph = db.graph(tree_models._graph_name)

    edgedefs = {}
    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            log.debug(f"Added graph edges {gv}")
            edgedefs[gk] = graph.create_edge_definition(**gv)
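# Sketch of the connection-string format database_init() expects via args.db:
# urlparse() pulls the credentials from the netloc and the database name from
# the path, matching the (redacted) default in __main__ below. The host, user,
# and database values here are placeholders, not real endpoints.
#
#   from urllib.parse import urlparse
#   p = urlparse("http://wst_user:secret@localhost:8529/wst")
#   p.username    # 'wst_user'
#   p.password    # 'secret'
#   p.path[1:]    # 'wst'  (database name passed to client.db)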
def __main__():
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str, help="S-exp query to execute")
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--node-text",
                        help="Show the text content of the matched nodes",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    parsed_s_query = sexp.parseString(args.query)
    log.debug(parsed_s_query)
    r = find_nodes_by_query(parsed_s_query)
    rl = []
    for n in r:
        log.info(f"{n} in {n.file.fetch()}")
        n_sexp = node_as_sexp(n, maxdepth=3, indent=2, show_start_coords=True)
        log.info(f"{n_sexp}")
        if args.node_text:
            log.info(f"{n.text.fetch()}")
        rl.append(n)
    log.info(f"{len(rl)} results returned")
def _tqdm_node_receiver(q, en_manager):
    """This is the cross-process aggregator for non-required data.

    Even without this process the collection and analysis should run normally.
    It's mostly just used for debugging and informational output.
    """
    try:
        log.debug(f"start counting db inserts...")
        n = 0
        cache_stats = {
            "text_lfu_hit": 0,
            "text_lfu_miss": 0,
        }
        dedup_stats = {}
        cntr = en_manager.counter(desc="writing to db",
                                  position=1,
                                  unit='docs',
                                  autorefresh=True)
        # with tqdm(desc="writing documents to db", position=1, unit='docs', unit_scale=True) as tbar:
        while (nc := q.get()) is not None:
            if type(nc) == int:
                n += nc
                cntr.update(nc)
            elif nc[0] == "cache_stats":
                for k, v in nc[1].items():
                    cache_stats[k] += v
            elif nc[0] == "dedup_stats":
                if nc[1] not in dedup_stats:
                    dedup_stats[nc[1]] = 0
                dedup_stats[nc[1]] += nc[2]
            else:
                log.error(
                    f"node receiver process got invalid data sent of type {type(nc)}")
        log.info(f"stopped counting nodes, total documents inserted: {n}")
        cache_text_lfu_ratio = cache_stats["text_lfu_hit"] / (
            cache_stats["text_lfu_miss"] or 1)
        log.debug(
            f"text_lfu cache stats: ratio {cache_text_lfu_ratio}, hit {cache_stats['text_lfu_hit']}")
        return True
    except Exception as e:
        # assumed handler (the original excerpt ends without one): the receiver
        # is best-effort, so log the failure and exit without taking the main
        # collection down
        log.error(f"node receiver stopped: {type(e)}: {e}")
        return False
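# Producer-side sketch of the queue protocol _tqdm_node_receiver() consumes,
# inferred from the branches above: plain ints advance the document counter,
# ("cache_stats", dict) and ("dedup_stats", key, count) tuples feed the stats,
# and None shuts the receiver down. The queue name and the "WSTText" key are
# illustrative only.
#
#   node_q.put(25)                                                    # 25 documents inserted
#   node_q.put(("cache_stats", {"text_lfu_hit": 3, "text_lfu_miss": 1}))
#   node_q.put(("dedup_stats", "WSTText", 7))
#   node_q.put(None)                                                  # stop the receiver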
def slow_worker(name, orig_jobitems, itemtime, en_manager):
    """I WANT: a function I can call to retrieve a proxy for a counter."""
    jobitems = random.randrange(orig_jobitems // 2, orig_jobitems * 2)
    try:
        cntr = en_manager.counter(desc=f"job {name}",
                                  total=jobitems,
                                  leave=False)
    except Exception as e:
        log.err(f"{type(e)}: {e}")
        raise
    log.info(f"job {name} started")
    for _ in range(jobitems):
        time.sleep(random.uniform(itemtime * 0.1, itemtime * 1.5))
        cntr.update()
    cntr.close()
    log.info(f"job {name} completed")
    return jobitems
def __main__():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nworkers",
                        type=int,
                        help="number of workers",
                        default=os.cpu_count())
    parser.add_argument("--jobitems",
                        type=int,
                        help="items in a single job",
                        default=200)
    parser.add_argument("--njobs",
                        type=int,
                        help="number of total jobs to complete",
                        default=200)
    parser.add_argument("--itemtime",
                        type=float,
                        help="time taken per item in a job",
                        default=0.1)
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    with ProcessPool(max_workers=args.nworkers) as executor:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        ret_futures = []
        # log.debug(f"counter_generator: {repr(counter_generator)}")
        log.info(f"Starting jobs...")
        for i in range(args.njobs):
            ret_futures.append(
                executor.schedule(
                    slow_worker,
                    (i, args.jobitems, args.itemtime, en_manager_proxy)))
        log.info(f"Waiting for jobs to complete...")
        cntr_all_jobs = en_manager.counter(desc="all jobs",
                                           total=args.njobs,
                                           color='blue')
        log.debug(f"cntr_all_jobs: {repr(cntr_all_jobs)}")
        for f in futures.as_completed(ret_futures):
            f.result()
            log.debug(f"finished a job!")
            cntr_all_jobs.update()
        log.info(f"All jobs completed!")
def __main__():
    parser = argparse.ArgumentParser()
    parser.add_argument("--db",
                        "--database",
                        type=str,
                        help="Database connection string",
                        default=os.environ.get(
                            'WST_DB_URI',
                            "http://*****:*****@localhost:8529/wst"))
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")

    # analysis
    cmd_analyze = subcmds.add_parser('analyze',
                                     aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url",
                             type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w",
        "--workers",
        type=int,
        help="Number of workers to use for processing files, default: os.cpu_count()",
        default=None)
    cmd_analyze.add_argument(
        "--skip-exists",
        "--skip-existing",
        action="store_true",
        help="Skip the analysis if the repo document already exists in the database")
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup")
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB")
    cmd_analyze.add_argument(
        "-t",
        "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None)

    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list")
    set_batch_analyze_args(cmd_batch)

    # delete data selectively
    cmd_delete = subcmds.add_parser('delete',
                                    aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete")

    # db setup
    cmd_db = subcmds.add_parser('db',
                                aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize',
                                        aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d",
        "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )

    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    log.info(f"DB connection: {desensitize_url(args.db)}")

    if 'func' not in args:
        log.warn(f"Please supply a valid subcommand!")
        return

    try:
        args.func(args)
    except KeyboardInterrupt as e:
        log.warn(f"Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise e
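# Illustrative invocations of the CLI wired up above. The entry-point name
# `wst-collector` is an assumption (substitute however this module is launched),
# and the positional repo list for `batch` is assumed since its arguments come
# from set_batch_analyze_args(); the subcommands and flags themselves match the
# parser definitions.
#
#   wst-collector analyze https://github.com/user/project.git -w 4
#   wst-collector analyze <repo_url> -t <commit-sha> --skip-exists
#   wst-collector batch repos.json
#   wst-collector delete <repo-uri-or-sha>
#   wst-collector db init --delete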
_key="wst0test0461b1c841f897cbd952354370471a64", type='test', url=f"wst.tests.insertion/{test_id}", commit="wst0test0461b1c841f897cbd952354370471a64", path=f"wst/tests/{test_id}", ) repo.insert_in_db(db) file = WSTFile( _key="wst0test0461b1c841f897cbd952354370471a64-0", oid="testwst0", path=args.file_path, language=args.language, ) with Manager() as _mp_manager: _node_queue = _mp_manager.Queue() node_receiver = wst_arango_worker._tqdm_node_receiver(_node_queue) try: r = cProfile.run( f'wst_arango_worker._process_file(file, repo, args.db, node_q=_node_queue)', "test-insertion.prof", ) log.info(f"{r}") # except KeyboardInterrupt as e: # log.warn(f"stopping collection ...") finally: _node_queue.put(None) # repo.delete()
index = self._get_git_repo().index
index.read()

# file-level processing
# files = []
with pushd(self._local_repo_path), Manager() as self._mp_manager:
    if not existing_node_q:
        self._node_queue = self._mp_manager.Queue()
        node_receiver = _tqdm_node_receiver(self._node_queue,
                                            self.en_manager_proxy)
    else:
        self._node_queue = existing_node_q
    with ProcessPool(max_workers=self._worker_count) as executor:
        self._stoppable = executor
        log.info(f"scanning git for files ...")
        ret_futures = []
        cntr_add_jobs = self.en_manager.counter(
            desc=f"scanning files for {self._url_path}",
            total=len(index),
            autorefresh=True,
            leave=False)
        for gobj in index:
            if gobj.mode not in (git.GIT_FILEMODE_BLOB,
                                 git.GIT_FILEMODE_BLOB_EXECUTABLE,
                                 git.GIT_FILEMODE_LINK):
                continue
            _file = Path(gobj.path)
            # check size of file first:
            _fstat = _file.lstat()
"--language", type=str, help="Language to parse", required=True) parser.add_argument("file_path", type=str, help="File to parse") parser.add_argument( "-v", "--verbose", action="store_true", ) args = parser.parse_args() if args.verbose: log.setLevel(log.DEBUG) lang = TreeSitterAutoBuiltLanguage(args.language) tree = lang.parse_file(args.file_path) cur = tree.walk() cur = TreeSitterCursorIterator(cur) log.debug(cur) root = cur.peek() log.info(f"{cur.preorder:5d}:{' ' * cur.depth}{str_tsnode(root)}") for node in cur: # print(node) log.info(f"{cur.preorder:5d}:{' ' * cur.depth}{str_tsnode(node)}")
def batch_analyze(args):
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        log.err(f"Input file not found: {args.repo_list_file}")
        return
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception as e:
        log.err(f"Failed to read repo list file")
        raise

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)

    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")

    _mp_manager = Manager()
    node_q = _mp_manager.Queue()

    log.debug(f"checking {len(repolist)} items in repo list")
    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)
        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(desc="adding repo jobs",
                                                      total=len(repolist),
                                                      unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(repo_worker, (repo, node_q), {
                        'workers': args.workers,
                        'database_conn': args.db
                    }))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()

            all_repos_cntr = en_manager.counter(desc="repos in batch",
                                                total=len(repolist),
                                                unit='repos',
                                                autorefresh=True)
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise e
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt as e:
                log.warn(f"stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn(f"waiting for already started jobs to finish...")
                executor.join()
    finally:
        try:
            node_q.put(None)
            receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt) as e:
            pass
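# Hypothetical shape of the repo list JSON that batch_analyze() loads: the code
# only requires a list of dicts (each entry is forwarded to repo_worker and
# later merged into tr.wst_extra), so the exact keys below are assumptions made
# for illustration.
#
#   [
#     {"url": "https://github.com/user/project.git", "stars": 1234},
#     {"url": "https://gitlab.com/group/tool.git"}
#   ]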