def setup(self):
    """Clone the repo, connect to the DB, create working directories, etc."""
    self._connect_db()
    repo = self._get_git_repo()
    if self._current_commit is None:
        # a clone with no resolvable HEAD is unusable: wipe it and clone again
        log.warn(f"Deleting and re-cloning repo in {self._local_repo_path}")
        try:
            shutil.rmtree(self._local_repo_path)
            repo = self._get_git_repo()
        except Exception as e:
            log.error(f"Failed to repair repository: {type(e)}: {e}")
            raise
    # checkout _target_commit if set
    if self._target_commit and self._target_commit != self._current_commit_hash:
        log.info(f"Checking out commit {self._target_commit}...")
        try:
            commit = repo.get(self._target_commit)
            log.debug(f"target commit {commit}")
            # commit might not exist for a variety of reasons (need to fetch, DNE, corrupt, etc.)
            repo.checkout_tree(commit.tree)
            repo.head.set_target(commit.id)
        except Exception as e:
            log.error(f"Failed to check out {self._target_commit}: {type(e)}: {e}")
            raise
        log.info(f"Repo at {self._local_repo_path} now at {self._current_commit_hash}")
    elif self._target_commit and self._target_commit == self._current_commit_hash:
        log.debug(f"Repo in {self._local_repo_path} is already at {self._target_commit}")
def analyze(args):
    collector = WST_ArangoTreeCollector(
        args.repo_url,
        workers=args.workers,
        database_conn=args.db,
        commit_sha=args.target_commit,
    )
    collector.setup()
    log.debug(f"Set up collector: {collector}")

    if args.interactive_debug:
        log.warn("Starting debugging:")
        bpdb.set_trace()

    try:
        collector.collect_all(overwrite_incomplete=args.overwrite_incomplete)
    except RepoExistsError:
        if args.skip_exists:
            log.warn(
                f"Skipping collection since repo document already present for commit {collector._current_commit_hash}"
            )
            return
        else:
            raise
    except Exception:
        log.crit(f"{collector} run failed.")
        raise
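# A minimal sketch of driving `analyze` without the CLI (e.g. from a test or a
# notebook), assuming the argparse attribute names used above; the repo URL and
# flag values are placeholders, not part of the original module.
from argparse import Namespace

_example_args = Namespace(
    repo_url="https://github.com/user/repo.git",  # hypothetical repository
    workers=None,                  # None falls through to os.cpu_count()
    db="http://*****:*****@localhost:8529/wst",
    target_commit=None,            # analyze whatever HEAD resolves to
    interactive_debug=False,
    overwrite_incomplete=False,
    skip_exists=True,              # tolerate an already-collected repo
)
# analyze(_example_args)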
def database_init(args):
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    odb = client.db(p.path[1:], username=p.username, password=p.password)

    if args.delete:
        log.warn("deleting all data ...")
        # deleting old data could take a while, so run the drops as async jobs
        jobs = []
        db = odb.begin_async_execution()
        jobs.append(db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # rebuild the list instead of calling remove() while iterating,
            # which would skip the element after each removal
            jobs = [j for j in jobs if j.status() != 'done']
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)
    # back to non-async execution
    db = odb

    log.info("Creating collections ...")
    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)

    if db.has_graph(tree_models._graph_name):
        graph = db.graph(tree_models._graph_name)
    else:
        graph = db.create_graph(tree_models._graph_name)

    edgedefs = {}
    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            edgedefs[gk] = graph.create_edge_definition(**gv)
            log.debug(f"Added graph edge definition {gv}")
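# Illustration of how database_init splits the single --db URI into its parts,
# using only the stdlib (`strip_url` is the project helper assumed to return the
# URI with credentials and path removed, suitable for ArangoClient's hosts=):
from urllib.parse import urlparse

_p = urlparse("http://user:pass@localhost:8529/wst")  # hypothetical URI
assert _p.username == "user" and _p.password == "pass"
assert _p.path[1:] == "wst"  # strip the leading '/' to get the database name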
def collect_all(self, existing_node_q=None, overwrite_incomplete: bool = False):
    """Creates every node down the tree for this repo"""
    # create the main Repo document
    self._tree_repo = WSTRepository(
        type='git',
        url=self.repo_url,
        path=self._url_path,
        analyzed_time=int(time.time()),
        wst_status="started",
    )
    # self._coll['wstrepos'].insert(nr.__dict__)
    try:
        self._tree_repo.insert_in_db(self._db)
    except arango.exceptions.DocumentInsertError as e:
        if e.http_code == 409:
            # a repo document with this key already exists
            existing_repo = WSTRepository.get(self._db, self._tree_repo._key)
            if overwrite_incomplete and existing_repo.wst_status != "completed":
                log.warn(
                    f"Overwriting prior WSTRepository, status was '{existing_repo.wst_status}'"
                )
                self._tree_repo.update_in_db(self._db)
            else:
                raise RepoExistsError(f"Already present: {existing_repo}")
        else:
            raise

    # attempt to find an existing commit in the db:
    if not (commit := WSTCommit.get(self._db, self._current_commit_hash)):
        _cc = self._current_commit
        self._wst_commit = WSTCommit(
            _key=_cc.hex,
            commit_time=_cc.commit_time,
            commit_time_offset=_cc.commit_time_offset,
            parent_ids=[str(i) for i in _cc.parent_ids],
            tree_id=str(_cc.tree_id),
        )
        log.debug(f"Inserting {self._wst_commit}")
        self._wst_commit.insert_in_db(self._db)
def __main__():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--db", "--database",
        type=str,
        help="Database connection string",
        default=os.environ.get('WST_DB_URI', "http://*****:*****@localhost:8529/wst"),
    )
    parser.add_argument(
        "-v", "--verbose",
        help="Increase output verbosity",
        action="store_true",
    )
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")

    # analysis
    cmd_analyze = subcmds.add_parser('analyze', aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url", type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w", "--workers",
        type=int,
        help="Number of workers to use for processing files, default: os.cpu_count()",
        default=None,
    )
    cmd_analyze.add_argument(
        "--skip-exists", "--skip-existing",
        action="store_true",
        help="Skip the analysis if the repo document already exists in the database",
    )
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup",
    )
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB",
    )
    cmd_analyze.add_argument(
        "-t", "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None,
    )

    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list",
    )
    set_batch_analyze_args(cmd_batch)

    # delete data selectively
    cmd_delete = subcmds.add_parser('delete', aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete",
    )

    # db setup
    cmd_db = subcmds.add_parser('db', aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize', aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d", "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )

    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    log.info(f"DB connection: {desensitize_url(args.db)}")

    if 'func' not in args:
        log.warn("Please supply a valid subcommand!")
        return

    try:
        args.func(args)
    except KeyboardInterrupt:
        log.warn("Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        # terminate anything that survived SIGINT
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise
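# The excerpt does not show how __main__ is invoked; a conventional entry-point
# guard like the following (an assumption, matching the function name above)
# would let the module run both directly and via a console_scripts entry point:
if __name__ == '__main__':
    __main__()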
            cntr_files_processed = self.en_manager.counter(
                desc=f"processing {self._url_path}",
                total=len(ret_futures),
                unit="files",
                leave=False,
                autorefresh=True,
            )
            for r in futures.as_completed(ret_futures):
                completed_file = r.result()
                # log.debug(f"result {nf}")
                cntr_files_processed.update()
            # after all results returned
            self._tree_repo.wst_status = "completed"
            self._tree_repo.update_in_db(self._db)
            log.info(f"{self._url_path} marked completed.")
        except KeyboardInterrupt:
            log.warn("stopping collection ...")
            for rf in ret_futures:
                rf.cancel()
            executor.close()
            executor.join(5)
            executor.stop()
            # raise e
            self._tree_repo.wst_status = "cancelled"
            self._tree_repo.update_in_db(self._db)
            log.info(f"{self._tree_repo.url} wst_status marked as cancelled")
        except Exception:
            # any other failure marks the repo errored before propagating
            self._tree_repo.wst_status = "error"
            self._tree_repo.update_in_db(self._db)
            raise
def batch_analyze(args):
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        log.err(f"Input file not found: {args.repo_list_file}")
        return
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception:
        log.err("Failed to read repo list file")
        raise

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)

    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")

    _mp_manager = Manager()
    node_q = _mp_manager.Queue()

    log.debug(f"checking {len(repolist)} items in repo list")
    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)
        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(
                desc="adding repo jobs", total=len(repolist), unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(
                        repo_worker,
                        (repo, node_q),
                        {'workers': args.workers, 'database_conn': args.db},
                    ))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()

            all_repos_cntr = en_manager.counter(
                desc="repos in batch",
                total=len(repolist),
                unit='repos',
                autorefresh=True,
            )
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt:
                log.warn("stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn("waiting for already started jobs to finish...")
                executor.join()
    finally:
        # tell the node stats receiver to shut down
        try:
            node_q.put(None)
            receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt):
            pass
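# The shape batch_analyze appears to expect from the JSON repo list, inferred
# from the code above: each entry is handed to repo_worker and later merged into
# wst_extra via **repo_dict, so entries must be JSON objects. The key names in
# this sample are an assumption for illustration only.
_example_repolist = """
[
    {"url": "https://github.com/user/repo.git"},
    {"url": "https://github.com/user/other.git", "license": "mit"}
]
"""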
    else:
        raise UnhandledGitFileMode(f"{file.path} mode is {oct(file.mode)}")
    try:
        file.insert_in_db(db)
        (wst_commit / file).insert_in_db(db)  # commit -> file
    except arango.exceptions.DocumentInsertError as e:
        if e.http_code == 409:
            # already exists: get it
            preexisting_file = WSTFile.get(db, file._key)
            if preexisting_file != file:
                log.debug(f"existing file: {preexisting_file}")
                log.debug(f"new file: {file}")
                if overwrite_errored_docs and preexisting_file.error:
                    log.warn(
                        f"Overwriting errored WSTFile, prior error: {preexisting_file.error}, new error: {file.error}"
                    )
                    file.update_in_db(db)
                    (wst_commit / file).insert_in_db(db, overwrite=True)  # commit -> file
                else:
                    raise PrerequisiteStateInvalid(
                        f"WSTFile {file._key} already exists but has mismatched data"
                    )
            else:
                # WSTFiles are equivalent, dedup
                (wst_commit / preexisting_file).insert_in_db(
                    db, overwrite=overwrite_errored_docs)
                if node_q:
                    node_q.put(('dedup_stats', 'WSTFile', 1))
                return preexisting_file
        else: