コード例 #1
0
 def setup(self):
     """Clone the repo, connect to the DB, create working directories, etc.

     Side effects: opens the DB connection, clones (or repairs) the local
     git checkout, and moves HEAD to ``self._target_commit`` when one is
     set and differs from the current checkout.
     """
     self._connect_db()
     repo = self._get_git_repo()
     # _current_commit is None when the local clone is unusable (missing or
     # corrupt); wipe it and clone fresh.
     if self._current_commit is None:
         log.warn(
             f"Deleting and re-cloning repo in {self._local_repo_path}")
         try:
             shutil.rmtree(self._local_repo_path)
             repo = self._get_git_repo()
         except Exception as e:
             log.error(f"Failed to repair repository: {type(e)}: {e}")
             raise e
     # to _target_commit if set
     if self._target_commit and self._target_commit != self._current_commit_hash:
         log.info(f"Checking out commit {self._target_commit}...")
         try:
             commit = repo.get(self._target_commit)
             log.debug(f"target commit {commit}")
             # commit might not exist for a variety of reasons (need to fetch, DNE, corrupt, etc)
             repo.checkout_tree(commit.tree)
             # detach-style update: point HEAD's target at the commit id
             repo.head.set_target(commit.id)
         except Exception as e:
             raise e
         log.info(
             f"Repo at {self._local_repo_path} now at {self._current_commit_hash}"
         )
     elif self._target_commit and self._target_commit == self._current_commit_hash:
         # nothing to do: already at the requested commit
         log.debug(
             f"Repo in {self._local_repo_path} is already at {self._target_commit}"
         )
コード例 #2
0
ファイル: __main__.py プロジェクト: utk-se/WorldSyntaxTree
def analyze(args):
    """Collect the full syntax tree for a single repository.

    Args:
        args: parsed CLI namespace; reads ``repo_url``, ``workers``, ``db``,
            ``target_commit``, ``interactive_debug``, ``skip_exists`` and
            ``overwrite_incomplete``.

    Raises:
        RepoExistsError: the repo document already exists for this commit
            and ``--skip-exists`` was not given.
        Exception: any collection failure is logged and re-raised.
    """
    collector = WST_ArangoTreeCollector(
        args.repo_url,
        workers=args.workers,
        database_conn=args.db,
        commit_sha=args.target_commit,
    )
    collector.setup()
    log.debug(f"Set up collector: {collector}")

    if args.interactive_debug:
        log.warn("Starting debugging:")
        bpdb.set_trace()

    try:
        collector.collect_all(overwrite_incomplete=args.overwrite_incomplete)
    except RepoExistsError:
        # duplicate repo document: either skip quietly or propagate
        if args.skip_exists:
            log.warn(
                f"Skipping collection since repo document already present for commit {collector._current_commit_hash}"
            )
            return
        else:
            raise
    except Exception:
        # bare raise preserves the original traceback (raise e would
        # restart the chain from here)
        log.crit(f"{collector} run failed.")
        raise
コード例 #3
0
ファイル: __main__.py プロジェクト: utk-se/WorldSyntaxTree
def database_init(args):
    """Create (and optionally first wipe) the collections and graph.

    Args:
        args: parsed CLI namespace; reads ``db`` (connection URI) and
            ``delete`` (drop all existing data first).
    """
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    # the URI path (minus the leading '/') is the database name
    odb = client.db(p.path[1:], username=p.username, password=p.password)

    if args.delete:
        log.warn(f"deleting all data ...")
        # deleting old stuff could take awhile
        jobs = []
        db = odb.begin_async_execution()

        jobs.append(
            db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))

        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # BUGFIX: rebuild the pending list instead of calling
            # jobs.remove() while iterating jobs — removing during
            # iteration skips the element following each removal, so
            # finished jobs could be missed on that pass.
            jobs = [j for j in jobs if j.status() != 'done']
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)

    # back to non-async
    db = odb

    log.info(f"Creating collections ...")

    # create (or fetch existing) document and edge collections
    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)

    # ensure the named graph and its edge definitions exist
    graph = None
    if not db.has_graph(tree_models._graph_name):
        graph = db.create_graph(tree_models._graph_name)
    else:
        graph = db.graph(tree_models._graph_name)
    edgedefs = {}

    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            log.debug(f"Added graph edges {gv}")
            edgedefs[gk] = graph.create_edge_definition(**gv)
コード例 #4
0
    def collect_all(self,
                    existing_node_q=None,
                    overwrite_incomplete: bool = False):
        """Creates every node down the tree for this repo

        Inserts the WSTRepository document (status "started"), then the
        WSTCommit for the current checkout if it is not already stored.

        Args:
            existing_node_q: optional queue of already-known nodes
                (usage not visible in this fragment — verify at caller).
            overwrite_incomplete: when the repo document already exists
                but its wst_status is not "completed", overwrite it
                instead of raising.

        Raises:
            RepoExistsError: repo document exists and may not be overwritten.
        """
        # create the main Repos
        self._tree_repo = WSTRepository(
            type='git',
            url=self.repo_url,
            path=self._url_path,
            analyzed_time=int(time.time()),
            wst_status="started",
        )
        # self._coll['wstrepos'].insert(nr.__dict__)
        try:
            self._tree_repo.insert_in_db(self._db)
        except arango.exceptions.DocumentInsertError as e:
            # 409 = unique-key conflict: a document for this repo exists
            if e.http_code == 409:
                existing_repo = WSTRepository.get(self._db,
                                                  self._tree_repo._key)
                if overwrite_incomplete and existing_repo.wst_status != "completed":
                    log.warn(
                        f"Overwriting prior WSTRespository, status was '{existing_repo.wst_status}'"
                    )
                    self._tree_repo.update_in_db(self._db)
                else:
                    raise RepoExistsError(f"Already present: {existing_repo}")
            else:
                raise e

        # attempt to find an existing commit in the db:
        if not (commit := WSTCommit.get(self._db, self._current_commit_hash)):
            # not stored yet: build the WSTCommit from the pygit2 commit
            _cc = self._current_commit
            self._wst_commit = WSTCommit(
                _key=_cc.hex,
                commit_time=_cc.commit_time,
                commit_time_offset=_cc.commit_time_offset,
                parent_ids=[str(i) for i in _cc.parent_ids],
                tree_id=str(_cc.tree_id),
            )
            log.debug(f"Inserting {self._wst_commit}")
            self._wst_commit.insert_in_db(self._db)
コード例 #5
0
ファイル: __main__.py プロジェクト: utk-se/WorldSyntaxTree
def __main__():
    """CLI entry point: build the argument parser, dispatch the subcommand.

    On KeyboardInterrupt, SIGINTs every child process, waits briefly,
    then terminates stragglers before re-raising.
    """
    parser = argparse.ArgumentParser()

    # NOTE(review): the default URI's credentials appear redacted
    # ("*****") by the source this was scraped from — confirm the real
    # default before relying on it.
    parser.add_argument("--db",
                        "--database",
                        type=str,
                        help="Database connection string",
                        default=os.environ.get(
                            'WST_DB_URI', "http://*****:*****@localhost:8529/wst"))
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    # shared enlighten progress-bar manager, available to all subcommands
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")

    # analysis
    cmd_analyze = subcmds.add_parser('analyze',
                                     aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url",
                             type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w",
        "--workers",
        type=int,
        help=
        "Number of workers to use for processing files, default: os.cpu_count()",
        default=None)
    cmd_analyze.add_argument(
        "--skip-exists",
        "--skip-existing",
        action="store_true",
        help=
        "Skip the analysis if the repo document already exists in the database"
    )
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup")
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB")
    cmd_analyze.add_argument(
        "-t",
        "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None)
    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list")
    set_batch_analyze_args(cmd_batch)
    # delete data selectively
    cmd_delete = subcmds.add_parser('delete',
                                    aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete")
    # db setup
    cmd_db = subcmds.add_parser('db',
                                aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize',
                                        aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d",
        "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    log.info(f"DB connection: {desensitize_url(args.db)}")

    # no subcommand given: subparsers set 'func', so its absence means
    # the user typed no (valid) subcommand
    if 'func' not in args:
        log.warn(f"Please supply a valid subcommand!")
        return

    try:
        args.func(args)
    except KeyboardInterrupt as e:
        # politely interrupt children first, then hard-terminate leftovers
        log.warn(f"Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise e
コード例 #6
0
     # progress counter over the per-file worker futures
     cntr_files_processed = self.en_manager.counter(
         desc=f"processing {self._url_path}",
         total=len(ret_futures),
         unit="files",
         leave=False,
         autorefresh=True)
     for r in futures.as_completed(ret_futures):
         completed_file = r.result()
         # log.debug(f"result {nf}")
         cntr_files_processed.update()
     # after all results returned
     self._tree_repo.wst_status = "completed"
     self._tree_repo.update_in_db(self._db)
     log.info(f"{self._url_path} marked completed.")
 except KeyboardInterrupt as e:
     # user abort: cancel outstanding work and shut the pool down,
     # then record the run as cancelled rather than failed
     log.warn(f"stopping collection ...")
     for rf in ret_futures:
         rf.cancel()
     executor.close()
     executor.join(5)
     executor.stop()
     # raise e
     self._tree_repo.wst_status = "cancelled"
     self._tree_repo.update_in_db(self._db)
     log.info(
         f"{self._tree_repo.url} wst_status marked as cancelled"
     )
 except Exception as e:
     # any other failure: persist "error" status before propagating
     self._tree_repo.wst_status = "error"
     self._tree_repo.update_in_db(self._db)
     raise e
コード例 #7
0
def batch_analyze(args):
    """Analyze every repo in a JSON list file using a process pool.

    Args:
        args: parsed CLI namespace; reads ``repo_list_file``, ``db``,
            ``jobs``, ``workers`` and ``skip_exists``.

    Raises:
        Exception: failure to read/parse the repo list is logged and
            re-raised; worker errors propagate unless RepoExistsError
            with --skip-exists.
    """
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        log.err(f"Input file not found: {args.repo_list_file}")
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception:
        log.err(f"Failed to read repo list file")
        raise

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)
    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")
    _mp_manager = Manager()
    node_q = _mp_manager.Queue()

    log.debug(f"checking {len(repolist)} items in repo list")

    # BUGFIX: node_receiver is referenced in the finally block below; it
    # must exist even when multiprogress setup fails before L-assignment,
    # otherwise the finally raises NameError and masks the real error.
    node_receiver = None
    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)

        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(desc="adding repo jobs",
                                                      total=len(repolist),
                                                      unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(repo_worker, (repo, node_q), {
                        'workers': args.workers,
                        'database_conn': args.db
                    }))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()
            all_repos_cntr = en_manager.counter(desc="repos in batch",
                                                total=len(repolist),
                                                unit='repos',
                                                autorefresh=True)
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise e
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt:
                log.warn(f"stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn(f"waiting for already started jobs to finish...")
                executor.join()
    finally:
        # signal the node receiver to shut down; tolerate a dead pipe
        try:
            node_q.put(None)
            if node_receiver is not None:
                receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt):
            pass
コード例 #8
0
    else:
        # file mode matched none of the handled kinds (fragment: the
        # preceding if/elif chain is outside this view)
        raise UnhandledGitFileMode(f"{file.path} mode is {oct(file.mode)}")

    try:
        file.insert_in_db(db)
        (wst_commit / file).insert_in_db(db)  # commit -> file
    except arango.exceptions.DocumentInsertError as e:
        # 409 = unique-key conflict: a WSTFile with this key exists
        if e.http_code == 409:
            # already exists: get it
            preexisting_file = WSTFile.get(db, file._key)
            if preexisting_file != file:
                log.debug(f"existing file: {preexisting_file}")
                log.debug(f"new file: {file}")
                if overwrite_errored_docs and preexisting_file.error:
                    log.warn(
                        f"Overwriting errored WSTFile, prior error: {preexisting_file.error}, new error: {file.error}"
                    )
                    file.update_in_db(db)
                    (wst_commit / file).insert_in_db(
                        db, overwrite=True)  # commit -> file
                else:
                    # same key, different content, no overwrite permitted
                    raise PrerequisiteStateInvalid(
                        f"WSTFile {file._key} already exists but has mismatched data"
                    )
            else:  # WSTFiles are equivalent, dedup
                (wst_commit / preexisting_file).insert_in_db(
                    db, overwrite=overwrite_errored_docs)
                if node_q:
                    node_q.put(('dedup_stats', 'WSTFile', 1))
                return preexisting_file
        else: