Example no. 1
 def setup(self):
     """Clone the repo, connect to the DB, create working directories, etc."""
     self._connect_db()
     repo = self._get_git_repo()
     if self._current_commit is None:
         log.warn(
             f"Deleting and re-cloning repo in {self._local_repo_path}")
         try:
             shutil.rmtree(self._local_repo_path)
             repo = self._get_git_repo()
         except Exception as e:
             log.error(f"Failed to repair repository: {type(e)}: {e}")
             raise e
     # Check the repo out to _target_commit, if one is set
     if self._target_commit and self._target_commit != self._current_commit_hash:
         log.info(f"Checking out commit {self._target_commit}...")
         try:
             commit = repo.get(self._target_commit)
             log.debug(f"target commit {commit}")
             # commit might not exist for a variety of reasons (need to fetch, DNE, corrupt, etc)
             repo.checkout_tree(commit.tree)
             repo.head.set_target(commit.id)
         except Exception as e:
             log.error(
                 f"Failed to check out {self._target_commit}: {type(e)}: {e}")
             raise e
         log.info(
             f"Repo at {self._local_repo_path} now at {self._current_commit_hash}"
         )
     elif self._target_commit and self._target_commit == self._current_commit_hash:
         log.debug(
             f"Repo in {self._local_repo_path} is already at {self._target_commit}"
         )
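setup() above leans on two commit accessors that the snippet does not include. A minimal sketch of what they might look like, assuming pygit2 and that _current_commit_hash is the hex string of HEAD; the names appear in the snippet but these bodies are assumptions:

 @property
 def _current_commit(self):
     # Assumed helper: resolve HEAD to a commit object; None signals a broken
     # or empty clone, which setup() repairs by deleting and re-cloning.
     try:
         return self._get_git_repo().revparse_single("HEAD")
     except Exception:
         return None

 @property
 def _current_commit_hash(self):
     # Assumed helper: hex id of the current HEAD commit, or None.
     commit = self._current_commit
     return str(commit.id) if commit is not None else None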
Example no. 2
 def _get_git_repo(self):
     repodir = self._local_repo_path
     if not (repodir / '.git').exists():
         repodir.mkdir(mode=0o770, parents=True, exist_ok=True)
         log.info(f"cloning repo to {repodir} ...")
         return git.clone_repository(self.repo_url, repodir.resolve())
     else:
         repopath = git.discover_repository(repodir.resolve())
         return git.Repository(repopath)
Example no. 3
def database_init(args):
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    odb = client.db(p.path[1:], username=p.username, password=p.password)

    if args.delete:
        log.warn(f"deleting all data ...")
        # deleting old stuff could take awhile
        jobs = []
        db = odb.begin_async_execution()

        jobs.append(
            db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))

        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # don't mutate the list while iterating over it; rebuild it instead
            jobs = [j for j in jobs if j.status() != 'done']
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)

    # back to non-async
    db = odb

    log.info(f"Creating collections ...")

    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)

    graph = None
    if not db.has_graph(tree_models._graph_name):
        graph = db.create_graph(tree_models._graph_name)
    else:
        graph = db.graph(tree_models._graph_name)
    edgedefs = {}

    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            log.debug(f"Added graph edges {gv}")
            edgedefs[gk] = graph.create_edge_definition(**gv)
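_graph_edge_definitions is not shown here; since each value is expanded with **gv into python-arango's Graph.create_edge_definition(), it presumably maps a name to that method's keyword arguments. A hypothetical entry (collection names invented for illustration):

_graph_edge_definitions = {
    "node_parent": {
        "edge_collection": "wst_node_parent",
        "from_vertex_collections": ["wst_nodes"],
        "to_vertex_collections": ["wst_nodes"],
    },
}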
Example no. 4
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("query", type=str, help="S-exp query to execute")
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--node-text",
                        help="Show the text content of the matched nodes",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    parsed_s_query = sexp.parseString(args.query)

    log.debug(parsed_s_query)

    r = find_nodes_by_query(parsed_s_query)
    rl = []
    for n in r:
        log.info(f"{n} in {n.file.fetch()}")
        n_sexp = node_as_sexp(n, maxdepth=3, indent=2, show_start_coords=True)
        log.info(f"{n_sexp}")
        if args.node_text:
            log.info(f"{n.text.fetch()}")
        rl.append(n)

    log.info(f"{len(rl)} results returned")
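The positional argument is only described as an S-expression query, so both the script name and the query grammar below are assumptions; the output behavior is read from the loop above:

# Hypothetical invocation:
#   python query_tool.py -v --node-text '(function_definition (identifier))'
# For each match: log the node and its file, the node rendered by node_as_sexp
# (maxdepth=3), optionally the node's source text, then the total result count.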
Example no. 5
def _tqdm_node_receiver(q, en_manager):
    """This is the cross-process aggregator for non-required data

    Even without this process the collection and analysis should run normally.
    It's mostly just used for debugging and informational output.
    """
    try:
        log.debug(f"start counting db inserts...")
        n = 0
        cache_stats = {
            "text_lfu_hit": 0,
            "text_lfu_miss": 0,
        }
        dedup_stats = {}
        cntr = en_manager.counter(desc="writing to db",
                                  position=1,
                                  unit='docs',
                                  autorefresh=True)
        # with tqdm(desc="writing documents to db", position=1, unit='docs', unit_scale=True) as tbar:
        while (nc := q.get()) is not None:
            if type(nc) == int:
                n += nc
                cntr.update(nc)
            elif nc[0] == "cache_stats":
                for k, v in nc[1].items():
                    cache_stats[k] += v
            elif nc[0] == "dedup_stats":
                if nc[1] not in dedup_stats:
                    dedup_stats[nc[1]] = 0
                dedup_stats[nc[1]] += nc[2]
            else:
                log.error(
                    f"node receiver process got invalid data sent of type {type(nc)}"
                )
        log.info(f"stopped counting nodes, total documents inserted: {n}")
        cache_text_lfu_ratio = cache_stats["text_lfu_hit"] / (
            cache_stats["text_lfu_miss"] or 1)
        log.debug(
            f"text_lfu cache stats: ratio {cache_text_lfu_ratio}, hit {cache_stats['text_lfu_hit']}"
        )
        return True
    except (KeyboardInterrupt, BrokenPipeError):
        # Assumed handler: the original except/finally for this try block is not
        # shown in the snippet; per the docstring the receiver is best-effort,
        # so just exit quietly.
        log.warn("node receiver interrupted, exiting")
        return False
def slow_worker(name, orig_jobitems, itemtime, en_manager):
    """I WANT:
    A function I can call to retrieve a proxy for a counter
    """
    jobitems = random.randrange(orig_jobitems // 2, orig_jobitems * 2)
    try:
        cntr = en_manager.counter(desc=f"job {name}",
                                  total=jobitems,
                                  leave=False)
    except Exception as e:
        log.error(f"{type(e)}: {e}")
        raise

    log.info(f"job {name} started")

    for _ in range(jobitems):
        time.sleep(random.uniform(itemtime * 0.1, itemtime * 1.5))
        cntr.update()

    cntr.close()

    log.info(f"job {name} completed")
    return jobitems
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("--nworkers",
                        type=int,
                        help="number of workers",
                        default=os.cpu_count())
    parser.add_argument("--jobitems",
                        type=int,
                        help="items in a single job",
                        default=200)
    parser.add_argument("--njobs",
                        type=int,
                        help="number of total jobs to complete",
                        default=200)
    parser.add_argument("--itemtime",
                        type=float,
                        help="time taken per item in a job",
                        default=0.1)
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    with ProcessPool(max_workers=args.nworkers) as executor:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()

        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()

        ret_futures = []
        # log.debug(f"counter_generator: {repr(counter_generator)}")
        log.info(f"Starting jobs...")
        for i in range(args.njobs):
            ret_futures.append(
                executor.schedule(
                    slow_worker,
                    (i, args.jobitems, args.itemtime, en_manager_proxy)))
        log.info(f"Waiting for jobs to complete...")
        cntr_all_jobs = en_manager.counter(desc="all jobs",
                                           total=args.njobs,
                                           color='blue')
        log.debug(f"cntr_all_jobs: {repr(cntr_all_jobs)}")
        for f in futures.as_completed(ret_futures):
            f.result()
            log.debug(f"finished a job!")
            cntr_all_jobs.update()
        log.info(f"All jobs completed!")
Example no. 8
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("--db",
                        "--database",
                        type=str,
                        help="Database connection string",
                        default=os.environ.get(
                            'WST_DB_URI', "http://*****:*****@localhost:8529/wst"))
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")

    # analysis
    cmd_analyze = subcmds.add_parser('analyze',
                                     aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url",
                             type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w",
        "--workers",
        type=int,
        help=
        "Number of workers to use for processing files, default: os.cpu_count()",
        default=None)
    cmd_analyze.add_argument(
        "--skip-exists",
        "--skip-existing",
        action="store_true",
        help=
        "Skip the analysis if the repo document already exists in the database"
    )
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup")
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB")
    cmd_analyze.add_argument(
        "-t",
        "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None)
    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list")
    set_batch_analyze_args(cmd_batch)
    # delete data selectively
    cmd_delete = subcmds.add_parser('delete',
                                    aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete")
    # db setup
    cmd_db = subcmds.add_parser('db',
                                aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize',
                                        aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d",
        "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    log.info(f"DB connection: {desensitize_url(args.db)}")

    if 'func' not in args:
        log.warn(f"Please supply a valid subcommand!")
        return

    try:
        args.func(args)
    except KeyboardInterrupt as e:
        log.warn(f"Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise e
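For reference, some hypothetical command lines for the parser above; the console-script name wst-collector is a placeholder, since the snippet does not show how __main__ is installed:

# wst-collector -v db init --delete
# wst-collector analyze https://github.com/example/repo -w 8 --skip-exists
# wst-collector --db http://user:pass@localhost:8529/wst analyze <repo-url> -t <commit-sha>
# wst-collector batch repos.json   # batch arguments come from set_batch_analyze_args (not shown)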
Example no. 9
        _key="wst0test0461b1c841f897cbd952354370471a64",
        type='test',
        url=f"wst.tests.insertion/{test_id}",
        commit="wst0test0461b1c841f897cbd952354370471a64",
        path=f"wst/tests/{test_id}",
    )
    repo.insert_in_db(db)
    file = WSTFile(
        _key="wst0test0461b1c841f897cbd952354370471a64-0",
        oid="testwst0",
        path=args.file_path,
        language=args.language,
    )

    with Manager() as _mp_manager:
        _node_queue = _mp_manager.Queue()
        node_receiver = wst_arango_worker._tqdm_node_receiver(_node_queue)

        try:
            r = cProfile.run(
                f'wst_arango_worker._process_file(file, repo, args.db, node_q=_node_queue)',
                "test-insertion.prof",
            )
            log.info(f"{r}")
        # except KeyboardInterrupt as e:
        #     log.warn(f"stopping collection ...")
        finally:
            _node_queue.put(None)

    # repo.delete()
Example no. 10
        index = self._get_git_repo().index
        index.read()

        # file-level processing
        # files = []
        with pushd(self._local_repo_path), Manager() as self._mp_manager:
            if not existing_node_q:
                self._node_queue = self._mp_manager.Queue()
                node_receiver = _tqdm_node_receiver(self._node_queue,
                                                    self.en_manager_proxy)
            else:
                self._node_queue = existing_node_q
            with ProcessPool(max_workers=self._worker_count) as executor:
                self._stoppable = executor
                log.info(f"scanning git for files ...")
                ret_futures = []
                cntr_add_jobs = self.en_manager.counter(
                    desc=f"scanning files for {self._url_path}",
                    total=len(index),
                    autorefresh=True,
                    leave=False)
                for gobj in index:
                    if gobj.mode not in (git.GIT_FILEMODE_BLOB,
                                         git.GIT_FILEMODE_BLOB_EXECUTABLE,
                                         git.GIT_FILEMODE_LINK):
                        continue
                    _file = Path(gobj.path)
                    # check size of file first:
                    _fstat = _file.lstat()
Example no. 11
                        "--language",
                        type=str,
                        help="Language to parse",
                        required=True)
    parser.add_argument("file_path", type=str, help="File to parse")
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
    )

    args = parser.parse_args()
    if args.verbose:
        log.setLevel(log.DEBUG)

    lang = TreeSitterAutoBuiltLanguage(args.language)

    tree = lang.parse_file(args.file_path)

    cur = tree.walk()
    cur = TreeSitterCursorIterator(cur)

    log.debug(cur)

    root = cur.peek()
    log.info(f"{cur.preorder:5d}:{' ' * cur.depth}{str_tsnode(root)}")

    for node in cur:
        # print(node)
        log.info(f"{cur.preorder:5d}:{' ' * cur.depth}{str_tsnode(node)}")
Example no. 12
def batch_analyze(args):
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        log.error(f"Input file not found: {args.repo_list_file}")
        return
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception as e:
        log.err(f"Failed to read repo list file")
        raise

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)
    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")
    _mp_manager = Manager()
    node_q = _mp_manager.Queue()

    log.debug(f"checking {len(repolist)} items in repo list")

    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)

        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(desc="adding repo jobs",
                                                      total=len(repolist),
                                                      unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(repo_worker, (repo, node_q), {
                        'workers': args.workers,
                        'database_conn': args.db
                    }))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()
            all_repos_cntr = en_manager.counter(desc="repos in batch",
                                                total=len(repolist),
                                                unit='repos',
                                                autorefresh=True)
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise e
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt as e:
                log.warn(f"stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn(f"waiting for already started jobs to finish...")
                executor.join()
    finally:
        try:
            node_q.put(None)
            receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt) as e:
            pass
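batch_analyze only requires that the repo list file decode to a list whose entries behave like dicts (each one is returned by repo_worker and merged into tr.wst_extra). A hypothetical repos.json, with field names invented for illustration:

# [
#   {"url": "https://github.com/example/project", "language": "python"},
#   {"url": "https://github.com/example/another-project", "language": "cpp"}
# ]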