Example #1
def analyze(args):
    collector = WST_ArangoTreeCollector(
        args.repo_url,
        workers=args.workers,
        database_conn=args.db,
        commit_sha=args.target_commit,
    )
    collector.setup()
    log.debug(f"Set up collector: {collector}")

    if args.interactive_debug:
        log.warn("Starting debugging:")
        bpdb.set_trace()

    try:
        collector.collect_all(overwrite_incomplete=args.overwrite_incomplete)
    except RepoExistsError as e:
        if args.skip_exists:
            log.warn(
                f"Skipping collection since repo document already present for commit {collector._current_commit_hash}"
            )
            return
        else:
            raise
    except Exception as e:
        log.crit(f"{collector} run failed.")
        raise e
    def setup(self):
        """Clone the repo, connect to the DB, create working directories, etc."""
        self._connect_db()
        repo = self._get_git_repo()
        if self._current_commit is None:
            log.warn(
                f"Deleting and re-cloning repo in {self._local_repo_path}")
            try:
                shutil.rmtree(self._local_repo_path)
                repo = self._get_git_repo()
            except Exception as e:
                log.error(f"Failed to repair repository: {type(e)}: {e}")
                raise e
        # check out _target_commit if set
        if self._target_commit and self._target_commit != self._current_commit_hash:
            log.info(f"Checking out commit {self._target_commit}...")
            try:
                commit = repo.get(self._target_commit)
                log.debug(f"target commit {commit}")
                # commit might not exist for a variety of reasons (need to fetch, doesn't exist, corrupt, etc.)
                repo.checkout_tree(commit.tree)
                repo.head.set_target(commit.id)
            except Exception as e:
                raise e
            log.info(
                f"Repo at {self._local_repo_path} now at {self._current_commit_hash}"
            )
        elif self._target_commit and self._target_commit == self._current_commit_hash:
            log.debug(
                f"Repo in {self._local_repo_path} is already at {self._target_commit}"
            )
Example #3
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("query", type=str, help="S-exp query to execute")
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--node-text",
                        help="Show the text content of the matched nodes",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    parsed_s_query = sexp.parseString(args.query)

    log.debug(parsed_s_query)

    r = find_nodes_by_query(parsed_s_query)
    rl = []
    for n in r:
        log.info(f"{n} in {n.file.fetch()}")
        n_sexp = node_as_sexp(n, maxdepth=3, indent=2, show_start_coords=True)
        log.info(f"{n_sexp}")
        if args.node_text:
            log.info(f"{n.text.fetch()}")
        rl.append(n)

    log.info(f"{len(rl)} results returned")
Example #4
def database_init(args):
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    odb = client.db(p.path[1:], username=p.username, password=p.password)

    if args.delete:
        log.warn(f"deleting all data ...")
        # deleting old stuff could take awhile
        jobs = []
        db = odb.begin_async_execution()

        jobs.append(
            db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))

        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # iterate over a copy so removing finished jobs doesn't skip entries
            for j in list(jobs):
                if j.status() == 'done':
                    jobs.remove(j)
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)

    # back to non-async
    db = odb

    log.info(f"Creating collections ...")

    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)

    graph = None
    if not db.has_graph(tree_models._graph_name):
        graph = db.create_graph(tree_models._graph_name)
    else:
        graph = db.graph(tree_models._graph_name)
    edgedefs = {}

    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            log.debug(f"Added graph edges {gv}")
            edgedefs[gk] = graph.create_edge_definition(**gv)
    def collect_all(self,
                    existing_node_q=None,
                    overwrite_incomplete: bool = False):
        """Creates every node down the tree for this repo"""
        # create the main Repos
        self._tree_repo = WSTRepository(
            type='git',
            url=self.repo_url,
            path=self._url_path,
            analyzed_time=int(time.time()),
            wst_status="started",
        )
        # self._coll['wstrepos'].insert(nr.__dict__)
        try:
            self._tree_repo.insert_in_db(self._db)
        except arango.exceptions.DocumentInsertError as e:
            if e.http_code == 409:
                existing_repo = WSTRepository.get(self._db,
                                                  self._tree_repo._key)
                if overwrite_incomplete and existing_repo.wst_status != "completed":
                    log.warn(
                        f"Overwriting prior WSTRespository, status was '{existing_repo.wst_status}'"
                    )
                    self._tree_repo.update_in_db(self._db)
                else:
                    raise RepoExistsError(f"Already present: {existing_repo}")
            else:
                raise e

        # attempt to find an existing commit in the db:
        if not (commit := WSTCommit.get(self._db, self._current_commit_hash)):
            _cc = self._current_commit
            self._wst_commit = WSTCommit(
                _key=_cc.hex,
                commit_time=_cc.commit_time,
                commit_time_offset=_cc.commit_time_offset,
                parent_ids=[str(i) for i in _cc.parent_ids],
                tree_id=str(_cc.tree_id),
            )
            log.debug(f"Inserting {self._wst_commit}")
            self._wst_commit.insert_in_db(self._db)
Example #6
def _tqdm_node_receiver(q, en_manager):
    """This is the cross-process aggregator for non-required data

    Even without this process the collection and analysis should run normally.
    It's mostly just used for debugging and informational output.
    """
    try:
        log.debug(f"start counting db inserts...")
        n = 0
        cache_stats = {
            "text_lfu_hit": 0,
            "text_lfu_miss": 0,
        }
        dedup_stats = {}
        cntr = en_manager.counter(desc="writing to db",
                                  position=1,
                                  unit='docs',
                                  autorefresh=True)
        # with tqdm(desc="writing documents to db", position=1, unit='docs', unit_scale=True) as tbar:
        while (nc := q.get()) is not None:
            if isinstance(nc, int):
                n += nc
                cntr.update(nc)
            elif nc[0] == "cache_stats":
                for k, v in nc[1].items():
                    cache_stats[k] += v
            elif nc[0] == "dedup_stats":
                if nc[1] not in dedup_stats:
                    dedup_stats[nc[1]] = 0
                dedup_stats[nc[1]] += nc[2]
            else:
                log.error(
                    f"node receiver process got invalid data sent of type {type(nc)}"
                )
        log.info(f"stopped counting nodes, total documents inserted: {n}")
        cache_text_lfu_ratio = cache_stats["text_lfu_hit"] / (
            cache_stats["text_lfu_miss"] or 1)
        log.debug(
            f"text_lfu cache stats: ratio {cache_text_lfu_ratio}, hit {cache_stats['text_lfu_hit']}"
        )
        return True
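The receiver above defines an implicit message protocol on the queue: a plain int advances the document counter, a ("cache_stats", dict) tuple is merged into the cache statistics, a ("dedup_stats", key, count) tuple accumulates per-key dedup counts, and None is the shutdown sentinel. A minimal producer-side sketch under those assumptions follows; the queue setup, helper name, and values are illustrative, not part of the collector code:

from multiprocessing import Manager

def report_progress(q, inserted_docs, lfu_hits, lfu_misses):
    # Illustrative helper: sends messages in the shapes the receiver loop expects.
    q.put(inserted_docs)  # plain int: bump the "writing to db" counter
    q.put(("cache_stats", {"text_lfu_hit": lfu_hits, "text_lfu_miss": lfu_misses}))
    q.put(("dedup_stats", "wstnodes", 3))  # hypothetical collection key and count

if __name__ == "__main__":
    q = Manager().Queue()
    report_progress(q, inserted_docs=42, lfu_hits=10, lfu_misses=2)
    q.put(None)  # sentinel: tells the receiver's while-loop to exit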
Example #7
def build_dask_dataframe_for_file(lang: TreeSitterAutoBuiltLanguage,
                                  file: str):
    tree = lang.parse_file(file)
    cur = tree.walk()
    # cur = TreeSitterCursorIterator(cur, nodefilter=lambda x: x.is_named)
    cur = TreeSitterCursorIterator(cur)

    log.debug(f"{cur}")

    cols = ["repo", "file", "x1", "y1", "x2", "y2", "type", "text"]

    nl = []

    for node in cur:
        # log.trace(log.debug, f"{node.type}: {node.text.tobytes().decode('utf-8')}")
        nl.append([
            -1, file, *node.start_point, *node.end_point, node.type,
            node.text.tobytes()
        ])

    ndb = db.from_sequence(nl)
    ndf = ndb.to_dataframe(columns=cols)

    return ndf.persist().repartition(1)
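A minimal usage sketch for the helper above; the grammar name and file path are assumptions for illustration, and the persisted dask dataframe is materialized with compute():

lang = TreeSitterAutoBuiltLanguage("python")  # assumed grammar name
ndf = build_dask_dataframe_for_file(lang, "example.py")  # assumed local source file
print(ndf.compute().head())  # columns: repo, file, x1, y1, x2, y2, type, text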
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("--nworkers",
                        type=int,
                        help="number of workers",
                        default=os.cpu_count())
    parser.add_argument("--jobitems",
                        type=int,
                        help="items in a single job",
                        default=200)
    parser.add_argument("--njobs",
                        type=int,
                        help="number of total jobs to complete",
                        default=200)
    parser.add_argument("--itemtime",
                        type=float,
                        help="time taken per item in a job",
                        default=0.1)
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    with ProcessPool(max_workers=args.nworkers) as executor:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()

        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()

        ret_futures = []
        # log.debug(f"counter_generator: {repr(counter_generator)}")
        log.info(f"Starting jobs...")
        for i in range(args.njobs):
            ret_futures.append(
                executor.schedule(
                    slow_worker,
                    (i, args.jobitems, args.itemtime, en_manager_proxy)))
        log.info(f"Waiting for jobs to complete...")
        cntr_all_jobs = en_manager.counter(desc="all jobs",
                                           total=args.njobs,
                                           color='blue')
        log.debug(f"cntr_all_jobs: {repr(cntr_all_jobs)}")
        for f in futures.as_completed(ret_futures):
            f.result()
            log.debug(f"finished a job!")
            cntr_all_jobs.update()
        log.info(f"All jobs completed!")
Example #9
def __main__():
    parser = argparse.ArgumentParser()

    parser.add_argument("--db",
                        "--database",
                        type=str,
                        help="Database connection string",
                        default=os.environ.get(
                            'WST_DB_URI', "http://*****:*****@localhost:8529/wst"))
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")

    # analysis
    cmd_analyze = subcmds.add_parser('analyze',
                                     aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url",
                             type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w",
        "--workers",
        type=int,
        help=
        "Number of workers to use for processing files, default: os.cpu_count()",
        default=None)
    cmd_analyze.add_argument(
        "--skip-exists",
        "--skip-existing",
        action="store_true",
        help=
        "Skip the analysis if the repo document already exists in the database"
    )
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup")
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB")
    cmd_analyze.add_argument(
        "-t",
        "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None)
    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list")
    set_batch_analyze_args(cmd_batch)
    # delete data selectively
    cmd_delete = subcmds.add_parser('delete',
                                    aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete")
    # db setup
    cmd_db = subcmds.add_parser('db',
                                aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize',
                                        aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d",
        "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    log.info(f"DB connection: {desensitize_url(args.db)}")

    if 'func' not in args:
        log.warn(f"Please supply a valid subcommand!")
        return

    try:
        args.func(args)
    except KeyboardInterrupt as e:
        log.warn(f"Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise e
Example #10
    args = parser.parse_args()
    if args.verbose:
        log.setLevel(log.DEBUG)

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)

    lang = TreeSitterAutoBuiltLanguage(args.language)

    tree = lang.parse_file(args.file_path)

    cur = tree.walk()
    cur = TreeSitterCursorIterator(cur, nodefilter=lambda x: True)

    log.debug(cur)

    root = cur.peek()

    test_id = str(uuid.uuid4().hex)

    repo = WSTRepository(
        _key="wst0test0461b1c841f897cbd952354370471a64",
        type='test',
        url=f"wst.tests.insertion/{test_id}",
        commit="wst0test0461b1c841f897cbd952354370471a64",
        path=f"wst/tests/{test_id}",
    )
    repo.insert_in_db(db)
    file = WSTFile(
        _key="wst0test0461b1c841f897cbd952354370471a64-0",
    def _local_repo_path(self):
        cachedir = LocalCache.get_local_cache_dir() / 'collector_repos'
        if not cachedir.exists():
            cachedir.mkdir(mode=0o770, exist_ok=True)
            log.debug(f"created dir {cachedir}")
        return cachedir.joinpath(self._url_path)
Example #12
def batch_analyze(args):
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        log.err(f"Input file not found: {args.repo_list_file}")
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception as e:
        log.err(f"Failed to read repo list file")
        raise

    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)
    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")
    _mp_manager = Manager()
    node_q = _mp_manager.Queue()

    log.debug(f"checking {len(repolist)} items in repo list")

    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)

        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(desc="adding repo jobs",
                                                      total=len(repolist),
                                                      unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(repo_worker, (repo, node_q), {
                        'workers': args.workers,
                        'database_conn': args.db
                    }))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()
            all_repos_cntr = en_manager.counter(desc="repos in batch",
                                                total=len(repolist),
                                                unit='repos',
                                                autorefresh=True)
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise e
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt as e:
                log.warn(f"stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn(f"waiting for already started jobs to finish...")
                executor.join()
    finally:
        try:
            node_q.put(None)
            receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt) as e:
            pass
Example #13
            # link target probably not within our repo dir
            file.symlink['relative'] = None
        file_shake_256.update(str(target).encode())
        file.content_hash = file_shake_256.hexdigest(64)
    else:
        raise UnhandledGitFileMode(f"{file.path} mode is {oct(file.mode)}")

    try:
        file.insert_in_db(db)
        (wst_commit / file).insert_in_db(db)  # commit -> file
    except arango.exceptions.DocumentInsertError as e:
        if e.http_code == 409:
            # already exists: get it
            preexisting_file = WSTFile.get(db, file._key)
            if preexisting_file != file:
                log.debug(f"existing file: {preexisting_file}")
                log.debug(f"new file: {file}")
                if overwrite_errored_docs and preexisting_file.error:
                    log.warn(
                        f"Overwriting errored WSTFile, prior error: {preexisting_file.error}, new error: {file.error}"
                    )
                    file.update_in_db(db)
                    (wst_commit / file).insert_in_db(
                        db, overwrite=True)  # commit -> file
                else:
                    raise PrerequisiteStateInvalid(
                        f"WSTFile {file._key} already exists but has mismatched data"
                    )
            else:  # WSTFiles are equivalent, dedup
                (wst_commit / preexisting_file).insert_in_db(
                    db, overwrite=overwrite_errored_docs)