Exemple #1
0
def init_headless(directory, skip_obfuscation, output, parse_libraries, emails,
                  debug_mode, user_commits, reponame, skip, commit_size_limit,
                  file_size_limit):
    # Initialize logger
    logger = logging.getLogger("main")
    if debug_mode:
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler('extractor_debug_info.log')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    else:
        logger.setLevel(logging.WARNING)

    repo = git.Repo(directory)
    ar = AnalyzeRepo(repo)
    q = Questions()

    print('Initialization...')
    for branch in repo.branches:
        ar.create_commits_entity_from_branch(branch.name)
    ar.flag_duplicated_commits()
    ar.get_commit_stats()
    print('Analysing the master branch..')
    ar.analyse_master_user_commits(user_commits)
    print('Creating the repo entity..')
    r = ar.create_repo_entity(directory)

    r.local_usernames = list(set(r.local_usernames + emails))
    print('Setting the local user_names ::', r.local_usernames)
    r.repo_name = reponame

    if parse_libraries:
        # build authors from the the email list provided
        # we are provided only emails in the headless mode
        # TODO! Support both name -> email and email formats
        author_emails = []
        for email in r.local_usernames:
            author_emails.append(email)

        if author_emails:
            al = AnalyzeLibraries(r.commits, author_emails,
                                  repo.working_tree_dir, skip,
                                  commit_size_limit, file_size_limit)
            libs = al.get_libraries()

            # combine repo stats with libs used
            for i in range(len(r.commits)):
                c = r.commits[i]
                if c.hash in libs.keys():
                    r.commits[i].libraries = libs[c.hash]

    if not skip_obfuscation:
        r = obfuscate(r)

    er = ExportResult(r)
    er.export_to_json_headless(output)
    print('Successfully analysed the repo ==>' + reponame)
Exemple #2
0
def init_headless(directory, skip_obfuscation, output, parse_libraries, emails,
                  user_commits, reponame):
    repo = git.Repo(directory)
    ar = AnalyzeRepo(repo)
    q = Questions()

    print('Initialization...')
    for branch in repo.branches:
        ar.create_commits_entity_from_branch(branch.name)
    ar.flag_duplicated_commits()
    ar.get_commit_stats()
    print('Analysing the master branch..')
    ar.analyse_master_user_commits(user_commits)
    print('Creating the repo entity..')
    r = ar.create_repo_entity(directory)

    r.local_usernames = list(set(r.local_usernames + emails))
    print('Setting the local user_names ::', r.local_usernames)
    r.repo_name = reponame

    if parse_libraries:
        # build authors from the the email list provided
        # we are provided only emails in the headless mode
        # TODO! Support both name -> email and email formats
        author_emails = []
        for email in r.local_usernames:
            author_emails.append(email)

        if author_emails:
            al = AnalyzeLibraries(r.commits, author_emails,
                                  repo.working_tree_dir)
            libs = al.get_libraries()

            # combine repo stats with libs used
            for i in range(len(r.commits)):
                c = r.commits[i]
                if c.hash in libs.keys():
                    r.commits[i].libraries = libs[c.hash]

    if not skip_obfuscation:
        r = obfuscate(r)

    er = ExportResult(r)
    er.export_to_json_headless(output)
    print('Successfully analysed the repo ==>' + reponame)
Exemple #3
0
def init_headless(directory,
                  skip_obfuscation,
                  output,
                  parse_libraries,
                  emails,
                  debug_mode,
                  user_commits,
                  reponame,
                  skip,
                  commit_size_limit,
                  file_size_limit,
                  seed,
                  timeout_seconds=600):
    # Initialize logger
    logger = logging.getLogger("main")
    if debug_mode:
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler('extractor_debug_info.log')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    else:
        logger.setLevel(logging.WARNING)

    repo = git.Repo(directory)
    ar = AnalyzeRepo(repo)
    q = Questions()
    timer = Timer(timeout_seconds, timeout)
    timer.start()
    # Use a context manager with signal to measure seconds, and timeout
    try:
        print('Initialization...')
        for branch in repo.branches:
            ar.create_commits_entity_from_branch(branch.name)
        ar.flag_duplicated_commits()
        ar.get_commit_stats()
        print('Analysing the master branch..')
        ar.analyse_master_user_commits(user_commits)
        print('Creating the repo entity..')
        r = ar.create_repo_entity(directory)

        r.local_usernames = list(set(r.local_usernames + emails))
        MAX_EMAIL_LIMIT = 50
        if len(r.local_usernames) > MAX_EMAIL_LIMIT:
            print("Email count (" + str(len(r.local_usernames)) +
                  ") for this repo exceeds the limit of " +
                  str(MAX_EMAIL_LIMIT) + " emails.")
            r.local_usernames = r.local_usernames[0:MAX_EMAIL_LIMIT]
        print('Setting the local user_names ::', r.local_usernames)
        r.repo_name = reponame

        if parse_libraries and len(ar.commit_list) > 0:
            # build authors from the the email list provided
            # we are provided only emails in the headless mode
            # TODO! Support both name -> email and email formats
            author_emails = []
            for email in r.local_usernames:
                author_emails.append(email)

            if author_emails:
                al = AnalyzeLibraries(r.commits, author_emails,
                                      repo.working_tree_dir, skip,
                                      commit_size_limit, file_size_limit)
                libs = al.get_libraries()

                # combine repo stats with libs used
                for i in range(len(r.commits)):
                    c = r.commits[i]
                    if c.hash in libs.keys():
                        r.commits[i].libraries = libs[c.hash]

            # new email detection
            try:
                emails_v2 = match_emails(directory, seed)
                r.emails_v2 = emails_v2["emails"]
            except:
                r.emails_v2 = list()

        if not skip_obfuscation:
            r = obfuscate(r)

        er = ExportResult(r)
        er.export_to_json_headless(output)
        print('Successfully analysed the repo ==>' + reponame)
    except KeyboardInterrupt:
        print("{} timeouted after {} seconds.".format(repo.working_dir,
                                                      timeout_seconds))
        print("Deleting", repo.working_dir)
        try:
            shutil.rmtree(repo.working_dir)
        except (PermissionError, NotADirectoryError, Exception) as e:
            print("Error when deleting {}. Message: {}".format(
                repo.working_dir, str(e)))
    finally:
        timer.cancel()