def __init__(self, browser='PhantomJS', *a, **kw):
    super(ApacheSpider, self).__init__(*a, **kw)
    self.log = logging.getLogger(__name__)
    if browser == "PhantomJS":
        self.driver = webdriver.PhantomJS()
    elif browser == "Chrome":
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--test-type")
        # chrome_options= is the Selenium 3 spelling; Selenium 4 renamed it to options=
        self.driver = webdriver.Chrome(chrome_options=options)
    # note: any other value for `browser` leaves self.driver unset
    SessionWrapper.load_config('../../db/cfg/setup.yml')
    SessionWrapper.new(init=True)
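# A minimal companion sketch (assumed, not in the original spider): Scrapy passes
# command-line arguments given via `-a` to the spider constructor, so the backend
# can be chosen at launch time, e.g. `scrapy crawl apache -a browser=Chrome`
# (the spider name 'apache' is hypothetical). Scrapy also invokes a spider's
# closed() method, when defined, at the end of the crawl; quitting the WebDriver
# there avoids leaking PhantomJS/chromedriver processes.
def closed(self, reason):
    self.driver.quit()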
def get_projects():
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=False)
    projects = session.query(ApacheProject.name,
                             ApacheProject.repository_url).filter_by(
                                 repository_type='git').all()
    return projects
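# A minimal usage sketch (hypothetical driver code): the query above returns
# (name, repository_url) row tuples, so they unpack directly.
if __name__ == '__main__':
    for name, repository_url in get_projects():
        print('%s -> %s' % (name, repository_url))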
if all_emails:
    resume_month = already_parsed_uid_project_month(aliases, p.name)
    liwc_errors = get_score_by_month(uid, p.name, all_emails, resume_month,
                                     nlon, nlon_model)
    del all_emails
else:
    logger.debug('No emails from %s <%s> to project \'%s\' mailing lists' %
                 (uid, alias_email_addresses, p.name))
logger.info('Done processing project %s' % p.name)
if liwc_errors:
    return True
return False


if __name__ == '__main__':
    logger = logging_config.get_logger('big5_personality',
                                       console_level=logging.DEBUG)
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)

    if len(sys.argv) >= 2:
        tool = sys.argv[1]
    else:
        logger.error('Missing mandatory first param for tool: '
                     '\'liwc15\' or \'p_insights\' expected')
        sys.exit(-1)

    if len(sys.argv) > 2 and sys.argv[2] == 'reset':
        reset_personality_table()

    try:
        # boolean flag recording whether LIWC errors occurred
        liwc_errors = main()
        if tool == 'liwc15':
            if not liwc_errors:
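# Invocation sketch (the script name is assumed for illustration): the first
# argument selects the scoring backend, the optional second one clears previously
# computed scores ('p_insights' presumably maps to IBM Personality Insights):
#
#   python big5_personality.py liwc15
#   python big5_personality.py p_insights
#   python big5_personality.py liwc15 reset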
import itertools
import os
import traceback
from datetime import datetime, timedelta, timezone

import pygit2
from sqlalchemy import and_, func
from sqlalchemy.orm import exc


def parse_commits(slug, repos_folder):
    contributors = {}
    counter = itertools.count(start=1)
    basic_classifier = BasicFileTypeClassifier()
    session = SessionWrapper.new()
    logger.info('Parsing commits, commit files, and developers from project %s' % slug)
    try:
        folder_path = os.path.abspath(os.path.join(repos_folder, slug))
        min_commit = datetime.now(timezone.utc)
        max_commit = min_commit - timedelta(days=100 * 365)
        total_commits = 0

        if not os.path.exists(folder_path):
            return slug

        try:
            db_repo = session.query(GithubRepository).filter_by(slug=slug).one()
            if db_repo.total_commits == 0:
                # first delete the empty row, likely produced by an interrupted execution,
                # then raise the exception to attempt a new parsing
                try:
                    session.delete(db_repo)
                    session.commit()
                except Exception:
                    logger.error('Error trying to delete empty row for repository %s' % slug)
                    return
                raise exc.NoResultFound
            else:
                # the reason why we return here is to skip analyzing again
                # a repo in case of a crashing exception that forces the
                # script to be run again
                logger.debug('Project %s seems to have been already processed, skipping' % slug)
                return slug
        except exc.NoResultFound:
            db_repo = GithubRepository(slug, min_commit, max_commit, total_commits)
            session.add(db_repo)
            session.commit()
        except exc.MultipleResultsFound:
            logger.warning(msg="Found multiple results querying for repo %s." % slug)

        git_repo = pygit2.Repository(folder_path)
        last = git_repo[git_repo.head.target]

        # Fetch all commits as an iterator, and iterate over it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)
            total_commits += 1

            sha = commit.get_sha()
            authored_datetime = commit.get_authored_date()
            committed_datetime = commit.get_committed_date()

            # track the overall time span of the repo's history
            if authored_datetime < min_commit:
                min_commit = authored_datetime
            if authored_datetime > max_commit:
                max_commit = authored_datetime

            (author_name, author_email) = commit.get_author()
            (author_name_l, author_email_l) = (author_name.lower(), author_email.lower())
            (committer_name, committer_email) = commit.get_committer()
            (committer_name_l, committer_email_l) = (committer_name.lower(), committer_email.lower())

            # assign incremental local ids to distinct (name, email) pairs
            if (author_name_l, author_email_l) not in contributors:
                contributors[(author_name_l, author_email_l)] = next(counter)
            author_id = contributors[(author_name_l, author_email_l)]

            if (committer_name_l, committer_email_l) not in contributors:
                contributors[(committer_name_l, committer_email_l)] = next(counter)
            committer_id = contributors[(committer_name_l, committer_email_l)]

            parents = commit.get_parents()
            num_parents = len(parents)
            if not num_parents:  # skip root commits, which have no parent to diff against
                continue

            message = commit.get_message().strip()

            try:
                db_commit = session.query(Commit).filter_by(sha=sha).one()
                continue  # if already present, skip and go on analyzing the next one
            except exc.NoResultFound:
                diff = commit.get_diff(git_repo)

                loc_added = diff.stats.insertions
                loc_deleted = diff.stats.deletions
                num_files_touched = diff.stats.files_changed

                # get info about changes to src files in the new commit
                all_files, src_files, num_src_files_touched, src_loc_added, src_loc_deleted = \
                    CommitWrapper.get_src_changes(basic_classifier, diff)

                db_commit = Commit(db_repo.id, sha, authored_datetime, author_id,
                                   committer_id, message, num_parents,
                                   loc_added, loc_deleted, num_files_touched,
                                   all_files, src_loc_added, src_loc_deleted,
                                   num_src_files_touched, src_files)
                session.add(db_commit)
                # required to flush the pending data before adding to the CommitFiles table below
                session.commit()

                # parse changed files per diff
                for patch in diff:
                    commit_file = os.path.basename(patch.delta.new_file.path)
                    try:
                        commit_file = session.query(CommitFiles).filter_by(
                            commit_sha=sha, repo_slug=slug, file=commit_file).one()
                        continue  # if already present, skip and go on analyzing the next one
                    except exc.NoResultFound:
                        lang = basic_classifier.label_file(commit_file)
                        # count added and deleted lines hunk by hunk
                        loc_ins = 0
                        loc_del = 0
                        for hunk in patch.hunks:
                            for hl in hunk.lines:
                                if hl.origin == '-':
                                    loc_del += 1
                                elif hl.origin == '+':
                                    loc_ins += 1
                        commit_file = CommitFiles(db_repo.id, db_repo.slug, sha,
                                                  commit_file, loc_ins, loc_del, lang)
                        session.add(commit_file)
                        session.commit()

        # store the contributors seen in this repo, ordered by their local id
        for (name, email), user_id in sorted(contributors.items(), key=lambda e: e[1]):
            try:
                db_user = session.query(GithubDeveloper).filter(
                    and_(GithubDeveloper.name == func.binary(name),
                         GithubDeveloper.email == func.binary(email),
                         GithubDeveloper.repo_id == db_repo.id)).one()
            except exc.NoResultFound:
                db_user = GithubDeveloper(db_repo.id, user_id, name, email)
                session.add(db_user)
            except exc.MultipleResultsFound:
                # Could this happen because we allow name aliases during mining?
                # Should we deal with it? And how?
                logger.warning(
                    msg="Multiple entries for user \'{0}\' <{1}> in repo {2}".format(
                        name, email, db_repo.slug))

        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)
        session.commit()

        logger.info('Done')
        return slug
    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()
    finally:
        return slug
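# A minimal driver sketch (hypothetical, not in the original module): walk a list
# of locally cloned repositories sequentially. Since parse_commits() always
# returns the slug from its finally block, it also maps cleanly over a
# multiprocessing.Pool for parallel mining.
if __name__ == '__main__':
    repos_folder = 'repos'  # assumed layout: one checkout per slug under repos/
    for slug in ['apache/kafka', 'apache/commons-lang']:  # example slugs, hypothetical
        parse_commits(slug, repos_folder)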