else: k += 1 # clear pos del pos[:] return results def convert_day_to_period(day): _d = datetime.strptime(day, '%Y-%m-%d') day_as_period = datetime.strftime(_d, '%m-%Y') return day_as_period if __name__ == '__main__': logger = loggingcfg.initialize_logger('SZZ') header = ["user_id", "project", "day", "num_commits", "num_file_touches", "num_src_file_touches", "num_files_touched", "num_src_files_touched", "loc_added", "src_loc_added", "loc_deleted", "src_loc_deleted", "num_bugs_induced", "num_bug_inducing_commits"] idx_user_id = header.index("user_id") idx_project = header.index("project") idx_day = header.index("day") idx_num_commits = header.index("num_commits") idx_num_file_touches = header.index("num_file_touches") idx_num_files_touched = header.index("num_files_touched") idx_loc_added = header.index("loc_added") idx_loc_deleted = header.index("loc_deleted") idx_num_src_file_touches = header.index("num_src_file_touches") idx_num_src_files_touched = header.index("num_src_files_touched")
from github.GithubException import RateLimitExceededException from github.GithubException import GithubException from github.GithubException import BadCredentialsException from github.GithubException import UnknownObjectException from githubutils.BaseGithubThreadedExtractor import BaseGitHubThreadedExtractor from githubutils.Tokens import Tokens from githubutils.NoAvailableTokenException import NoAvailableTokenException from loggingcfg import initialize_logger import numpy as np from numba import jit, prange import pandas as pd import loggingcfg logger = loggingcfg.initialize_logger('SZZ-EXTRACTOR') class IssuesAndCommentExtractor(BaseGitHubThreadedExtractor): __ISSUES_COLUMN_NAMES = ["SLUG", "ID", "NUMBER", "STATE", "CREATED_AT", "CLOSED_AT", "CREATED_BY_LOGIN", "CLOSED_BY_LOGIN", "ASSIGNEE_LOGIN", "TITLE", "NUM_COMMENTS", "LABELS", "IS_PL"] __COMMENTS_COLUMN_NAMES = ["SLUG", "ISSUE_ID", "ISSUE_NUMBER", "COMMENT_ID", "BODY", "CREATED_AT", "UPDATED_AT", "USER_LOGIN", "USER_ID"] @jit(parallel=True) def __to_df(self, issues, slug, g, issue_data, comment_data): repo = g.get_repo(slug) for i in prange(0, np.size(issues)): logger.debug("Looking for issue number %d", issues[i])
def random_selection(population, conf, seed): random.seed(seed) random.shuffle(population) k = sample_size(population_size=len(population), confidence_level=conf, confidence_interval=1.0) if len(population) < k: k = len(population) selection = random.sample(population, k) return selection if __name__ == '__main__': import logging logger = loggingcfg.initialize_logger(name='SZZ:PROLIFICS', console_level=logging.INFO) prolific_infile = "prolifics.txt" #prolific_ids = None prolific_prj = None projects_outfile = "project-list" seed = 895 conf_level = .95 subsample = False processed_ids_filename = "tmp/prolific/temp.ids" processed_prj_filename = "tmp/prolific/temp.projects" seen_ids = set() try: opts, args = getopt.getopt(sys.argv[1:], "hi:o:c:s:r", ["in=", "out=", "conflev=", "seed=", "random", "help"])
from orm.tables import Commit, Repo from orm.issue_comments import IssueComment regex = r'(\S+\/\S+)(#\d+|@\d+)' def replace(_str): _ref = _str.replace("(", "") _ref = _ref.replace(")", "") _ref = _ref.replace("[", "") _ref = _ref.replace("]", "") return _ref if __name__ == '__main__': logger = initialize_logger() session = SessionWrapper.new(init=True) logger.info("Connected to db") logger.info("Retrieving comments from commits") commit_messages = session.query( Repo.slug, Commit.message).filter(Repo.id == Commit.repo_id).all() logger.info("Extracting cross refs from commit messages") idx = 0 for cm in commit_messages: cross_refs = re.finditer(regex, cm.message, re.MULTILINE) for ref in cross_refs: if cm.slug not in ref.group(0): _ref = replace(ref.group(0)) cref = CrossReference(from_slug=cm.slug,
prj_writer.writerow(row) prj_writer.close() logger.info("Done writing %s." % prj_outfile) lang_writer = CsvWriter(csv_file=lang_outfile, mode='w') """ - user_language_date_totalcommits.csv #user_id;language;date;num_commits;num_file_touches;num_files_touched;loc_added;loc_deleted #2;c;2013-09-29;1;1;1;8;0 """ lang_header = ['user_id', 'language', 'day', 'num_commits', 'num_src_file_touches', 'num_src_files_touched', 'src_loc_added', 'src_loc_deleted'] lang_writer.writerow(lang_header) for row in lang_rows: lang_writer.writerow(row) lang_writer.close() logger.info("Done writing %s." % lang_outfile) if __name__ == '__main__': loggingcfg.initialize_logger() target = os.path.abspath(sys.argv[1] + os.path.sep + 'idm' + os.path.sep + 'dict' + os.path.sep + 'aliasMap.dict') if os.path.getsize(target) > 0: with open(target, "rb") as f: unpickler = pickle.Unpickler(f) alias_map = unpickler.load() bc = BasicFileTypeClassifier() export(sys.argv[2:], alias_map, bc)
except Exception as e: traceback.print_exc(e) logger.error(comment, e) continue session.commit() logger.info("New comments added to the database: %s" % str(idx - 1)) if __name__ == '__main__': project_file = None issue_file = None comment_file = None into_db = False logger = initialize_logger(name="SZZ:ISSUES_COMMENTS") try: if not sys.argv[1:]: raise getopt.GetoptError( 'No arguments passed from the command line. See help instructions.' ) opts, args = getopt.getopt(sys.argv[1:], "hf:i:c:", ["from=", "issues=", "comments=", "help"]) for opt, arg in opts: if opt in ("-h", "--help"): print( 'Usage:\n szzExtractIssuesAndComments.py -f|--from=<file> -i|--issues=<file> -c|--comments=<file>' ) sys.exit(0) elif opt in ("-f", "--from"):
if os.path.exists(dest): logger.info( 'Project {0} already available locally in {1}, performing an update.' .format(slug, dest)) try: RepoCloner.pull(dest) RepoCloner.update_submodules(dest) except: logger.error( "Unknown error updating git repo in folder %s" % dest) else: RepoCloner.clone(slug, destination_dir) logger.info('Project repository {0} cloned into {1}'.format( slug, s2f)) RepoCloner.update_submodules(dest) if symlink_dir: try: sym = os.path.join(symlink_dir, s2f) os.symlink(dest, sym) except FileExistsError: logger.debug('Symlink already existing.') pass logger.info('Done.') if __name__ == '__main__': logger = initialize_logger(name='SZZ:CLONE') start(sys.argv[1:])
for ipr in issuepr_messages.itertuples(): cross_refs = re.finditer(regex, getattr(ipr, "BODY"), re.MULTILINE) slug = getattr(ipr, "SLUG") for ref in cross_refs: if slug not in ref.group(0): _ref = __replace(ref.group(0)) cross_references.append([slug, _ref, "issue/pr"]) logger.info("Saving cross references") df = pd.DataFrame(cross_references, columns=["SLUG", "REF", "TYPE"]) df.to_csv(os.path.join(input_dir_path, "cross_references.csv"), index=False) if __name__ == '__main__': logger = initialize_logger(name="CROSS_REF") help_message = 'Usage:\n extractor.py -in|--input=<input_dir> -cp|--commit_pattern=<commit_pattern_file> -ip|--issues_pattern=<issues_pattern_file>' input_dir = None commit_pattern = "*blamed_commit.csv" issues_pattern = "*comments.csv" try: if not sys.argv[1:]: raise getopt.GetoptError( 'No arguments passed from the command line. See help instructions.' ) opts, args = getopt.getopt( sys.argv[1:], "H:in:cp:ip", ["input=", "commit_pattern=", "issues_pattern=", "help"]) for opt, arg in opts: if opt in ("-h", "--help"):
from typing import Dict from mpi4py import MPI from typing import List import time import traceback import hashlib import pandas as pd import itertools from utils import utility comm = MPI.COMM_WORLD rank = comm.Get_rank() mpisize = comm.Get_size() log = loggingcfg.initialize_logger('SZZ-MPI', console_level=logging.INFO) class Szz: __COMMIT_COLUMNS = [ "SLUG", "SHA", "TIMESTAMP", "AUTHOR_ID", "COMMITTER_ID", "MESSAGE", "NUM_PARENTS", "NUM_ADDITIONS", "NUM_DELETIONS", "NUM_FILES_CHANGED", "FILES", "SRC_LOC_ADDED", "SRC_LOC_DELETED", "NUM_SRC_FILES_TOUCHED", "SRC_FILES" ] def __init__(self, repo_path: str, issues_file_path: str, output_folder: str, valid_labels: List[str],
import datetime import logging import os import pickle import sys import time from time import strftime from utils import utility import pandas import getopt import loggingcfg from activityclassifier import BasicFileTypeClassifier logger = loggingcfg.initialize_logger(name='RESULT-EXPORT', console_level=logging.INFO) def replace_alias(aliases, author_id): return aliases[author_id] def parse_timestamp(date, time_unit): gb = None date_d1 = time.strptime('1990-01-01', "%Y-%m-%d") date_d2 = time.strptime(str(date).split(' ')[0], "%Y-%m-%d") year_d1 = int((strftime("%Y", date_d1))) year_d2 = int((strftime("%Y", date_d2))) if time_unit == 'week':