def print_to_disk(issues, results_folder):
    """
    Dump the given JIRA issues to the file "issues-jira.list" in the result folder.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-jira.list" output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "issues-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # one row per issue, followed by one row per comment of that issue
    rows = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))
        author = issue["author"]
        rows.append((author["name"], author["email"], issue["externalId"],
                     issue["creationDate"], issue["externalId"], issue["type"]))
        for comment in issue["comments"]:
            commenter = comment["author"]
            rows.append((commenter["name"], commenter["email"], comment["id"],
                         comment["changeDate"], issue["externalId"], "comment"))

    # append the rows to the output file
    csv_writer.write_to_csv(output_file, rows, append=True)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues.list" in result folder. This format is outdated
    but still used by the network library. TODO When the network library is
    updated, this method can be overwritten by "print_to_disk_new".

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # flatten the issues into one row per event
    rows = [
        (issue["number"], issue["state"], issue["created_at"],
         issue["closed_at"], issue["isPullRequest"],
         event["user"]["name"], event["user"]["email"], event["created_at"],
         # an empty ref_target has no "name" entry to look up
         "" if event["ref_target"] == "" else event["ref_target"]["name"],
         event["event"])
        for issue in issues
        for event in issue["eventsList"]
    ]

    # write to output file
    csv_writer.write_to_csv(output_file, rows)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-github.list" in the results folder.

    Duplicate rows are removed before writing; the first occurrence of each
    row is kept and the overall order is preserved.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-github.list" output file
    """
    # construct path to output file
    output_file = os.path.join(results_folder, "issues-github.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output: one row per event of each issue
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((
                issue["number"],
                issue["title"],
                json.dumps(issue["type"]),
                issue["state_new"],
                json.dumps(issue["resolution"]),
                issue["created_at"],
                issue["closed_at"],
                json.dumps([]),  # components
                event["event"],
                event["user"]["name"],
                event["user"]["email"],
                event["created_at"],
                event["event_info_1"],
                json.dumps(event["event_info_2"])))

    # remove duplicate lines, keeping the first occurrence of each line.
    # (This replaces the former "sorted(set(lines), key=lambda line:
    # lines.index(line))" construct, which produced the same result but was
    # O(n^2) because "lines.index" rescans the whole list for every element.)
    seen = set()
    unique_lines = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)

    # write to output file
    csv_writer.write_to_csv(output_file, unique_lines)
def print_to_disk_extr(issues, results_folder):
    """
    Print issues to file "issues.list" in result folder

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # columns shared by every row of this issue
        issue_prefix = (issue["externalId"], issue["state"],
                        issue["creationDate"], issue["resolveDate"],
                        False)  # value of is.pull.request

        # one "open" and one "commented" row for the issue author ...
        author_columns = (issue["author"]["name"], issue["author"]["email"],
                          issue["creationDate"], "")  # "" = ref.name
        lines.append(issue_prefix + author_columns + ("open",))
        lines.append(issue_prefix + author_columns + ("commented",))

        # ... plus one "commented" row per comment
        for comment in issue["comments"]:
            lines.append(issue_prefix +
                         (comment["author"]["name"], comment["author"]["email"],
                          comment["changeDate"],
                          "",  # ref.name
                          "commented"))  # event.name

    # append the rows to the output file
    csv_writer.write_to_csv(output_file, lines, append=True)
def parse(mbox_name, results_folder, include_filepath, files_as_artifacts, reindex, append_result):
    """Parse the given mbox file with the commit information from the results folder.

    :param mbox_name: the mbox file to search in
    :param results_folder: the results folder for index and commit information
    :param include_filepath: indicator whether to use the 'file name' part of the artifact into account
    :param files_as_artifacts: indicator whether to search for files (base names) as artifacts
    :param reindex: force reindexing if True
    :param append_result: flag whether to append the results for the current mbox file to the output file
    """
    # load mbox file
    mbox = mailbox.mbox(mbox_name)

    # create schema for text search
    analyzer = StandardAnalyzer(
        expression=r"[^\s,:\"']+"
    )  # split by whitespace, commas, colons, and quotation marks.
    schema = Schema(messageID=ID(stored=True), content=TEXT(analyzer=analyzer))

    # create/load index (initialize if necessary)
    ix = __get_index(mbox, mbox_name, results_folder, schema, reindex)

    # extract artifacts from results folder
    artifacts = __get_artifacts(results_folder, files_as_artifacts)

    # parallelize execution call for the text search
    log.info("Start parsing...")
    num_cores = multiprocessing.cpu_count()
    # leave one core for the rest of the system, but always use at least one
    # worker: "num_cores - 1" would be 0 on a single-core machine, and
    # joblib's Parallel rejects n_jobs=0 with a ValueError
    num_jobs = max(1, num_cores - 1)
    csv_data = Parallel(n_jobs=num_jobs)(
        delayed(__parse_execute)(commit, schema, ix, include_filepath)
        for commit in artifacts)
    log.info("Parsing finished.")

    # re-arrange results; only prepend the CSV header when starting a fresh file
    result = []
    if not append_result:
        result.append(('file', 'artifact', 'messageID'))
    for entry in csv_data:
        for row in entry:
            result.append(row)

    # determine output file
    filename = "mboxparsing"
    if files_as_artifacts:
        filename += "_file"
    if include_filepath:
        filename += "_filepath.list"
    else:
        filename += ".list"
    output_file = os.path.join(results_folder, filename)

    # Writes found hits to file.
    log.info("Writing results to file {}.".format(output_file))
    csv_writer.write_to_csv(output_file, result, append=append_result)
    log.info("Parsing mbox file complete!")
def print_to_disk_gephi(issues, results_folder):
    """
    Print issues to file "issues-jira-gephi-nodes.csv" and "issues-jira-gephi-edges.csv"
    in result folder. The files can be used to build dynamic networks in Gephi.

    :param issues: the issues to dump
    :param results_folder: the folder where to place the two output files
    """
    # construct paths to the two output files
    output_file_nodes = os.path.join(results_folder, "issues-jira-gephi-nodes.csv")
    output_file_edges = os.path.join(results_folder, "issues-jira-gephi-edges.csv")
    log.info("Dumping output in file '{}'...".format(output_file_nodes))
    log.info("Dumping output in file '{}'...".format(output_file_edges))

    # construct lines of output, starting with the Gephi header rows
    node_lines = []
    edge_lines = []
    node_lines.append(("Id", "Type"))
    edge_lines.append(("Source", "Target", "Timestamp", "Edgetype"))
    for issue in issues:
        node_lines.append((issue["externalId"], "Issue"))
        node_lines.append((issue["author"]["name"], "Person"))
        edge_lines.append((issue["author"]["name"], issue["externalId"],
                           issue["creationDate"], "Person-Issue"))
        for comment in issue["comments"]:
            node_lines.append((comment["id"], "Comment"))
            node_lines.append((comment["author"]["name"], "Person"))
            edge_lines.append((issue["externalId"], comment["id"],
                               comment["changeDate"], "Issue-Comment"))
            # bug fix: the timestamp used to be the literal list ["changeDate"]
            # instead of the comment's actual change date
            edge_lines.append((comment["author"]["name"], comment["id"],
                               comment["changeDate"], "Person-Comment"))

    # write to output files
    csv_writer.write_to_csv(output_file_edges, edge_lines, append=True)
    csv_writer.write_to_csv(output_file_nodes, node_lines, append=True)
def print_to_disk(issues, results_folder):
    """Print issues to file 'issues.list' in result folder

    :param issues: the issues to dump
    :param results_folder: the folder where to place 'issues.list' output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # one output row per event of each issue
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            ref_target = event["ref_target"]
            # an empty ref_target carries no name to extract
            ref_name = "" if ref_target == "" else ref_target["name"]
            lines.append((issue["number"], issue["state"], issue["created_at"],
                          issue["closed_at"], issue["isPullRequest"],
                          event["user"]["name"], event["user"]["email"],
                          event["created_at"], ref_name, event["event"]))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-github.list" in result folder. This format is
    outdated but still used by the network library. TODO When the network library
    is updated, this method can be overwritten by "print_to_disk_new".

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-github.list" output file
    """
    # target file for the CSV output
    target = os.path.join(results_folder, "issues-github.list")
    log.info("Dumping output in file '{}'...".format(target))

    rows = []
    for issue in issues:
        # the issue-level columns are identical for all events of one issue
        issue_columns = (issue["number"],
                         issue["title"],
                         json.dumps(issue["type"]),
                         issue["state_new"],
                         json.dumps(issue["resolution"]),
                         issue["created_at"],
                         issue["closed_at"],
                         json.dumps([]))  # components
        for event in issue["eventsList"]:
            rows.append(issue_columns +
                        (event["event"],
                         event["user"]["name"],
                         event["user"]["email"],
                         event["created_at"],
                         event["event_info_1"],
                         json.dumps(event["event_info_2"])))

    # write to output file
    csv_writer.write_to_csv(target, rows)
def print_to_disk_new(issues, results_folder):
    """
    Print issues to file "new_format.list" in result folder. This file has a
    consistent format to the "bugs-jira.list" file. TODO When the network library
    is updated, this is the format which shall be used.

    :param issues: the issues to dump
    :param results_folder: the folder where to place the output file
    """
    # target file for the CSV output
    out_path = os.path.join(results_folder, "new_format.list")
    log.info("Dumping output in file '{}'...".format(out_path))

    rows = []
    for issue in issues:
        # issue-level columns shared by every event row of this issue
        shared = (issue["number"], issue["title"], issue["type"],
                  issue["state_new"], issue["resolution"],
                  issue["created_at"], issue["closed_at"])
        for event in issue["eventsList"]:
            rows.append(shared +
                        ([],) +  # components (fresh list per row)
                        (event["event"],
                         event["user"]["name"],
                         event["user"]["email"],
                         event["created_at"],
                         event["event_info_1"],
                         event["event_info_2"]))

    # write to output file
    csv_writer.write_to_csv(out_path, rows)
def run_postprocessing(conf, resdir, backup_data):
    """
    Runs the postprocessing for the given parameters, that is, read the disambiguation file of the project and
    replace all author names and e-mail addresses in all other .list files according to the disambiguation file.
    If backuping the data is enabled, all the .list files of the results dir are copied to a backup results dir
    (which has the suffix '_bak'). If this backup results dir already exists, no backup is performed.

    :param conf: the Codeface configuration object
    :param resdir: the Codeface results dir, where output files are written
    :param backup_data: whether to backup the current .list files before performing the postprocessing
    """
    # optionally copy the current results to "<tagging>_bak" before modifying anything
    if backup_data:
        log.info("%s: Backup current data" % conf["project"])
        results_path = path.join(resdir, conf["project"], conf["tagging"])
        results_path_backup = path.join(resdir, conf["project"], conf["tagging"] + "_bak")
        perform_data_backup(results_path, results_path_backup)
        log.info("%s: Backup of current data complete!"
                 % conf["project"])

    # names of the .list files this postprocessing may touch
    authors_list = "authors.list"
    commits_list = "commits.list"
    emails_list = "emails.list"
    issues_github_list = "issues-github.list"
    issues_jira_list = "issues-jira.list"
    bugs_jira_list = "bugs-jira.list"
    bots_list = "bots.list"

    # When looking at elements originating from json lists, we need to consider quotation marks around the string
    quot_m = "\""

    data_path = path.join(resdir, conf["project"], conf["tagging"])

    # Correctly replace author 'GitHub <*****@*****.**>' in the commit data and in "commit_added" events of the
    # GitHub issue data and remove this author in the author data, bot data, and e-mail data
    fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list)

    log.info("%s: Postprocess authors after manual disambiguation" % conf["project"])

    disambiguation_list = path.join(data_path, "disambiguation-after-db.list")

    # Check if a disambiguation list exists - if not, just stop
    if path.exists(disambiguation_list):
        disambiguation_data = csv_writer.read_from_csv(disambiguation_list)
    else:
        log.info("Disambiguation file does not exist: %s", disambiguation_list)
        log.info("No postprocessing performed!")
        return

    # NOTE(review): the disambiguation rows are accessed by index below; the code reads
    # person[1]/person[2] as the canonical (name, e-mail) to substitute in, and
    # person[3]/person[4]/person[5] as the duplicate entry to match/remove —
    # presumably (id, name, e-mail); verify against the disambiguation file format.

    # Check for all files in the result directory of the project whether they need to be adjusted
    for filepath, dirnames, filenames in walk(data_path):

        # (1) Adjust authors lists
        if authors_list in filenames:
            f = path.join(filepath, authors_list)
            log.info("Postprocess %s ...", f)
            author_data = csv_writer.read_from_csv(f)
            author_data_to_remove = []
            author_data_new = []
            # get persons which should be removed
            for person in disambiguation_data:
                author_data_to_remove.append([person[3], person[4], person[5]])
            for author in author_data:
                # keep author entry only if it should not be removed
                if not author in author_data_to_remove:
                    author_data_new.append(author)
            csv_writer.write_to_csv(f, author_data_new)

        # (2) Adjust commits lists
        if commits_list in filenames:
            f = path.join(filepath, commits_list)
            log.info("Postprocess %s ...", f)
            commit_data = csv_writer.read_from_csv(f)
            for person in disambiguation_data:
                for commit in commit_data:
                    # replace author if necessary (commit columns 2/3 = author name/e-mail)
                    if person[4] == commit[2] and person[5] == commit[3]:
                        commit[2] = person[1]
                        commit[3] = person[2]
                    # replace committer if necessary (commit columns 5/6 = committer name/e-mail)
                    if person[4] == commit[5] and person[5] == commit[6]:
                        commit[5] = person[1]
                        commit[6] = person[2]
            csv_writer.write_to_csv(f, commit_data)

        # (3) Adjust emails lists
        if emails_list in filenames:
            f = path.join(filepath, emails_list)
            log.info("Postprocess %s ...", f)
            email_data = csv_writer.read_from_csv(f)
            for person in disambiguation_data:
                for email in email_data:
                    # replace author if necessary (email columns 0/1 = sender name/e-mail)
                    if person[4] == email[0] and person[5] == email[1]:
                        email[0] = person[1]
                        email[1] = person[2]
            csv_writer.write_to_csv(f, email_data)

        # (4) Adjust issues lists (github)
        if issues_github_list in filenames:
            f = path.join(filepath, issues_github_list)
            log.info("Postprocess %s ...", f)
            issue_data = csv_writer.read_from_csv(f)
            for person in disambiguation_data:
                for issue_event in issue_data:
                    # replace author if necessary (event columns 9/10 = author name/e-mail)
                    if person[4] == issue_event[9] and person[
                            5] == issue_event[10]:
                        issue_event[9] = person[1]
                        issue_event[10] = person[2]
                    # replace person in event info 1/2 if necessary
                    # (column 13 stems from a JSON list, hence the surrounding quotation marks)
                    if person[4] == issue_event[12] and (
                            quot_m + person[5] + quot_m) == issue_event[13]:
                        issue_event[12] = person[1]
                        issue_event[13] = quot_m + person[2] + quot_m
            csv_writer.write_to_csv(f, issue_data)

        # (5) Adjust issues lists (jira)
        if issues_jira_list in filenames:
            f = path.join(filepath, issues_jira_list)
            log.info("Postprocess %s ...", f)
            issue_data = csv_writer.read_from_csv(f)
            for person in disambiguation_data:
                for issue_event in issue_data:
                    # replace author if necessary (event columns 9/10 = author name/e-mail)
                    if person[4] == issue_event[9] and person[
                            5] == issue_event[10]:
                        issue_event[9] = person[1]
                        issue_event[10] = person[2]
                    # replace person in event info 1/2 if necessary
                    if person[4] == issue_event[12] and (
                            quot_m + person[5] + quot_m) == issue_event[13]:
                        issue_event[12] = person[1]
                        issue_event[13] = quot_m + person[2] + quot_m
            csv_writer.write_to_csv(f, issue_data)

        # (6) Adjust bugs lists (jira)
        if bugs_jira_list in filenames:
            f = path.join(filepath, bugs_jira_list)
            log.info("Postprocess %s ...", f)
            bug_data = csv_writer.read_from_csv(f)
            for person in disambiguation_data:
                for bug_event in bug_data:
                    # replace author if necessary (event columns 9/10 = author name/e-mail)
                    if person[4] == bug_event[9] and person[5] == bug_event[10]:
                        bug_event[9] = person[1]
                        bug_event[10] = person[2]
                    # replace person in event info 1/2 if necessary
                    if person[4] == bug_event[12] and (
                            quot_m + person[5] + quot_m) == bug_event[13]:
                        bug_event[12] = person[1]
                        bug_event[13] = quot_m + person[2] + quot_m
            csv_writer.write_to_csv(f, bug_data)

        # (7) Adjust bots list
        if bots_list in filenames:
            f = path.join(filepath, bots_list)
            log.info("Postprocess %s ...", f)
            bot_data = csv_writer.read_from_csv(f)
            bot_data_new = []
            # maps a (name, e-mail) pair to its single, merged bot entry
            bot_names_and_emails = dict()
            for person in disambiguation_data:
                for bot in bot_data:
                    # replace author if necessary (bot columns 0/1 = name/e-mail)
                    if person[4] == bot[0] and person[5] == bot[1]:
                        bot[0] = person[1]
                        bot[1] = person[2]
            # check for duplicate bot entries (duplicates may arise from the replacements above)
            for bot in bot_data:
                # check if the bot is not already in the dict and add it
                if (bot[0], bot[1]) not in bot_names_and_emails:
                    bot_names_and_emails[(bot[0], bot[1])] = bot
                else:
                    # the bot is already in the list, check if there are different predictions
                    stored_bot = bot_names_and_emails[(bot[0], bot[1])]
                    if stored_bot[2] != bot[2]:
                        # if either of the predictions is bot, keep bot
                        if (stored_bot[2] == "Bot" or bot[2] == "Bot"):
                            stored_bot[2] = "Bot"
                            bot_names_and_emails[(bot[0], bot[1])] = stored_bot
                        # otherwise, if either of the predictions is human, keep human
                        elif (stored_bot[2] == "Human" or bot[2] == "Human"):
                            stored_bot[2] = "Human"
                            bot_names_and_emails[(bot[0], bot[1])] = stored_bot
            # determine final bot entries (deduplicated, with merged predictions)
            for bot in bot_data:
                updated_bot = bot_names_and_emails[(bot[0], bot[1])]
                if updated_bot not in bot_data_new:
                    bot_data_new.append(updated_bot)
            csv_writer.write_to_csv(f, bot_data_new)

    log.info("Postprocessing complete!")
def print_to_disk_bugs(issues, results_folder):
    """
    Sorts out bug issues and prints them to file "bugs-jira.list" in result folder.
    This method prints in a new format which is consistent to the format of
    "print_to_disk_new" in "issue_processing.py". TODO When the network library is
    updated this format shall be used in all print to disk methods.

    :param issues: the issues to sort of bugs
    :param results_folder: the folder where to place "bugs-jira.list" output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "bugs-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # only issues with type bug (and their comments/history) end up in the output file
        if "bug" not in issue["type_new"]:
            continue

        # issue-level columns shared by every row of this issue
        prefix = (issue["externalId"], issue["title"], issue["type_new"],
                  issue["state_new"], issue["resolution_new"],
                  issue["creationDate"], issue["resolveDate"],
                  issue["components"])

        # a "created" and a "commented" row for the issue author, both carrying
        # the defaults "open"/["unresolved"] of a freshly created issue
        for event_name in ("created", "commented"):
            lines.append(prefix +
                         (event_name,
                          issue["author"]["name"], issue["author"]["email"],
                          issue["creationDate"],
                          "open",  # default state when created
                          ["unresolved"]))  # default resolution when created

        # one "commented" row per comment
        for comment in issue["comments"]:
            lines.append(prefix +
                         ("commented",
                          comment["author"]["name"], comment["author"]["email"],
                          comment["changeDate"],
                          comment["state_on_creation"],
                          comment["resolution_on_creation"]))

        # one row per history event
        for history in issue["history"]:
            lines.append(prefix +
                         (history["event"],
                          history["author"]["name"], history["author"]["email"],
                          history["date"],
                          history["event_info_1"],
                          history["event_info_2"]))

    # append the rows to the output file
    csv_writer.write_to_csv(output_file, lines, append=True)
def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list):
    """
    Replace the author "GitHub <*****@*****.**>" in both commit and GitHub issue data by the correct author.
    The author "GitHub <*****@*****.**>" is automatically inserted as the committer of a commit that is made when
    editing a file via the web frontend of GitHub. Hence, replace the committer of such commits with the commit's
    author, as author and committer are the same person in such a situation. This also holds for the "commit_added"
    event in GitHub issue data: As this usually uses the committer of a commit as its author, also use the commit's
    author as the author of the "commit_added" event. All other events in the GitHub issue data in which the author
    is "GitHub <*****@*****.**>" are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which
    reference the author "GitHub <*****@*****.**>" are removed from the GitHub issue data.
    In addition, remove the author "GitHub <*****@*****.**>" also from the author data and bot data and remove
    e-mails that have been sent by this author.

    :param data_path: the path to the project data that is to be fixed
    :param issues_github_list: file name of the github issue data
    :param commits_list: file name of the corresponding commit data
    :param authors_list: file name of the corresponding author data
    :param emails_list: file name of the corresponding email data
    :param bots_list: file name of the corresponding bot data
    """
    # the (redacted) identity GitHub inserts for browser-made commits
    github_user = "******"
    github_email = "*****@*****.**"
    # event names relevant for the issue-data fixes below
    commit_added_event = "commit_added"
    mentioned_event = "mentioned"
    subscribed_event = "subscribed"

    def is_github_noreply_author(name, email):
        """
        Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub <*****@*****.**>".
        There are two options in Codeface how this can happen:
        (1) Username is "GitHub" and e-mail address is "*****@*****.**"
        (2) Username is "GitHub" and e-mail address has been replaced by Codeface,
            resulting in "*****@*****.**"

        :param name: the name of the author to be checked
        :param email: the email address of the author to be checked
        :return: whether the given (name, email) pair belongs to the "GitHub <*****@*****.**>" author
        """
        return (name == github_user and
                (email == github_email or email == (github_user + "." + github_email)))

    # Check for all files in the result directory of the project whether they need to be adjusted
    for filepath, dirnames, filenames in walk(data_path):

        # (1) Remove author 'GitHub <*****@*****.**>' from authors list
        if authors_list in filenames:
            f = path.join(filepath, authors_list)
            log.info("Remove author %s <%s> in %s ...",
                     github_user, github_email, f)
            author_data = csv_writer.read_from_csv(f)
            author_data_new = []
            for author in author_data:
                # keep author entry only if it should not be removed
                # (author columns 1/2 = name/e-mail)
                if not is_github_noreply_author(author[1], author[2]):
                    author_data_new.append(author)
            csv_writer.write_to_csv(f, author_data_new)

        # (2) Remove e-mails from author 'GitHub <*****@*****.**>' from all emails.list files
        if emails_list in filenames:
            f = path.join(filepath, emails_list)
            log.info("Remove emails from author %s <%s> in %s ...",
                     github_user, github_email, f)
            email_data = csv_writer.read_from_csv(f)
            email_data_new = []
            for email in email_data:
                # keep author entry only if it should not be removed
                # (email columns 0/1 = sender name/e-mail)
                if not is_github_noreply_author(email[0], email[1]):
                    email_data_new.append(email)
                else:
                    log.warn("Remove email %s as it was sent by %s <%s>.",
                             email[2], email[0], email[1])
            csv_writer.write_to_csv(f, email_data_new)

        # (3) Replace the committer 'GitHub <*****@*****.**>' in all commit.list files
        if commits_list in filenames:
            f = path.join(filepath, commits_list)
            log.info("Replace author %s <%s> in %s ...",
                     github_user, github_email, f)
            commit_data = csv_writer.read_from_csv(f)
            for commit in commit_data:
                # replace committer 'GitHub <*****@*****.**>' by the commit's author
                # (as author and committer are identical when using GitHub's browser interface)
                # (commit columns 2/3 = author name/e-mail, 5/6 = committer name/e-mail)
                if is_github_noreply_author(commit[5], commit[6]):
                    commit[5] = commit[2]
                    commit[6] = commit[3]
            csv_writer.write_to_csv(f, commit_data)

        # (4) Replace author 'GitHub <*****@*****.**>' in all "commit_added" events in the GitHub issue data
        # and remove all other events in which 'GitHub <*****@*****.**>' is either author or referenced.
        if issues_github_list in filenames:
            f = path.join(filepath, issues_github_list)
            log.info("Replace author %s <%s> in %s ...",
                     github_user, github_email, f)
            issue_data = csv_writer.read_from_csv(f)

            # read commit data to resolve a commit hash to its author
            commit_data_file = path.join(data_path, commits_list)
            commit_data = csv_writer.read_from_csv(commit_data_file)
            commit_hash_to_author = {
                commit[7]: commit[2:4]
                for commit in commit_data
            }

            issue_data_new = []
            for event in issue_data:
                # replace author if necessary (event columns 8 = event name, 9/10 = author name/e-mail)
                if is_github_noreply_author(
                        event[9], event[10]) and event[8] == commit_added_event:
                    # extract commit hash from event info 1
                    commit_hash = event[12]
                    # extract commit author from commit data, if available
                    if commit_hash in commit_hash_to_author:
                        event[9] = commit_hash_to_author[commit_hash][0]
                        event[10] = commit_hash_to_author[commit_hash][1]
                        issue_data_new.append(event)
                    else:
                        # the added commit is not part of the commit data. In most cases, this is due to merge commits
                        # appearing in another pull request, as Codeface does not keep track of merge commits. As we
                        # ignore merge commits in the commit data, we consistently ignore them also if they are added
                        # to a pull request. Hence, the corresponding "commit_added" event will be removed now (i.e.,
                        # not added to the new issue data any more).
                        log.warn(
                            "Commit %s is added in the GitHub issue data, but not part of the commit data. " +
                            "Remove the corresponding 'commit_added' event from the issue data...",
                            commit_hash)
                elif is_github_noreply_author(event[9], event[10]):
                    # the event is authored by 'GitHub <*****@*****.**>', but is not a "commit_added" event, so we
                    # neglect this event and remove it now (i.e., not add it to the new issue data any more).
                    # NOTE(review): "form" instead of "from" is a typo in the original runtime log message,
                    # kept byte-identical here.
                    log.warn(
                        "Event %s is authored by %s <%s>. Remove this event form the issue data...",
                        event[8], event[9], event[10])
                elif (is_github_noreply_author(event[12], event[13][1:-1])
                      and (event[8] == mentioned_event
                           or event[8] == subscribed_event)):
                    # the event references 'GitHub <*****@*****.**>', so we neglect this event and remove it now
                    # (i.e., not add it to the new issue data any more).
                    # (event[13] is JSON-quoted, hence the [1:-1] to strip the quotation marks)
                    log.warn(
                        "Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...",
                        event[8], event[9], event[10], event[12], event[13])
                else:
                    issue_data_new.append(event)
            csv_writer.write_to_csv(f, issue_data_new)

        # (5) Remove author 'GitHub <*****@*****.**>' from bots.list
        if bots_list in filenames:
            f = path.join(filepath, bots_list)
            log.info("Remove author %s <%s> from %s ...",
                     github_user, github_email, f)
            bot_data = csv_writer.read_from_csv(f)
            bot_data_new = []
            for entry in bot_data:
                # keep bot entry only if it should not be removed
                # (bot columns 0/1 = name/e-mail)
                if not is_github_noreply_author(entry[0], entry[1]):
                    bot_data_new.append(entry)
                else:
                    log.warn("Remove entry %s <%s> from bots list.",
                             entry[0], entry[1])
            csv_writer.write_to_csv(f, bot_data_new)

    log.info("Replacing GitHub user: Done.")
def print_to_disk_bugs(issues, results_folder, skip_history):
    """Sorts out bug issues and prints them to file 'bugs-jira.list' in result folder

    :param issues: the issues to sort of bugs
    :param results_folder: the folder where to place 'bugs-jira.list' output file
    :param skip_history: flag if history informations got retrieved and can be
                         printed to the output file
    """
    # target file for the CSV output
    output_file = os.path.join(results_folder, "bugs-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue['externalId']))

        # only issues with type bug (and their comments/history) end up in the output file
        if issue['type'] != "Bug":
            continue

        # issue-level columns shared by every row of this issue
        prefix = (issue['externalId'], issue['state'], issue['resolution'],
                  issue['creationDate'], issue['resolveDate'],
                  False)  # value of is.pull.request

        # the "open" row for the issue author, carrying the issue's references,
        # components, and the defaults of a freshly created issue
        lines.append(prefix +
                     (issue['author']['name'], issue['author']['email'],
                      issue['creationDate'],
                      issue['references'],  # ref.name
                      "open",  # event.name
                      issue['components'],
                      "Open",  # default state when created
                      "Unresolved"))  # default resolution when created

        # a "commented" row for the issue author
        lines.append(prefix +
                     (issue['author']['name'], issue['author']['email'],
                      issue['creationDate'],
                      "",  # ref.name
                      "commented",  # event.name
                      "",  # components
                      "Open",  # default state when created
                      "Unresolved"))  # default resolution when created

        # one "commented" row per comment
        for comment in issue["comments"]:
            lines.append(prefix +
                         (comment['author']['name'], comment['author']['email'],
                          comment['changeDate'],
                          "",  # ref.name
                          "commented",  # event.name
                          "",  # components
                          comment['state_on_creation'],
                          comment['resolution_on_creation']))

        # one "updated" row per history event, unless history was not retrieved
        if not skip_history:
            for history in issue['history']:
                lines.append(prefix +
                             (history['author']['name'], history['author']['email'],
                              history['date'],
                              "",  # ref.name
                              "updated",  # event.name
                              "",  # components
                              history['new_state'],
                              history['new_resolution']))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
def run_anonymization(conf, resdir):
    """
    Run the anonymization process for the given parameters, that is, replace names,
    e-mail addresses, message ids, and issue titles with pseudonymized contents in
    all .list files in resdir. Write the anonymized .list files to another
    directory (resdir + "_anonymized").

    :param conf: the Codeface configuration object
    :param resdir: the Codeface results dir, where result files are read from
    """

    authors_list = "authors.list"
    commits_list = "commits.list"
    emails_list = "emails.list"
    issues_github_list = "issues-github.list"
    issues_jira_list = "issues-jira.list"
    bugs_jira_list = "bugs-jira.list"
    bots_list = "bots.list"
    gender_list = "gender.list"
    revisions_list = "revisions.list"  # not to be anonymized, only to be copied to the "anonymized" directory

    # When looking at elements originating from json lists, we need to consider quotation marks around the string
    quot_m = "\""

    data_path = path.join(resdir, conf["project"], conf["tagging"])
    anonymize_path = path.join((resdir + "_anonymized"), conf["project"], conf["tagging"])

    if not path.exists(anonymize_path):
        log.info("Create directory %s", anonymize_path)
        makedirs(anonymize_path)

    log.info("%s: Anonymize authors." % conf["project"])

    # create dictionaries to store mappings from authors to anonymized authors and titles to anonymized titles
    author_to_anonymized_author = dict()
    author_to_anonymized_author_gender = dict()
    i = 0  # counter for anonymized developer ids (regular data)
    i_gender = 0  # counter for anonymized developer ids (gender data)
    title_to_anonymized_title = dict()
    k = 0  # counter for anonymized issue titles

    def anonymize_authors(author_data, i, author_to_anonymized_author, name_only=False):
        """
        Helper function to anonymize author data (i.e., data from the authors.list file).

        :param author_data: the author data to be anonymized (must have been read via
               "csv_writer.read_from_csv")
        :param i: counter for anonymized developer names (i.e., its current start value
               which has not been used yet)
        :param author_to_anonymized_author: dictionary in which to lookup and store mappings
               from (name, e-mail) pairs to anonymized (name, e-mail) pairs for the developers
        :param name_only: whether also the name (without e-mail) should be used as key for the
               dictionary "author_to_anonymized_author". This is necessary if there might be
               lookups using auto-generated and, therefore, different e-mail addresses for the
               same name.
        :return: the anonymized "author_data", the current value of "i" (which has not been
                 used yet), and the updated dictionary "author_to_anonymized_author"
        """
        for author in author_data:
            orig_author = author[1]
            orig_email = author[2]

            # Don't anonymize the deleted user as this one might be needed for filtering
            # (but add it to the dictionary)
            if orig_author == "Deleted user" and orig_email == "*****@*****.**":
                if not (orig_author, orig_email) in author_to_anonymized_author:
                    author_to_anonymized_author[(orig_author, orig_email)] = (orig_author, orig_email)
            else:
                # check whether (name, e-mail) pair isn't already present in the dictionary
                if not (orig_author, orig_email) in author_to_anonymized_author:
                    # check if just the name (without e-mail address) isn't already present
                    if not orig_author in author_to_anonymized_author:
                        # if the author has an empty name, only anonymize their e-mail address
                        if not author[1] == "":
                            author[1] = ("developer" + str(i))
                        author[2] = ("mail" + str(i) + "@dev.org")
                        # add new entry to dictionary (using (name, e-mail) pair as key)
                        author_to_anonymized_author[(orig_author, orig_email)] = (author[1], author[2])
                        # if we allow name-only entries, also add an additional entry to the dictionary
                        if name_only:
                            author_to_anonymized_author[orig_author] = (author[1], author[2])
                        # increment counter as we have generated a new anonymized developer id
                        i += 1
                    else:
                        # as just the name (without e-mail address) is present in the dictionary,
                        # make a lookup for the name only and add a new entry to the dictionary
                        # using the (name, e-mail) pair
                        author_new = author_to_anonymized_author[orig_author]
                        author_to_anonymized_author[(orig_author, orig_email)] = (author_new[0], author_new[1])
                        author[1] = author_new[0]
                        author[2] = author_new[1]
                else:
                    # as the (name, e-mail) pair is present in the dictionary, just make a lookup
                    author_new = author_to_anonymized_author[(orig_author, orig_email)]
                    author[1] = author_new[0]
                    author[2] = author_new[1]

        return author_data, i, author_to_anonymized_author

    def anonymize_issue_events(issue_data, k):
        """
        Helper function to anonymize an issue/bug event list in place.

        Anonymizes the event author (columns 9/10), the person possibly contained in
        event info 1/2 (columns 12/13; column 13 carries surrounding json quotation
        marks), and the issue title (column 1).

        This helper fixes a former copy-paste bug where the bugs-jira list was checked
        against the stale loop variable of the previously processed issues list.

        :param issue_data: the issue event rows (read via "csv_writer.read_from_csv")
        :param k: counter for anonymized issue titles (current unused start value)
        :return: the anonymized "issue_data" and the current value of "k"
        """
        for issue_event in issue_data:
            # anonymize author
            new_author = author_to_anonymized_author[(issue_event[9], issue_event[10])]
            issue_event[9] = new_author[0]
            issue_event[10] = new_author[1]

            # anonymize person in event info 1/2 (strip json quotes from the e-mail for lookup)
            if (issue_event[12], issue_event[13][1:-1]) in author_to_anonymized_author:
                new_person = author_to_anonymized_author[(issue_event[12], issue_event[13][1:-1])]
                issue_event[12] = new_person[0]
                issue_event[13] = quot_m + new_person[1] + quot_m

            # anonymize issue title (re-use pseudonym for already-seen titles)
            if issue_event[1] in title_to_anonymized_title:
                issue_event[1] = title_to_anonymized_title[issue_event[1]]
            else:
                new_title = ("issue-title-" + str(k))
                title_to_anonymized_title[issue_event[1]] = new_title
                issue_event[1] = new_title
                k += 1

        return issue_data, k

    def write_output(f, data, message="Write anonymized data to %s ..."):
        """
        Helper function to write "data" to the output file corresponding to input
        file "f" inside the anonymized directory, creating parent dirs as needed.
        """
        output_path = f.replace(data_path, anonymize_path)
        # create the parent directory of the output file (NOT a directory named
        # after the output file itself, which was a former bug for commits.list)
        if not path.exists(path.dirname(output_path)):
            makedirs(path.dirname(output_path))
        log.info(message, output_path)
        csv_writer.write_to_csv(output_path, data)

    # Check for all files in the result directory of the project whether they need to be anonymized
    for filepath, dirnames, filenames in walk(data_path):

        # (1) Anonymize authors lists
        if authors_list in filenames:
            f = path.join(filepath, authors_list)
            log.info("Anonymize %s ...", f)
            author_data = csv_writer.read_from_csv(f)
            author_data_gender = csv_writer.read_from_csv(f)

            # check if tagging is "feature"
            if conf["tagging"] == "feature":
                # as tagging is "feature", we need to check for the proximity data to keep
                # anonymized ids consistent over both feature and proximity data
                f_proximity = f.replace("feature", "proximity")
                if path.isfile(f_proximity):
                    log.info("Read authors from %s and anonymize them (without dumping to file).", f_proximity)
                    author_data_proximity = csv_writer.read_from_csv(f_proximity)
                    # anonymize authors from proximity data (but just add them to our dictionary,
                    # to be used below for the actual anonymization of the feature data)
                    author_data_proximity, i, author_to_anonymized_author = \
                        anonymize_authors(author_data_proximity, i, author_to_anonymized_author, name_only=True)

            # anonymize authors
            author_data, i, author_to_anonymized_author = \
                anonymize_authors(author_data, i, author_to_anonymized_author)
            author_data_gender, i_gender, author_to_anonymized_author_gender = \
                anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only=True)

            write_output(f, author_data)

        # (2) Anonymize commits lists
        if commits_list in filenames:
            f = path.join(filepath, commits_list)
            log.info("Anonymize %s ...", f)
            commit_data = csv_writer.read_from_csv(f)
            for commit in commit_data:
                # anonymize author
                new_author = author_to_anonymized_author[(commit[2], commit[3])]
                commit[2] = new_author[0]
                commit[3] = new_author[1]
                # anonymize committer
                new_committer = author_to_anonymized_author[(commit[5], commit[6])]
                commit[5] = new_committer[0]
                commit[6] = new_committer[1]
            write_output(f, commit_data)

        # (3) Anonymize emails lists
        if emails_list in filenames:
            f = path.join(filepath, emails_list)
            log.info("Anonymize %s ...", f)
            email_data = csv_writer.read_from_csv(f)
            j = 0
            for email in email_data:
                # anonymize author
                new_author = author_to_anonymized_author[(email[0], email[1])]
                email[0] = new_author[0]
                email[1] = new_author[1]
                # anonymize message id
                email[2] = ("<message" + str(j) + "@message.dev.org>")
                j += 1
            write_output(f, email_data)

        # (4) Anonymize issues lists (github)
        if issues_github_list in filenames:
            f = path.join(filepath, issues_github_list)
            log.info("Anonymize %s ...", f)
            issue_data = csv_writer.read_from_csv(f)
            issue_data, k = anonymize_issue_events(issue_data, k)
            write_output(f, issue_data)

        # (5) Anonymize issues lists (jira)
        if issues_jira_list in filenames:
            f = path.join(filepath, issues_jira_list)
            log.info("Anonymize %s ...", f)
            issue_data = csv_writer.read_from_csv(f)
            issue_data, k = anonymize_issue_events(issue_data, k)
            write_output(f, issue_data)

        # (6) Anonymize bugs lists (jira)
        if bugs_jira_list in filenames:
            f = path.join(filepath, bugs_jira_list)
            log.info("Anonymize %s ...", f)
            bug_data = csv_writer.read_from_csv(f)
            bug_data, k = anonymize_issue_events(bug_data, k)
            write_output(f, bug_data)

        # (7) Anonymize bots list
        if bots_list in filenames:
            f = path.join(filepath, bots_list)
            log.info("Anonymize %s ...", f)
            bot_data = csv_writer.read_from_csv(f)
            for bot in bot_data:
                new_person = author_to_anonymized_author[(bot[0], bot[1])]
                bot[0] = new_person[0]
                bot[1] = new_person[1]
            write_output(f, bot_data)

        # (8) Anonymize gender list
        if gender_list in filenames:
            f = path.join(filepath, gender_list)
            log.info("Anonymize %s ...", f)
            gender_data = csv_writer.read_from_csv(f)
            gender_data_new = []
            for author in gender_data:
                # keep only authors for which a name-only mapping exists
                if author[0] in author_to_anonymized_author_gender.keys():
                    new_person = author_to_anonymized_author_gender[author[0]]
                    author[0] = new_person[0]
                    gender_data_new.append(author)
            write_output(f, gender_data_new)

        # (9) Copy revisions list
        if revisions_list in filenames:
            f = path.join(filepath, revisions_list)
            log.info("Copy %s ...", f)
            revision_data = csv_writer.read_from_csv(f)
            write_output(f, revision_data, message="Copy revision data to %s ...")

    log.info("Anonymization complete!")
def insert_user_data(issues, conf, resdir):
    """
    Insert user data into database and update issue data. In addition, dump
    username-to-user list to file "usernames.list" in resdir.

    :param issues: the issues to retrieve user data from
    :param conf: the project configuration
    :param resdir: the directory in which the username-to-user-list should be dumped
    :return: the updated issue data
    """

    log.info("Syncing users with ID service...")

    # create buffer for users (key: user id)
    user_buffer = dict()
    # create buffer for user ids (key: user string)
    user_id_buffer = dict()
    # create buffer for usernames (key: username)
    username_id_buffer = dict()
    # open database connection
    dbm = DBManager(conf)
    # open ID-service connection
    idservice = idManager(dbm, conf)

    def get_user_string(name, email):
        """Build the "name <email>" string the ID service expects; name only if no e-mail."""
        if not email or email is None:
            return "{name}".format(name=name)
            # return "{name} <{name}@default.com>".format(name=name) # for debugging only
        else:
            return "{name} <{email}>".format(name=name, email=email)

    def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer):
        """
        Resolve a raw user dict to its database person id via the ID service,
        buffering results to reduce the amount of DB queries. Also records the
        username-to-id mapping in "buffer_usernames".

        NOTE: the mutable default arguments are intentional here — they share the
        enclosing function's buffers across calls.
        """
        # Python 2 code: re-encode username as utf-8 bytes for the ID service
        username = unicode(user["username"]).encode("utf-8")

        # fix encoding for name and e-mail address; fall back to username if no name
        if user["name"] is not None:
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = username
        mail = unicode(user["email"]).encode("utf-8")

        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db_ids:
            log.devinfo("Returning person id for user '{}' from buffer.".format(user_string))
            if username is not None:
                buffer_usernames[username] = buffer_db_ids[user_string]
            return buffer_db_ids[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db_ids[user_string] = idx
        # add id to username buffer
        if username is not None:
            buffer_usernames[username] = idx

        return idx

    def get_user_from_id(idx, buffer_db=user_buffer):
        """
        Fetch the user dict (name, e-mail, id) for a database person id, using the
        shared "buffer_db" cache (intentional mutable default) to avoid repeated
        DB queries.
        """
        # check whether user information is in buffer to reduce amount of DB queries
        if idx in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(idx))
            return buffer_db[idx]

        # get person information from ID service
        log.devinfo("Passing user id '{}' to ID service.".format(idx))
        person = idservice.getPersonFromDB(idx)
        user = dict()
        user["email"] = person["email1"]  # column "email1"
        user["name"] = person["name"]  # column "name"
        user["id"] = person["id"]  # column "id"

        # add user information to buffer
        buffer_db[idx] = user

        return user

    # check and update database for all occurring users:
    # first pass replaces every user dict by its database person id
    for issue in issues:
        # check database for issue author
        issue["user"] = get_id_and_update_user(issue["user"])

        # check database for event authors
        for event in issue["eventsList"]:
            event["user"] = get_id_and_update_user(event["user"])

            # check database for the reference-target user if needed
            if event["ref_target"] != "":
                event["ref_target"] = get_id_and_update_user(event["ref_target"])

    # get all users after database updates having been performed:
    # second pass replaces the ids by the canonical user dicts from the database
    for issue in issues:
        # get issue author
        issue["user"] = get_user_from_id(issue["user"])

        # get event authors
        for event in issue["eventsList"]:
            event["user"] = get_user_from_id(event["user"])

            # get the reference-target user if needed
            if event["ref_target"] != "":
                event["ref_target"] = get_user_from_id(event["ref_target"])
                event["event_info_1"] = event["ref_target"]["name"]
                event["event_info_2"] = event["ref_target"]["email"]

    # dump username, name, and e-mail to file, sorted and deduplicated by username
    lines = []
    for username in username_id_buffer:
        user = get_user_from_id(username_id_buffer[username])
        lines.append((username, user["name"], user["email"]))
    log.info("Dump username list to file...")
    username_dump = os.path.join(resdir, "usernames.list")
    csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0]))

    return issues
def print_to_disk_bugs(issues, results_folder):
    """
    Extract bug issues and print them to file "bugs-jira.list" in the results folder.
    This method prints in a format which is consistent to the format of "print_to_disk"
    in "issue_processing.py".

    For each bug issue, the output contains a "created" event, an additional synthetic
    "commented" event for the creation, one "commented" event per comment, and one
    event per history entry.

    :param issues: the issues to extract bugs from
    :param results_folder: the folder where to place "bugs-jira.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "bugs-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # only write issues with type bug and their comments in the output file
        if "bug" not in issue["type_list"]:
            continue

        # issue-level columns shared by every event line of this issue
        issue_prefix = (issue["externalId"],
                        issue["title"],
                        json.dumps(issue["type_list"]),
                        issue["state_new"],
                        json.dumps(issue["resolution_list"]),
                        issue["creationDate"],
                        issue["resolveDate"],
                        json.dumps(issue["components"]))

        def event_line(event_name, author, date, event_info_1, event_info_2):
            """Build one output line: shared issue columns plus event columns."""
            return issue_prefix + (event_name,
                                   author["name"],
                                   author["email"],
                                   date,
                                   event_info_1,
                                   json.dumps(event_info_2))

        # add the creation event ("open"/"unresolved" are the default state/resolution
        # when the issue was created)
        lines.append(event_line("created", issue["author"], issue["creationDate"],
                                "open", ["unresolved"]))

        # add an additional commented event for the creation
        lines.append(event_line("commented", issue["author"], issue["creationDate"],
                                "open", ["unresolved"]))

        # add comment events
        for comment in issue["comments"]:
            lines.append(event_line("commented", comment["author"], comment["changeDate"],
                                    comment["state_on_creation"],
                                    comment["resolution_on_creation"]))

        # add history events
        for history in issue["history"]:
            lines.append(event_line(history["event"], history["author"], history["date"],
                                    history["event_info_1"],
                                    history["event_info_2"]))

    # write to output file (append, as other issue sources write to the same file layout)
    csv_writer.write_to_csv(output_file, lines, append=True)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-jira.list" in result folder.

    Each issue contributes a "created" event, a synthetic "commented" event for the
    creation, one "commented" event per comment, and one event per history entry.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-jira.list" output file
    """

    # the target file inside the results folder
    output_file = os.path.join(results_folder, "issues-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    rows = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # issue-level columns shared by all event rows of this issue
        shared = (issue["externalId"],
                  issue["title"],
                  json.dumps(issue["type_list"]),
                  issue["state_new"],
                  json.dumps(issue["resolution_list"]),
                  issue["creationDate"],
                  issue["resolveDate"],
                  json.dumps(issue["components"]))

        # author columns of the creation events
        creator = (issue["author"]["name"],
                   issue["author"]["email"],
                   issue["creationDate"])

        # the creation is represented by a "created" event plus an additional
        # "commented" event; "open"/["unresolved"] are the defaults when created
        defaults = ("open", json.dumps(["unresolved"]))
        rows.append(shared + ("created",) + creator + defaults)
        rows.append(shared + ("commented",) + creator + defaults)

        # one "commented" row per comment, stamped with the state/resolution at
        # the time the comment was created
        for comment in issue["comments"]:
            rows.append(shared + ("commented",
                                  comment["author"]["name"],
                                  comment["author"]["email"],
                                  comment["changeDate"],
                                  comment["state_on_creation"],
                                  json.dumps(comment["resolution_on_creation"])))

        # one row per history entry, keeping the recorded event name and infos
        for history in issue["history"]:
            rows.append(shared + (history["event"],
                                  history["author"]["name"],
                                  history["author"]["email"],
                                  history["date"],
                                  history["event_info_1"],
                                  json.dumps(history["event_info_2"])))

    # write to output file
    csv_writer.write_to_csv(output_file, rows, append=True)