def get_or_update_user(user, buffer_db=user_buffer):
    # fix encoding for name and e-mail address
    if user["name"] is not None:
        name = unicode(user["name"]).encode("utf-8")
    else:
        name = unicode(user["username"]).encode("utf-8")
    mail = unicode(user["email"]).encode("utf-8")

    # construct string for ID service and send query
    user_string = get_user_string(name, mail)

    # check buffer to reduce amount of DB queries
    if user_string in buffer_db:
        log.devinfo("Returning user '{}' from buffer.".format(user_string))
        return buffer_db[user_string]

    # get person information from ID service
    log.devinfo("Passing user '{}' to ID service.".format(user_string))
    idx = idservice.getPersonID(user_string)

    # update user data with person information from DB
    person = idservice.getPersonFromDB(idx)
    user["email"] = person["email1"]  # column "email1"
    user["name"] = person["name"]  # column "name"
    user["id"] = person["id"]  # column "id"

    # add user information to buffer
    # user_string = get_user_string(user["name"], user["email"])  # update for
    buffer_db[user_string] = user

    return user
def __parse_execute(artifact, schema, index, include_filepath):
    """
    Execute the search for the given artifact.

    :param artifact: the (file name, artifact) tuple to search for
    :param schema: the search schema to use
    :param index: the search index to use
    :param include_filepath: indicator whether to take the 'file name' part of the artifact into account
    :return: a list of match tuples (file name, artifact, message ID)
    """

    log.devinfo("Searching for artifact ({}, {})...".format(artifact[0], artifact[1]))

    result = []
    with index.searcher() as searcher:
        # initialize query parser
        query_parser = QueryParser("content", schema=schema)

        # construct query
        if include_filepath:
            my_query = query_parser.parse(artifact[0] + " AND " + artifact[1])
        else:
            my_query = query_parser.parse(artifact[1])

        # search!
        query_result = searcher.search(my_query, terms=True)

        # construct result from query answer
        for r in query_result:
            result_tuple = (artifact[0], artifact[1], r["messageID"])
            result.append(result_tuple)

    return result
def __mbox_getbody(message):
    """
    Get the plain-text e-mail body for better searching.

    :param message: the mbox message to process
    :return: the unicode-encoded message body
    """

    __text_indicator = "text/"

    body = None
    if message.is_multipart():
        for part in message.walk():
            if part.is_multipart():
                for subpart in part.walk():
                    if __text_indicator in subpart.get_content_type():
                        body = subpart.get_payload(decode=True)
            elif __text_indicator in part.get_content_type():
                body = part.get_payload(decode=True)
    elif __text_indicator in message.get_content_type():
        body = message.get_payload(decode=True)

    if body is None:
        log.devinfo(message.get_content_type())
        log.devinfo("An image or some other content has been found that cannot be indexed. "
                    "Message is given an empty body.")
        body = ' '

    return unicode(body, errors="replace")
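# A minimal usage sketch (an assumption, not part of the original module):
# iterate over a local mbox file and extract the plain-text body of each
# message via __mbox_getbody. The path "dev.mbox" is a hypothetical example,
# and the mailbox import is assumed to be available at module level, as it is
# for the indexing code below.
def __print_bodies_example(mbox_path="dev.mbox"):
    for message in mailbox.mbox(mbox_path):
        # log only the first 80 characters of each extracted body
        log.devinfo(__mbox_getbody(message)[:80])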
def load_xml(source_folder):
    """
    Load issues from disk.

    :param source_folder: the folder where to find the .xml-files
    :return: the loaded issue data
    """

    filelist = [f for f in os.listdir(source_folder)
                if os.path.isfile(os.path.join(source_folder, f))]
    issue_data = list()

    for file in filelist:
        srcfile = os.path.join(source_folder, file)
        log.devinfo("Loading issues from file '{}'...".format(srcfile))

        # check if file exists and exit early if not
        if not os.path.exists(srcfile):
            log.info("Issue file '{}' does not exist! Exiting early...".format(srcfile))
            sys.exit(-1)

        # with open(srcfile, 'r') as issues_file:
        xmldoc = parse(srcfile)
        issue_data.append(xmldoc)

    return issue_data
def get_id_and_update_user(user, buffer_db_ids=user_id_buffer):
    # fix encoding for name and e-mail address
    if user["name"] is not None and user["name"] != "":
        name = unicode(user["name"]).encode("utf-8")
    else:
        name = unicode(user["username"]).encode("utf-8")
    mail = unicode(user["email"]).encode("utf-8")  # empty

    # construct string for ID service and send query
    user_string = get_user_string(name, mail)

    # check buffer to reduce amount of DB queries
    if user_string in buffer_db_ids:
        log.devinfo("Returning person id for user '{}' from buffer.".format(user_string))
        return buffer_db_ids[user_string]

    # get person information from ID service
    log.devinfo("Passing user '{}' to ID service.".format(user_string))
    idx = idservice.getPersonID(user_string)

    # add user information to buffer
    # user_string = get_user_string(user["name"], user["email"])  # update for
    buffer_db_ids[user_string] = idx

    return idx
def reformat_issues(issue_data):
    """
    Re-arrange the issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:
        # empty container for issue types
        issue["type"] = []
        # empty container for issue resolutions
        issue["resolution"] = []

        # if an issue has no eventsList, an empty list gets created
        if issue["eventsList"] is None:
            issue["eventsList"] = []
        # if an issue has no commentsList, an empty list gets created
        if issue["commentsList"] is None:
            issue["commentsList"] = []
        # if an issue has no relatedCommits, an empty list gets created
        if issue["relatedCommits"] is None:
            issue["relatedCommits"] = []
        # if an issue has no reviewsList, an empty list gets created
        if issue["reviewsList"] is None:
            issue["reviewsList"] = []
        # if an issue has no relatedIssues, an empty list gets created
        if "relatedIssues" not in issue:
            issue["relatedIssues"] = []

        # add "closed_at" information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # parse the creation time into the correct format
        issue["created_at"] = format_time(issue["created_at"])
        # parse the close time into the correct format
        issue["closed_at"] = format_time(issue["closed_at"])

        # check whether the issue is a pull request or a normal issue and adapt the type
        if issue["isPullRequest"]:
            issue["type"].append("pull request")
        else:
            issue["type"].append("issue")

    return issue_data
def reformat(issue_data):
    """
    Re-arrange the issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:
        # temporary container for references
        comments = dict()

        # initialize the "created" event
        created_event = dict()
        created_event["user"] = issue["user"]
        created_event["created_at"] = issue["created_at"]
        created_event["event"] = "created"
        issue["eventsList"].append(created_event)

        # add event name to each comment and add an empty reference target
        for comment in issue["commentsList"]:
            comment["event"] = "commented"
            comment["ref_target"] = ""
            # cache comment by date to resolve/re-arrange references later
            comments[comment["created_at"]] = comment

        # add reference target to events
        for event in issue["eventsList"]:
            event["ref_target"] = ""

            # if the event collides with a comment
            if event["created_at"] in comments:
                comment = comments[event["created_at"]]

                # if someone gets mentioned or subscribed by someone else in a comment,
                # re-write the reference
                if (event["event"] == "mentioned" or event["event"] == "subscribed") and \
                        comment["event"] == "commented":
                    event["ref_target"] = event["user"]
                    event["user"] = comment["user"]

        # merge the event and comment lists
        issue["eventsList"] = issue["commentsList"] + issue["eventsList"]

        # add 'closed_at' information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # remove events without a user or reference target
        issue["eventsList"] = [event for event in issue["eventsList"]
                               if not (event["user"] is None or event["ref_target"] is None)]

    return issue_data
def load_csv(source_folder):
    """
    Load persons from disk.

    :param source_folder: the folder where to find the .csv-file
    :return: the loaded person data, contained in a dict consisting of two maps:
             keys are either name ("by_name") or username ("by_username"),
             values are name-email pairs
    """

    def find_first_existing(source_folder, filenames):
        """
        Check if any of the given file names exist in the given folder
        and return the first existing one.

        :param source_folder: the folder where to search for the given file names
        :param filenames: the file names to search for
        :return: the first existing file name, None otherwise
        """

        filenames = map(lambda fi: os.path.join(source_folder, fi), filenames)
        existing = map(lambda fi: os.path.exists(fi), filenames)
        first = next((i for (i, x) in enumerate(existing) if x), None)

        if first is not None:
            return filenames[first]
        else:
            return None

    person_files = ("jira-comment-authors-with-email.csv", "jira_issue_comments.csv")
    srcfile = find_first_existing(source_folder, person_files)

    # check if file exists and exit early if not
    if not srcfile:
        log.error("Person files '{}' do not exist! Exiting early...".format(person_files))
        sys.exit(-1)

    log.devinfo("Loading person csv from file '{}'...".format(srcfile))

    with open(srcfile, "r") as f:
        person_data = csv.DictReader(f, delimiter=",", skipinitialspace=True)

        persons_by_username = {}
        persons_by_name = {}
        for row in person_data:
            if not row["AuthorID"] in persons_by_username.keys():
                author_id_utf8 = unicode(row["AuthorID"]).encode("utf-8")
                persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"])
            if not row["AuthorName"] in persons_by_name.keys():
                author_name_utf8 = unicode(row["AuthorName"]).encode("utf-8")
                persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"])

    persons = dict()
    persons["by_username"] = persons_by_username
    persons["by_name"] = persons_by_name

    return persons
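# A minimal usage sketch (an assumption, not part of the original module):
# resolve a Jira author to a (name, email) pair, preferring the username map
# and falling back to the name map. All names except load_csv are hypothetical.
def __lookup_person_example(source_folder, username, name):
    persons = load_csv(source_folder)
    if username in persons["by_username"]:
        return persons["by_username"][username]
    return persons["by_name"].get(name, (None, None))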
def parse(mbox_name, results, include_filepath):
    mbox = mailbox.mbox(mbox_name)

    my_schema = Schema(messageID=ID(stored=True), content=TEXT)
    index_path = results + "/index"

    # Create the index for the Whoosh full-text search.
    # If an index already exists, this step is skipped.
    if (not os.path.exists(index_path)) or (not exists_in(index_path)):
        log.devinfo("Creating index in results folder for text search.")
        os.mkdir(index_path)
        create_in(index_path, my_schema)
        ix = open_dir(index_path)
        writer = ix.writer()
        for message in mbox:
            writer.add_document(messageID=unicode(message['message-id']),
                                content=getbody(message))
        writer.commit()
        log.devinfo("Index created, parsing will begin now.")
    else:
        log.devinfo("Index has already been created, parsing will begin right away.")
        ix = open_dir(index_path)

    # get the search terms from the commits.list file
    commit_list = open(results + "/commits.list", 'r')
    commits_separated = []
    commits = []
    commit_set = set()
    for line in commit_list:
        commits.append(line)
    for commit in commits:
        commit_separated = commit.split(';')
        commit_set.add((commit_separated[9], commit_separated[10]))
        commits_separated.append(commit_separated)

    if include_filepath:
        my_file = open(results + "/mboxParsing_filepath.csv", 'w')
    else:
        my_file = open(results + "/mboxParsing.csv", 'w')

    # parallel execution call for the text search
    num_cores = multiprocessing.cpu_count()
    csv_data = Parallel(n_jobs=num_cores - 1)(
        delayed(execute)(commit, my_schema, ix, include_filepath)
        for commit in commit_set)

    # write found hits to file
    log.devinfo("Parsing done, writing to file commences.")
    wr = csv.writer(my_file, delimiter=';')
    wr.writerow(('file', 'artifact', 'message_id'))
    for entry in csv_data:
        for row in entry:
            wr.writerow(row)
    my_file.close()
    log.devinfo("Writing done and file closed.")
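# An illustrative invocation sketch (an assumption, not part of the original
# module): run the mbox search for a single project. Both paths are made-up
# examples, and the results folder is expected to already contain the
# commits.list file produced by earlier processing steps.
def __parse_example():
    parse("/path/to/project.mbox", "/path/to/results", include_filepath=True)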
def reformat_issues(issue_data):
    """
    Re-arrange the issue data structure.

    :param issue_data: the issue data to re-arrange
    :return: the re-arranged issue data
    """

    log.devinfo("Re-arranging Github issues...")

    # re-process all issues
    for issue in issue_data:
        # empty container for issue types
        issue["type"] = []
        # empty container for issue resolutions
        issue["resolution"] = []

        # if an issue has no eventsList, an empty list gets created
        if issue["eventsList"] is None:
            issue["eventsList"] = []
        # if an issue has no commentsList, an empty list gets created
        if issue["commentsList"] is None:
            issue["commentsList"] = []
        # if an issue has no relatedCommits, an empty list gets created
        if issue["relatedCommits"] is None:
            issue["relatedCommits"] = []
        # if an issue has no relatedIssues, an empty list gets created
        if "relatedIssues" not in issue:
            issue["relatedIssues"] = []

        # add "closed_at" information if not present yet
        if issue["closed_at"] is None:
            issue["closed_at"] = ""

        # parse the creation time into the correct format
        issue["created_at"] = format_time(issue["created_at"])
        # parse the close time into the correct format
        issue["closed_at"] = format_time(issue["closed_at"])

        # check whether the issue is a pull request or a normal issue and adapt the type
        if issue["isPullRequest"]:
            issue["type"].append("pull request")
        else:
            issue["type"].append("issue")

    return issue_data
def execute(commit, my_schema, ix, include_filepath):
    result = []
    with ix.searcher() as searcher:
        # initialize query parser
        query_parser = QueryParser("content", schema=my_schema)

        # construct query
        if include_filepath:
            my_query = query_parser.parse(commit[0] + " AND " + commit[1])
        else:
            my_query = query_parser.parse(commit[1])

        # search!
        query_result = searcher.search(my_query, terms=True)

        # construct result from query answer, stripping the surrounding quotes
        for r in query_result:
            result_tuple = (commit[0][1:-1], commit[1][1:-1], r["messageID"])
            result.append(result_tuple)

    log.devinfo("Artifact " + commit[0][1:-1] + ", " + commit[1][1:-1] + " done!")
    return result
def load_csv(source_folder):
    """
    Load persons from disk.

    :param source_folder: the folder where to find the .csv-file
    :return: the loaded person data
    """

    def find_first_existing(source_folder, filenames):
        """
        Check if any of the given file names exist in the given folder
        and return the first existing one.

        :param source_folder: the folder where to search for the given file names
        :param filenames: the file names to search for
        :return: the first existing file name, None otherwise
        """

        filenames = map(lambda fi: os.path.join(source_folder, fi), filenames)
        existing = map(lambda fi: os.path.exists(fi), filenames)
        first = next((i for (i, x) in enumerate(existing) if x), None)

        if first is not None:
            return filenames[first]
        else:
            return None

    person_files = ("jira-comment-authors-with-email.csv", "jira_issue_comments.csv")
    srcfile = find_first_existing(source_folder, person_files)

    # check if file exists and exit early if not
    if not srcfile:
        log.error("Person files '{}' do not exist! Exiting early...".format(person_files))
        sys.exit(-1)

    log.devinfo("Loading person csv from file '{}'...".format(srcfile))

    with open(srcfile, 'r') as f:
        person_data = csv.DictReader(f, delimiter=',', skipinitialspace=True)
        persons = {}
        for row in person_data:
            if not row['AuthorID'] in persons.keys():
                persons[row['AuthorID']] = (row['AuthorName'], row['userEmail'])

    return persons
def load_xml(source_folder, xml_file):
    """
    Load issues from disk.

    :param source_folder: the folder the .xml-file is in
    :param xml_file: the given xml-file
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, xml_file)
    log.devinfo("Loading issues from file '{}'...".format(srcfile))

    try:
        # parse the xml-file
        issue_data = parse(srcfile)
        return issue_data
    except Exception as e:
        log.info("Issue file '{}' could not be opened because of a {}".format(
            srcfile, e.__class__.__name__))
        return None
def load(source_folder):
    """
    Load issues from disk.

    :param source_folder: the folder where to find 'issues.json'
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, "issues.json")
    log.devinfo("Loading Github issues from file '{}'...".format(srcfile))

    # check if file exists and exit early if not
    if not os.path.exists(srcfile):
        log.error("Github issue file '{}' does not exist! Exiting early...".format(srcfile))
        sys.exit(-1)

    with open(srcfile) as issues_file:
        issue_data = json.load(issues_file)

    return issue_data
def get_user_from_id(idx, buffer_db=user_buffer):
    # check whether user information is in buffer to reduce amount of DB queries
    if idx in buffer_db:
        log.devinfo("Returning user '{}' from buffer.".format(idx))
        return buffer_db[idx]

    # get person information from ID service
    log.devinfo("Passing user id '{}' to ID service.".format(idx))
    person = idservice.getPersonFromDB(idx)

    user = dict()
    user["email"] = person["email1"]  # column "email1"
    user["name"] = person["name"]  # column "name"
    user["id"] = person["id"]  # column "id"

    # add user information to buffer
    buffer_db[idx] = user

    return user
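# A minimal usage sketch (an assumption, not part of the original module):
# resolve a raw user dict to its canonical DB record by chaining
# get_id_and_update_user and get_user_from_id, so that both buffers get filled.
def __resolve_user_example(user):
    idx = get_id_and_update_user(user)
    return get_user_from_id(idx)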
def getbody(message):
    body = None
    if message.is_multipart():
        for part in message.walk():
            if part.is_multipart():
                for subpart in part.walk():
                    if 'text/' in subpart.get_content_type():
                        body = subpart.get_payload(decode=True)
            elif 'text/' in part.get_content_type():
                body = part.get_payload(decode=True)
    elif 'text/' in message.get_content_type():
        body = message.get_payload(decode=True)

    if body is None:
        log.devinfo(message.get_content_type())
        log.devinfo("An image or some other content that cannot be indexed has been found. "
                    "Message is given an empty body.")
        body = ' '

    return unicode(body, errors="replace")
def __get_index(mbox, mbox_path, results_folder, schema, reindex):
    """
    Initialize the search index (and create it, if needed).

    :param mbox: the mbox object to create the index for
    :param mbox_path: the path to the mbox object on disk
    :param results_folder: the folder to create the index folder in
    :param schema: the schema for the to-be-created index
    :param reindex: force reindexing if True
    :return: the opened index object
    """

    # create or load index:
    # 0) construct index path
    index_path = os.path.join(results_folder, "mbox-index", os.path.basename(mbox_path))

    # 1) if reindexing, remove the index folder
    if os.path.exists(index_path) and reindex:
        log.devinfo("Removing index from path '{}'...".format(index_path))
        shutil.rmtree(index_path)

    # 2) check if we need to create the index for the Whoosh full-text search
    log.devinfo("Checking for index in results folder...")
    if (not os.path.exists(index_path)) or (not index.exists_in(index_path)):
        # 2.1) create index
        log.devinfo("Creating index for text search in results folder.")
        os.makedirs(index_path)  # create path
        index.create_in(index_path, schema)  # initialize as index path
        ix = index.open_dir(index_path)  # open as index path
        writer = ix.writer()

        # add all messages to index
        for message in mbox:
            writer.add_document(messageID=unicode(message['message-id']),
                                content=__mbox_getbody(message))
        writer.commit()
        log.devinfo("Index created, parsing will begin now.")
    else:
        # 2.2) load index
        log.devinfo("Index has already been created, parsing will begin right away.")
        ix = index.open_dir(index_path)

    return ix
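# A minimal usage sketch (an assumption, not part of the original module):
# open (or build) the index for an mbox file and run a single query against
# it, assuming the whoosh imports used elsewhere in this module. All names
# except __get_index are hypothetical.
def __search_example(mbox_path, results_folder, schema, term):
    mbox = mailbox.mbox(mbox_path)
    ix = __get_index(mbox, mbox_path, results_folder, schema, reindex=False)
    with ix.searcher() as searcher:
        query = QueryParser("content", schema=schema).parse(term)
        return [hit["messageID"] for hit in searcher.search(query)]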