def generate_date_to_top_commenters(project_name, sws):
    """
    Generate a mapping from date to the top commenters and their numbers of
    comments in the sliding window ending on that date. Large change sets are
    not excluded because the comments made on the issues related to the large
    change sets still exist.

    Parameters
    ----------
    project_name (str): Name of the project.

    sws (int): Sliding window size, in other words the number of days to
        include in the graph.

    Returns
    -------
    dict: Mapping from date to top commenters and their numbers of comments
        in the sliding window ending on that date.
    """
    issue_to_commenters = generate_issue_to_commenters(project_name)

    data_manager = DataManager(get_dataset_path(project_name), sws)

    # Get initial change sets to add and remove
    change_sets_add = data_manager.get_initial_window()
    change_sets_remove = {}

    top_commenters = defaultdict(lambda: 0)
    date_to_top_commenters = {}
    while True:
        # Add change sets entering the window
        for change_set in change_sets_add:
            for issue_id in change_set.issues:
                for commenter in issue_to_commenters.get(issue_id, []):
                    top_commenters[commenter] += 1

        # Remove change sets leaving the window
        for change_set in change_sets_remove:
            for issue_id in change_set.issues:
                for commenter in issue_to_commenters.get(issue_id, []):
                    top_commenters[commenter] -= 1
                    if top_commenters[commenter] <= 0:
                        del top_commenters[commenter]

        date = data_manager.get_last_included_date()
        date_to_top_commenters[date] = sort_dict(
            top_commenters, by_value=True, reverse=True
        )

        try:
            change_sets_add, change_sets_remove = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_commenters
def generate_date_to_top_committers(project_name, sws):
    """
    Generate a mapping from date to the top committers and their numbers of
    commits in the sliding window ending on that date.
    TODO: Large change sets can be excluded.

    Parameters
    ----------
    project_name (str): Name of the project.

    sws (int): Sliding window size.

    Returns
    -------
    dict: Mapping from date to top committers and their numbers of commits
        in the sliding window ending on that date.
    """
    data_manager = DataManager(get_dataset_path(project_name), sws)

    # Get initial change sets to add and remove
    change_sets_add = data_manager.get_initial_window()
    change_sets_remove = {}

    top_committers = defaultdict(lambda: 0)
    date_to_top_committers = {}
    while True:
        # Add change sets entering the window
        for change_set in change_sets_add:
            top_committers[change_set.author] += 1

        # Remove change sets leaving the window
        for change_set in change_sets_remove:
            author = change_set.author
            top_committers[author] -= 1
            if top_committers[author] <= 0:
                del top_committers[author]

        date = data_manager.get_last_included_date()
        date_to_top_committers[date] = sort_dict(
            top_committers, by_value=True, reverse=True
        )

        try:
            change_sets_add, change_sets_remove = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_committers
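# Both functions above rely on a `sort_dict` helper to order commenters/committers
# by their counts. Its implementation is not shown in this section; the following
# is a minimal sketch of its assumed semantics (sort a dict by value or by key and
# return a new dict), not necessarily the project's actual implementation.
def sort_dict(d, by_value=False, reverse=False):
    """Return a new dict sorted by value (if `by_value`) or by key."""
    key_func = (lambda item: item[1]) if by_value else (lambda item: item[0])
    return dict(sorted(d.items(), key=key_func, reverse=reverse))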
def launch_training_docker_task(base_image_name, result_image_name, classifier_id,
                                train_set_name, epoch, weights=None):
    if weights is None:
        weights = '/VGG_CNN_M_1024.v2.caffemodel'
    task_id = str(classifier_id) + '-' + str(random.getrandbits(32))
    print 'classifier id %s, train set %s, epoch %s, weight %s ' % (
        str(classifier_id), str(train_set_name), str(epoch), str(weights))
    dataset_path = util.get_dataset_path()
    train_task.apply_async(
        (base_image_name, result_image_name, dataset_path, classifier_id,
         train_set_name, epoch, weights),
        task_id=task_id)
    return task_id
def dataset_details_after_preprocess():
    """
    Generate statistics after dataset preprocessing for each project.
    """
    print("\n*** Dataset Details After Preprocessing ***\n")
    print("Project        # Developers   # CS     # CS > 10     # CS > 50")
    for project_name in project_list:
        dataset_path = get_dataset_path(project_name)
        dm = DataManager(dataset_path, None)

        developers = set()
        nums_cs = 0
        nums_cs_10 = 0
        nums_cs_50 = 0

        add_or_modify = set(["MODIFY", "ADD"])
        for date, change_sets in dm._date_to_change_sets.items():
            for cs in change_sets:
                developers.add(cs.author)
                files_add_modify = []
                for cc in cs.code_changes:
                    if cc.change_type in add_or_modify:
                        files_add_modify.append(cc.file_path)

                # Increase counters
                nums_cs += 1
                if len(files_add_modify) > 10:
                    nums_cs_10 += 1
                if len(files_add_modify) > 50:
                    nums_cs_50 += 1

        print("{:<15}{}\t\t{}\t{:>5}({:.2f})\t{:>5}({:.2f})".format(
            project_name,
            len(developers),
            nums_cs,
            nums_cs_10,
            100 * nums_cs_10 / nums_cs,
            nums_cs_50,
            100 * nums_cs_50 / nums_cs,
        ))
    print()
def leaving_developers_table():
    """
    Generate the number of leaving developers for each project.
    """
    print("\n*** Number of Leaving Developers ***\n")
    print("Absence Limit  ",
          ("{:<15}" * len(project_list)).format(*project_list))
    for absence_limit in sws_list:
        print("{:<15}".format(absence_limit), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=absence_limit)
            date_to_leaving_developers = find_leaving_developers(G)
            leaving_developers = [
                dev
                for devs in date_to_leaving_developers.values()
                for dev in devs
            ]
            print("{:<15}".format(len(leaving_developers)), end="")
        print()
    print()
def average_number_of_developers():
    """
    Generate the average number of developers in the graph for each project.
    """
    print("\n*** Average Number of Developers ***\n")
    print("{:<15}".format("SWS"), end="")
    print(("{:<15}" * len(project_list)).format(*project_list))
    for sws in sws_list:
        print("{:<15}".format(sws), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=sws)

            dev_nums = []
            while True:
                devs = G.get_developers()
                dev_nums.append(len(devs))
                if not G.forward_graph_one_day():
                    break
            avg_dev_num = sum(dev_nums) / len(dev_nums)

            print("{:<15.2f}".format(avg_dev_num), end="")
        print()
    print()
def create_evaluation(classifier_id, name, video_list):
    classifier = Classifier.query.filter(
        Classifier.id == classifier_id).first()
    if not classifier:
        print 'classifier does not exist'
        return None
    label_list = classifier.labels.split(',')
    evaluation_set = EvaluationSet(name=name)
    for video_id in video_list:
        video = Video.query.filter(Video.id == video_id).first()
        evaluation_set.videos.append(video)
    classifier.evaluation_sets.append(evaluation_set)
    db.session.add(evaluation_set)
    db.session.commit()
    print 'created evaluation set with name %s id %s ' % (
        str(name), str(evaluation_set.id))

    evaluation_result_name = str(evaluation_set.id)
    base_image_name = util.get_classifier_image_name(classifier.name,
                                                     classifier.id)
    # Prepare label data
    image_list_file_path, label_list_file_path, label_name_file_path = \
        turkic_replacement.dump_image_and_label_files(
            video_list, label_list, remove_none_frame=True)
    # Note: this evaluation set name is the unique name used as the file name for
    # the label and video lists, not the name of the data entity.
    dataset_path = util.get_dataset_path()
    eval_path = util.get_eval_path()
    evaluation_set_name = os.path.splitext(
        ntpath.basename(str(image_list_file_path)))[0]
    evaluation_task.apply_async(
        (dataset_path, eval_path, classifier_id, base_image_name,
         evaluation_set_name, evaluation_result_name))
def validation(project_name, sliding_window_size, check_days, max_k, random_val):
    """
    Perform validation with the given parameters.

    Parameters
    ----------
    project_name (str): Name of the project to read change sets.

    sliding_window_size (int): Number of days to include in the graph.

    check_days (list): List of integers to check if recommendations are
        true or false.

    max_k (int): Maximum k for top-k and MRR calculations. When max_k is 3,
        top1, top2 and top3 will be calculated, and the ranks in MRR
        calculations can be 1, 2 and 3.

    random_val (bool): If True, `max_k` replacements will be selected randomly.

    Returns
    -------
    list: First item of the list is the name of the experiment. Second and the
        following items include accuracy and MRR for each check day. For
        example, returns [pig_sws365, (7, {top1: .5, top2: .7, mrr: .6}),
        (30, {top1: .6, top2: .9, mrr: .7})].
    """
    dataset_path = get_dataset_path(project_name)
    exp_name = get_exp_name(project_name, sws=sliding_window_size)

    dm = DataManager(dataset_path, None)  # No need for sliding window size
    G = HistoryGraph(dataset_path, sliding_window_size)

    check_day_to_ranks = {check_day: [] for check_day in check_days}

    date_to_results = load_results(exp_name)
    for date, results in date_to_results.items():
        if not results["replacements"]:  # No leaving developer
            continue

        G.forward_until(date)  # Update graph

        for leaving_dev, recommended_devs in results["replacements"].items():
            if not recommended_devs:  # No recommended developers
                continue

            if random_val:  # Randomly select "max_k" developers
                other_devs = results["developers"]
                other_devs.remove(leaving_dev)
                recommended_devs = random.sample(other_devs, max_k)
            else:  # Convert dictionary keys to list and get first "max_k" items
                recommended_devs = list(recommended_devs)[:max_k]

            leaving_dev_files = set(G.find_reachable_files(leaving_dev))

            for check_day in check_days:
                # Get the change sets in the next days
                # For example, get the change sets in the next 7 days if check day is 7
                change_sets = dm.get_specific_window(
                    date + timedelta(days=1), date + timedelta(days=check_day))

                rank = float("inf")  # Not found yet
                for i, recommended_dev in enumerate(recommended_devs):
                    recommended_dev_files = set(
                        G.find_reachable_files(recommended_dev))
                    # Find the files that the leaving developer can reach but the
                    # recommended developer cannot reach
                    target_files = leaving_dev_files - recommended_dev_files

                    if check_modification(change_sets, recommended_dev,
                                          target_files):
                        rank = i + 1
                        break  # No need to check other developers
                check_day_to_ranks[check_day].append(rank)

    ret_items = [exp_name]
    for check_day in check_days:
        res = {}
        for k in range(1, max_k + 1):
            res["top{}".format(k)] = cal_accuracy(
                check_day_to_ranks[check_day], k)
        res["mrr"] = cal_mrr(check_day_to_ranks[check_day])
        ret_items.append((check_day, res))
    return ret_items
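# `cal_accuracy` and `cal_mrr` are referenced above but not defined in this
# section. The following is a minimal sketch of their assumed semantics (top-k
# accuracy and mean reciprocal rank over a list of ranks, where a rank of
# float("inf") means no correct recommendation was found), not necessarily the
# project's actual implementation.
def cal_accuracy(ranks, k):
    """Fraction of cases whose rank is within the first k recommendations."""
    if not ranks:
        return 0.0
    return sum(1 for rank in ranks if rank <= k) / len(ranks)


def cal_mrr(ranks):
    """Mean reciprocal rank; a rank of float("inf") contributes 0."""
    if not ranks:
        return 0.0
    return sum(1 / rank for rank in ranks) / len(ranks)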
date_to_results[date]["balanced_or_hero_{}".format( alpha)] = G.balanced_or_hero(alpha=alpha) print_log("{} -> {} nodes\n".format(step, G.get_num_nodes()), log_path) if not G.forward_graph_one_day(): break end = datetime.now() print_log("Ended.(Time taken: {})\n".format(end - start), log_path) dump_results(experiment_name, date_to_results) print_log("Exported results to 'results/{}.pkl'".format(experiment_name), log_path) if __name__ == "__main__": # dl -> distance limit (using default (10) in graph.HistoryGraph) # nfl -> number of files limit (using default (50) in graph.HistoryGraph) # sws -> sliding window size is experiments = [] for project_name in project_list: dataset_path = get_dataset_path(project_name) for sws in sws_list: exp_name = get_exp_name(project_name, sws=sws) experiments.append((exp_name, dataset_path, sws)) # Run all in parallel using all CPUs. Parallel(n_jobs=-1, verbose=10)(run_experiment(*params) for params in experiments)
"tom tsuruhara": "tomu tsuruhara", "unknown": "rsvasude", "xiaowen147": "yun zhao", "vasudevan": "ramkrishna", "yi deng": "david deng", } derby_author_mapping = {} zookeeper_author_mapping = { "andor molnár": "andor molnar", "flavio junqueira": "flavio paiva junqueira", "fpj": "flavio paiva junqueira", "patrick hunt": "patrick d. hunt", "raúl gutiérrez segalés": "raul gutierrez segales", "raúl gutierrez s": "raul gutierrez segales", "robert evans": "robert (bobby) evans", } if __name__ == "__main__": for project in project_list: author_mapping = eval("{}_author_mapping".format(project)) # First, extract commits and generate a JSON formatted string. text = extract_change_sets(project, author_mapping) # Dump the extracted JSON formatted text dataset_path = get_dataset_path(project) with open(dataset_path, "w", encoding="utf8") as f: f.write(text)
def scalability_experiment_rq2(project_name):
    """
    First, create a graph (with default parameters) and find the leaving
    developers, then find replacements for those leaving developers. At the
    same time, keep some statistics and return them.

    Parameters
    ----------
    project_name (str): Name of the project.

    Returns
    -------
    tuple: Tuple of experiment name, node statistics, average number of nodes,
        edge statistics, average number of edges, average time taken and total
        number of recommended replacements.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)
    G = HistoryGraph(dataset_path)

    date_to_leaving_developers = find_leaving_developers(G)

    # Start iterations
    num_leaving_developers = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    for date, leaving_developers in date_to_leaving_developers.items():
        G.forward_until(date)
        for leaving_developer in leaving_developers:
            num_leaving_developers += 1

            for node_type in node_stat:
                node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))
            for edge_type in edge_stat:
                edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))
            total_num_nodes += G.get_num_nodes()
            total_num_edges += G.get_num_edges()

            t_start = perf_counter()
            G.find_replacement(leaving_developer)
            t_end = perf_counter()
            time_taken += t_end - t_start

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] / num_leaving_developers)
    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] / num_leaving_developers)
    avg_num_nodes = round(total_num_nodes / num_leaving_developers)
    avg_num_edges = round(total_num_edges / num_leaving_developers)
    avg_time_taken = time_taken / num_leaving_developers

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_leaving_developers,
    )
def scalability_experiment_rq1_rq3(project_name, method_name):
    """
    First, create a graph (with default parameters) for the initial window,
    then slide the window day by day. While sliding, run the given method
    (`method_name`) for each day, keep some statistics and return them.

    This method is for RQ1 and RQ3 because the given method can't have any
    parameters in this setup.

    Parameters
    ----------
    project_name (str): Name of the project.

    method_name (str): Name of the method to run in the experiment. It has to
        match one of the methods defined in graph.HistoryGraph. Also, the
        given method cannot have any parameters. For example, "get_connectors".

    Returns
    -------
    tuple: Tuple of experiment name, node statistics, average number of nodes,
        edge statistics, average number of edges, average time taken and total
        number of iterations.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)
    G = HistoryGraph(dataset_path)

    # Start iterations
    num_iters = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    while True:
        num_iters += 1

        for node_type in node_stat:
            node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))
        for edge_type in edge_stat:
            edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))
        total_num_nodes += G.get_num_nodes()
        total_num_edges += G.get_num_edges()

        t_start = perf_counter()
        eval("G.{}()".format(method_name))
        t_end = perf_counter()
        time_taken += t_end - t_start

        if not G.forward_graph_one_day():
            break

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] / num_iters)
    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] / num_iters)
    avg_num_nodes = round(total_num_nodes / num_iters)
    avg_num_edges = round(total_num_edges / num_iters)
    avg_time_taken = time_taken / num_iters

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_iters,
    )
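# A minimal usage sketch for the two scalability experiments above, mirroring the
# joblib pattern used for the other experiments in this section. `project_list`
# and the method name "get_connectors" are taken from the surrounding code as
# assumptions; this is an illustrative driver, not the project's actual script.
if __name__ == "__main__":
    from joblib import Parallel, delayed

    rq1_results = Parallel(n_jobs=-1, verbose=10)(
        delayed(scalability_experiment_rq1_rq3)(project_name, "get_connectors")
        for project_name in project_list
    )
    rq2_results = Parallel(n_jobs=-1, verbose=10)(
        delayed(scalability_experiment_rq2)(project_name)
        for project_name in project_list
    )
    for row in rq1_results + rq2_results:
        print(row)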