Example #1
def generate_date_to_top_commenters(project_name, sws):
    """
    Generate a mapping from date to the top commenters in the sliding window
    ending on that date.

    Large change sets are not excluded because the comments made on the issues
    related to the large change sets still exist.

    Parameters
    ----------
    project_name (str):
        Name of the project

    sws (int):
        Sliding window size, in other words, the number of days to include in
        the graph.

    Returns
    --------
    dict:
        Mapping from date to top commenters and their numbers of comments in the sliding
        window ending that date.
    """
    issue_to_commenters = generate_issue_to_commenters(project_name)
    data_manager = DataManager(get_dataset_path(project_name), sws)

    # Get initial change sets to add and remove
    change_sets_add = data_manager.get_initial_window()
    change_sets_remove = []
    top_commenters = defaultdict(int)

    date_to_top_commenters = {}
    while True:
        # Add change sets
        for change_set in change_sets_add:
            for issue_id in change_set.issues:
                for commenter in issue_to_commenters.get(issue_id, []):
                    top_commenters[commenter] += 1

        # Remove change sets
        for change_set in change_sets_remove:
            for issue_id in change_set.issues:
                for commenter in issue_to_commenters.get(issue_id, []):
                    top_commenters[commenter] -= 1
                    if top_commenters[commenter] <= 0:
                        del top_commenters[commenter]

        date = data_manager.get_last_included_date()
        date_to_top_commenters[date] = sort_dict(top_commenters,
                                                 by_value=True,
                                                 reverse=True)

        try:
            change_sets_add, change_sets_remove = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_commenters
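
The add/remove bookkeeping above is a differential sliding-window pattern: rather than recounting the whole window each day, only the change sets entering and leaving the window adjust the counters. A minimal self-contained sketch of the same pattern, with a plain dict of days standing in for DataManager (all names here are illustrative, not part of the project):

from collections import defaultdict

def sliding_top_counts(day_to_items, sws):
    """Count items per sliding window of `sws` consecutive days."""
    days = sorted(day_to_items)
    counts = defaultdict(int)
    result = {}
    for i, day in enumerate(days):
        for item in day_to_items[day]:  # items entering the window
            counts[item] += 1
        if i >= sws:  # items leaving the window
            for item in day_to_items[days[i - sws]]:
                counts[item] -= 1
                if counts[item] <= 0:
                    del counts[item]
        result[day] = dict(
            sorted(counts.items(), key=lambda kv: kv[1], reverse=True))
    return result

print(sliding_top_counts({1: ["a", "b"], 2: ["a"], 3: ["c"]}, sws=2))
# {1: {'a': 1, 'b': 1}, 2: {'a': 2, 'b': 1}, 3: {'a': 1, 'c': 1}}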
Example #2
def generate_date_to_top_committers(project_name, sws):
    """
    Generate a mapping from date to the top committers in the sliding window
    ending on that date.

    TODO:
    Large change sets can be excluded.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    sws (int):
        Sliding window size.

    Returns
    --------
    dict:
        Mapping from date to top committers and their numbers of commits in the sliding
        window ending that date.
    """

    data_manager = DataManager(get_dataset_path(project_name), sws)

    # Get initial change sets to add and remove
    change_sets_add = data_manager.get_initial_window()
    change_sets_remove = []
    top_committers = defaultdict(int)

    date_to_top_committers = {}
    while True:
        # Add change sets
        for change_set in change_sets_add:
            top_committers[change_set.author] += 1

        # Remove change sets
        for change_set in change_sets_remove:
            author = change_set.author
            top_committers[author] -= 1
            if top_committers[author] <= 0:
                del top_committers[author]

        date = data_manager.get_last_included_date()
        date_to_top_committers[date] = sort_dict(
            top_committers, by_value=True, reverse=True
        )

        try:
            change_sets_add, change_sets_remove = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_committers
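
Both sliding-window examples rely on a sort_dict helper that is not shown here. A plausible minimal implementation, assuming it simply returns a new dict ordered by value (or key) using Python 3.7+ insertion order:

def sort_dict(d, by_value=False, reverse=False):
    # Order by value when by_value is set, otherwise by key; relies on
    # the insertion-order guarantee of dicts in Python 3.7+.
    key_func = (lambda item: item[1]) if by_value else (lambda item: item[0])
    return dict(sorted(d.items(), key=key_func, reverse=reverse))

print(sort_dict({"alice": 3, "bob": 7}, by_value=True, reverse=True))
# {'bob': 7, 'alice': 3}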
Example #3
def launch_training_docker_task(base_image_name,
                                result_image_name,
                                classifier_id,
                                train_set_name,
                                epoch,
                                weights=None):
    if weights is None:
        weights = '/VGG_CNN_M_1024.v2.caffemodel'
    task_id = str(classifier_id) + '-' + str(random.getrandbits(32))
    print('classifier id %s, train set %s, epoch %s, weights %s' % (
        str(classifier_id), str(train_set_name), str(epoch), str(weights)))
    dataset_path = util.get_dataset_path()
    train_task.apply_async((base_image_name, result_image_name, dataset_path,
                            classifier_id, train_set_name, epoch, weights),
                           task_id=task_id)
    return task_id
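
train_task.apply_async(...) is the standard Celery idiom for queuing work under an explicit task_id, so the caller can poll the task later by that id. A minimal sketch of how such a task might be declared; the app name, broker URL, and task body are assumptions, not taken from this project:

from celery import Celery

# Hypothetical Celery app; the broker URL is an assumption.
app = Celery("training", broker="redis://localhost:6379/0")

@app.task
def train_task(base_image_name, result_image_name, dataset_path,
               classifier_id, train_set_name, epoch, weights):
    # The real task would launch the training container here.
    pass

result = train_task.apply_async(
    ("base", "result", "/data", 1, "set-a", 10, "weights.caffemodel"),
    task_id="1-12345")
print(result.id)  # "1-12345"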
Example #4
def dataset_details_after_preprocess():
    """
    Generate statistics after dataset preprocessing for each project.
    """

    print("\n*** Dataset Details After Preprocessing ***\n")
    print(
        "Project        # Developers     # CS     # CS > 10        # CS > 50")
    for project_name in project_list:
        dataset_path = get_dataset_path(project_name)
        dm = DataManager(dataset_path, None)
        developers = set()
        nums_cs = 0
        nums_cs_10 = 0
        nums_cs_50 = 0
        add_or_modify = {"MODIFY", "ADD"}
        for date, change_sets in dm._date_to_change_sets.items():
            for cs in change_sets:
                developers.add(cs.author)
                files_add_modify = []
                for cc in cs.code_changes:
                    if cc.change_type in add_or_modify:
                        files_add_modify.append(cc.file_path)

                # Increase counters
                nums_cs += 1
                if len(files_add_modify) > 10:
                    nums_cs_10 += 1
                if len(files_add_modify) > 50:
                    nums_cs_50 += 1

        print("{:<15}{}\t\t{}\t{:>5}({:.2f})\t{:>5}({:.2f})".format(
            project_name,
            len(developers),
            nums_cs,
            nums_cs_10,
            100 * nums_cs_10 / nums_cs,
            nums_cs_50,
            100 * nums_cs_50 / nums_cs,
        ))
    print()
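
The row format string above mixes alignment specifiers: "{:<15}" pads left-aligned to 15 characters, "{:>5}" right-aligns to 5, and "{:.2f}" fixes two decimals. With placeholder values, just to show the alignment:

row = "{:<15}{}\t\t{}\t{:>5}({:.2f})"
print(row.format("demo", 3, 100, 7, 7.0))
# 'demo           3\t\t100\t    7(7.00)'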
Example #5
def leaving_developers_table():
    """
    Generate the number of leaving developers for each project.
    """

    print("\n*** Number of Leaving Developers ***\n")
    print("Absence Limit ",
          ("{:<15}" * len(project_list)).format(*project_list))
    for absence_limit in sws_list:
        print("{:<15}".format(absence_limit), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=absence_limit)
            date_to_leaving_developers = find_leaving_developers(G)
            leaving_developers = [
                dev for devs in date_to_leaving_developers.values()
                for dev in devs
            ]
            print("{:<15}".format(len(leaving_developers)), end="")
        print()
    print()
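
The nested comprehension above flattens the per-date lists of leaving developers into a single list. The same shape in isolation:

date_to_devs = {"2020-01-01": ["a", "b"], "2020-01-02": ["c"]}
flat = [dev for devs in date_to_devs.values() for dev in devs]
print(flat)  # ['a', 'b', 'c']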
Example #6
def average_number_of_developers():
    """
    Generate the average number of developers in the graph for each project.
    """

    print("\n*** Average Number of Developer ***\n")
    print("{:<15}".format("SWS"), end="")
    print(("{:<15}" * len(project_list)).format(*project_list))
    for sws in sws_list:
        print("{:<15}".format(sws), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=sws)
            dev_nums = []
            while True:
                devs = G.get_developers()
                dev_nums.append(len(devs))
                if not G.forward_graph_one_day():
                    break
            avg_dev_num = sum(dev_nums) / len(dev_nums)
            print("{:<15.2f}".format(avg_dev_num), end="")
        print()
    print()
Example #7
def create_evaluation(classifier_id, name, video_list):
    classifier = Classifier.query.filter(
        Classifier.id == classifier_id).first()
    if not classifier:
        print('classifier does not exist')
        return None
    label_list = classifier.labels.split(',')
    evaluation_set = EvaluationSet(name=name)

    for video_id in video_list:
        video = Video.query.filter(Video.id == video_id).first()
        evaluation_set.videos.append(video)

    classifier.evaluation_sets.append(evaluation_set)
    db.session.add(evaluation_set)
    db.session.commit()
    print('created evaluation set with name %s id %s' % (
        str(name), str(evaluation_set.id)))
    evaluation_result_name = str(evaluation_set.id)

    base_image_name = util.get_classifier_image_name(classifier.name,
                                                     classifier.id)

    # prepare label data
    image_list_file_path, label_list_file_path, label_name_file_path = turkic_replacement.dump_image_and_label_files(
        video_list, label_list, remove_none_frame=True)

    # Note: this evaluation set name is the unique name identifying the label
    # and video files on disk, not the name of the data entity.

    dataset_path = util.get_dataset_path()
    eval_path = util.get_eval_path()
    evaluation_set_name = os.path.splitext(
        ntpath.basename(str(image_list_file_path)))[0]
    evaluation_task.apply_async(
        (dataset_path, eval_path, classifier_id, base_image_name,
         evaluation_set_name, evaluation_result_name))
Example #8
def validation(project_name, sliding_window_size, check_days, max_k,
               random_val):
    """
    Perform validation with given parameters.

    Parameters
    ----------
    project_name (str):
        Name of the project to read change sets.

    sliding_window_size (int):
        Number of days to include in the graph.

    check_days (list):
        List of integers (day counts) used to check whether recommendations are
        true or false.

    max_k (int):
        Maximum k for top-k and MRR calculations. When max_k is 3, top1, top2 and
        top3 will be calculated, and the ranks in MRR calculations can be 1, 2 and 3.

    random_val (bool):
        If True, `max_k` replacements will be selected randomly.

    Returns
    -------
    list:
        First item of the list is the name of the experiment. Second and the following
        items will include accuracy and MRR for each check day. For example, returns
        [pig_sws365, (7, {top1:.5, top2:.7, mrr:.6}), (30, {top1:.6, top2:.9, mrr:.7})].
    """
    dataset_path = get_dataset_path(project_name)
    exp_name = get_exp_name(project_name, sws=sliding_window_size)

    dm = DataManager(dataset_path, None)  # No need for sliding window size
    G = HistoryGraph(dataset_path, sliding_window_size)

    check_day_to_ranks = {check_day: [] for check_day in check_days}
    date_to_results = load_results(exp_name)
    for date, results in date_to_results.items():
        if not results["replacements"]:  # No leaving developer
            continue

        G.forward_until(date)  # Update graph

        for leaving_dev, recommended_devs in results["replacements"].items():
            if not recommended_devs:  # No recommended developers
                continue

            if random_val:  # Randomly select "max_k" developers
                other_devs = results["developers"]
                other_devs.remove(leaving_dev)
                recommended_devs = random.sample(other_devs, max_k)
            else:  # Convert dictionary keys to list and get first "max_k" items
                recommended_devs = list(recommended_devs)[:max_k]

            leaving_dev_files = set(G.find_reachable_files(leaving_dev))

            for check_day in check_days:
                # Get the change sets in the next days
                # For example, get the change sets in the next 7 days if check day is 7
                change_sets = dm.get_specific_window(
                    date + timedelta(days=1), date + timedelta(days=check_day))
                rank = float("inf")  # Not found yet
                for i, recommended_dev in enumerate(recommended_devs):
                    recommended_dev_files = set(
                        G.find_reachable_files(recommended_dev))

                    # Find the files that the leaving developer can reach but the
                    # recommended developer cannot reach
                    target_files = leaving_dev_files - recommended_dev_files

                    if check_modification(change_sets, recommended_dev,
                                          target_files):
                        rank = i + 1
                        break  # No need to check other developers

                check_day_to_ranks[check_day].append(rank)

    ret_items = [exp_name]

    for check_day in check_days:
        res = {}
        for k in range(1, max_k + 1):
            res["top{}".format(k)] = cal_accuracy(
                check_day_to_ranks[check_day], k)

        res["mrr"] = cal_mrr(check_day_to_ranks[check_day])

        ret_items.append((check_day, res))
    return ret_items
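
cal_accuracy and cal_mrr are not shown, but the docstring pins down their semantics: top-k accuracy is the fraction of ranks at most k, and MRR averages the reciprocal ranks, with float("inf") (not found) contributing zero. A sketch consistent with those assumptions:

def cal_accuracy(ranks, k):
    # Fraction of cases where the correct developer appeared at rank <= k.
    return sum(1 for r in ranks if r <= k) / len(ranks)

def cal_mrr(ranks):
    # Mean reciprocal rank; rank = inf (not found) contributes 1/inf == 0.0.
    return sum(1 / r for r in ranks) / len(ranks)

ranks = [1, 3, float("inf"), 2]
print(cal_accuracy(ranks, 2))  # 0.5
print(cal_mrr(ranks))          # (1 + 1/3 + 0 + 1/2) / 4 ~= 0.458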
Example #9
            date_to_results[date]["balanced_or_hero_{}".format(
                alpha)] = G.balanced_or_hero(alpha=alpha)

        print_log("{} -> {} nodes\n".format(step, G.get_num_nodes()), log_path)

        if not G.forward_graph_one_day():
            break

    end = datetime.now()
    print_log("Ended.(Time taken: {})\n".format(end - start), log_path)

    dump_results(experiment_name, date_to_results)
    print_log("Exported results to 'results/{}.pkl'".format(experiment_name),
              log_path)


if __name__ == "__main__":
    # dl  -> distance limit (using default (10) in graph.HistoryGraph)
    # nfl -> number of files limit (using default (50) in graph.HistoryGraph)
    # sws -> sliding window size
    experiments = []
    for project_name in project_list:
        dataset_path = get_dataset_path(project_name)
        for sws in sws_list:
            exp_name = get_exp_name(project_name, sws=sws)
            experiments.append((exp_name, dataset_path, sws))

    # Run all in parallel using all CPUs. Note that joblib's delayed() is
    # required here; calling run_experiment directly would run the jobs
    # serially while building the argument list.
    Parallel(n_jobs=-1, verbose=10)(
        delayed(run_experiment)(*params) for params in experiments)
Example #10
    "tom tsuruhara": "tomu tsuruhara",
    "unknown": "rsvasude",
    "xiaowen147": "yun zhao",
    "vasudevan": "ramkrishna",
    "yi deng": "david deng",
}

derby_author_mapping = {}

zookeeper_author_mapping = {
    "andor molnár": "andor molnar",
    "flavio junqueira": "flavio paiva junqueira",
    "fpj": "flavio paiva junqueira",
    "patrick hunt": "patrick d. hunt",
    "raúl gutiérrez segalés": "raul gutierrez segales",
    "raúl gutierrez s": "raul gutierrez segales",
    "robert evans": "robert (bobby) evans",
}

if __name__ == "__main__":
    for project in project_list:
        # Look up the project's mapping by name; globals() avoids eval().
        author_mapping = globals()["{}_author_mapping".format(project)]

        # First, extract commits and generate a JSON formatted string.
        text = extract_change_sets(project, author_mapping)

        # Dump the extracted JSON formatted text
        dataset_path = get_dataset_path(project)
        with open(dataset_path, "w", encoding="utf8") as f:
            f.write(text)
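
The per-project mappings above translate author aliases to one canonical name. A hedged sketch of how such a mapping might be applied while extracting change sets; the helper name and fallback behavior are assumptions, not taken from this project:

def normalize_author(raw_name, author_mapping):
    # Collapse known aliases to the canonical name, falling back to the
    # lowercased raw name when no alias is recorded.
    name = raw_name.strip().lower()
    return author_mapping.get(name, name)

mapping = {"fpj": "flavio paiva junqueira"}
print(normalize_author("FPJ", mapping))  # 'flavio paiva junqueira'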
Example #11
def scalability_experiment_rq2(project_name):
    """
    First, find the leaving developers, then create a graph (with default
    parameters) and find replacements for them. At the same time, collect some
    statistics and return them.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes, edge
        statistics, average number of edges, average time taken, and the total
        number of leaving developers.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)

    G = HistoryGraph(dataset_path)
    date_to_leaving_developers = find_leaving_developers(G)

    # Start iterations
    num_leaving_developers = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    for date, leaving_developers in date_to_leaving_developers.items():
        G.forward_until(date)
        for leaving_developer in leaving_developers:
            num_leaving_developers += 1

            for node_type in node_stat:
                node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))

            for edge_type in edge_stat:
                edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))

            total_num_nodes += G.get_num_nodes()
            total_num_edges += G.get_num_edges()

            t_start = perf_counter()
            G.find_replacement(leaving_developer)
            t_end = perf_counter()
            time_taken += t_end - t_start

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] /
                                     num_leaving_developers)

    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] /
                                     num_leaving_developers)

    avg_num_nodes = round(total_num_nodes / num_leaving_developers)
    avg_num_edges = round(total_num_edges / num_leaving_developers)
    avg_time_taken = time_taken / num_leaving_developers

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_leaving_developers,
    )
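
perf_counter accumulates the wall-clock time per replacement query, which is then divided by the number of queries. The averaging idiom in isolation, with a stand-in workload:

from time import perf_counter

total, n = 0.0, 0
for _ in range(3):
    t_start = perf_counter()
    sum(range(10 ** 6))  # stand-in for G.find_replacement(...)
    total += perf_counter() - t_start
    n += 1
print("avg seconds per call:", total / n)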
Example #12
def scalability_experiment_rq1_rq3(project_name, method_name):
    """
    First, create a graph (with default parameters) for the initial window, then slide
    the window day by day. While sliding, run the given method (`method_name`) for each
    day and keep some statistics, and return them.

    This method is for RQ1 and RQ3 because the given method can't have any
    parameters in this setup.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    method_name (str):
        Name of the method to run in the experiment. It has to match one of the
        methods defined in graph.HistoryGraph. Also, the given method cannot have
        any parameters. For example, "get_connectors".

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes, edge
        statistics, average number of edges, average time taken and total number
        of iterations.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)

    G = HistoryGraph(dataset_path)

    # Start iterations
    num_iters = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    while True:
        num_iters += 1
        for node_type in node_stat:
            node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))

        for edge_type in edge_stat:
            edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))

        total_num_nodes += G.get_num_nodes()
        total_num_edges += G.get_num_edges()

        t_start = perf_counter()
        eval("G.{}()".format(method_name))
        t_end = perf_counter()
        time_taken += t_end - t_start

        if not G.forward_graph_one_day():
            break

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] / num_iters)

    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] / num_iters)

    avg_num_nodes = round(total_num_nodes / num_iters)
    avg_num_edges = round(total_num_edges / num_iters)
    avg_time_taken = time_taken / num_iters

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_iters,
    )
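
Dispatching on a method name string, as scalability_experiment_rq1_rq3 does, is safer and faster with getattr than with eval, since no source string has to be parsed and no arbitrary code can sneak in. A minimal illustration:

class Greeter:
    def hello(self):
        return "hello"

g = Greeter()
method_name = "hello"
# getattr resolves the attribute and returns a bound method.
print(getattr(g, method_name)())  # 'hello'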