Exemple #1
0
def check_property_null(repo_path):
    """Scan repo JSON files and collect, per dependency property, the set of
    repositories (``nameWithOwner``) that carry a null value for it.

    :param repo_path: directory containing per-repo GraphQL result ``.json`` files
    :return: dict mapping property name -> set of repo ``nameWithOwner`` strings
    """
    null_property_dic = {
        "hasDependencies": set(),
        "packageManager": set(),
        "packageName": set(),
        "repository": set(),
        "requirements": set(),
    }
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # Renamed from `json` to avoid shadowing the stdlib module.
        json_dic = read_json_file(os.path.join(repo_path, repo_file))
        try:
            manifest_nodes = json_dic["data"]["repository"][
                "dependencyGraphManifests"]["nodes"]
            for manifest_node in manifest_nodes:
                for dependency_node in manifest_node["dependencies"]["nodes"]:
                    for key, val in dependency_node.items():
                        if val is None:
                            null_property_dic[key].add(
                                json_dic["data"]["repository"]["nameWithOwner"])
        except Exception as e:
            # Malformed/incomplete responses are reported but do not abort the scan.
            print(e)
            # BUGFIX: join path components; bare concatenation dropped the
            # separator between directory and file name.
            print("exception at: " + os.path.join(repo_path, repo_file))
    return null_property_dic
Exemple #2
0
 def get_arxivId_paperTitle_dic(self):
     """Return a dict mapping ``arxiv_id`` -> paper ``title`` from the
     papers-with-abstracts dump (later duplicates of an id win)."""
     items = read_json_file(
         r"C:\Disk_Dev\Repository\github-KG\github-KG-python\tx_data\resource\paperswithcode\papers-with-abstracts.json"
     )
     return {item["arxiv_id"]: item["title"] for item in items}
Exemple #3
0
def load_repo_dir_jsons(repo_dir_path):
    """Load every ``.json`` file in *repo_dir_path*.

    :param repo_dir_path: directory containing repo JSON files
    :return: OrderedDict mapping file name -> parsed content, in
        ``os.listdir`` order
    """
    res = OrderedDict()
    for repo_file in os.listdir(repo_dir_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # os.path.join instead of "/" concatenation, for portability and
        # consistency with the other loaders in this file.
        res[repo_file] = read_json_file(os.path.join(repo_dir_path, repo_file))
    return res
Exemple #4
0
def load_pwc_json(dir_path):
    """Load the paperswithcode dump files found in *dir_path*.

    ``datasets.json`` is optional; its entry is ``None`` when the file is
    absent.  All other files must exist.
    """
    def _load(file_name):
        # Small local helper: read one dump file from dir_path.
        return read_json_file(os.path.join(dir_path, file_name))

    evaluation_tables = _load('evaluation-tables.json')
    links_between_papers_and_code = _load('links-between-papers-and-code.json')
    methods = _load('methods.json')
    papers_with_abstracts = _load('papers-with-abstracts.json')
    datasets = None
    if os.path.exists(os.path.join(dir_path, 'datasets.json')):
        datasets = _load('datasets.json')
    return {
        "evaluation-tables": evaluation_tables,
        "links_between_papers_and_code": links_between_papers_and_code,
        "methods": methods,
        "papers_with_abstracts": papers_with_abstracts,
        'datasets': datasets
    }
Exemple #5
0
def get_exist_repo_list_by_info(repo_dir_path):
    """Collect the ``nameWithOwner`` of every repo info file in *repo_dir_path*.

    NOTE(review): ``.md`` files are read as JSON too — presumably a dump
    naming artifact; confirm against the data directory.

    :return: list of ``nameWithOwner`` strings in ``os.listdir`` order
    """
    res_list = []
    for repo_file in os.listdir(repo_dir_path):
        ext = os.path.splitext(repo_file)[1]
        if ext not in (".json", ".md"):
            continue
        json_dic = read_json_file(os.path.join(repo_dir_path, repo_file))
        matches = jsonpath.jsonpath(json_dic, "$.data.repository.nameWithOwner")
        # BUGFIX: jsonpath returns False on no match; indexing False with [0]
        # raised TypeError. Skip files without the expected key instead.
        if matches is False:
            continue
        res_list.append(matches[0])
    return res_list
Exemple #6
0
def split_portrait_train_test_by_count(kwargs):
    """Split each payload repo's portrait (per-entity object lists) into a
    train and a test part, taking a fixed number of objects for training.

    :param kwargs: dict expected to hold ``to_reco_csv_file``,
        ``repo_portrait_json_file``, ``total_entities``, ``payload_entities``,
        ``init_input_count`` and ``split_seed``.
    :return: dict ``{nameWithOwner: {'train': {entity: [record dicts]},
        'test': {entity: [keys]}}}``
    """
    # Collect the users (repos) to recommend for from the first two CSV columns.
    df_to_reco_csv = pd.read_csv(kwargs.get('to_reco_csv_file'))
    relation_row_list = df_to_reco_csv.iloc[:, 0:2].values.tolist()
    payload_repo_dic = defaultdict(list)
    for row in relation_row_list:
        payload_repo_dic[row[0]].append(row[1])
    payload_repo_set = payload_repo_dic.keys()
    # Load the repo portrait info (used for the experiment).
    total_repo_portrait_json_dic = read_json_file(
        kwargs.get('repo_portrait_json_file'))
    # Keep both train and test sets so metrics can be computed later.
    repo_portrait_train_test_dic = defaultdict(dict)
    for i, nameWithOwner in enumerate(payload_repo_set):
        # Split train/test with the fixed-count strategy.
        nameWithOwner_portrait_info = total_repo_portrait_json_dic[
            nameWithOwner]
        repo_portrait_train_test_dic[nameWithOwner] = defaultdict(dict)
        repo_portrait_train_test_dic[nameWithOwner]['train'] = defaultdict(
            list)
        for entity in kwargs.get('total_entities'):
            # Entities missing from the portrait default to an empty list.
            entity_list = nameWithOwner_portrait_info.get(entity, [])
            # Explicitly initialise key -> [] so empty entities still appear;
            # relying on defaultdict(list) alone would not create the key.
            repo_portrait_train_test_dic[nameWithOwner]['train'][entity] = []
            repo_portrait_train_test_dic[nameWithOwner]['test'][entity] = []
            # Entities not used for training go entirely into the test set.
            if entity not in kwargs.get('payload_entities'):
                repo_portrait_train_test_dic[nameWithOwner]['test'][
                    entity] = entity_list
                continue
            # Randomly pick init_input_count objects as the train part;
            # an empty list comes back when there are not enough.
            # entity_list comes from a fixed portrait file, so sorted() is
            # only for determinism and could in principle be dropped.
            train_entity_list, test_entity_list = split_list_by_sub_count(
                sorted(entity_list), kwargs.get('init_input_count'),
                kwargs.get('split_seed'))
            # The test side only needs the plain key list.
            repo_portrait_train_test_dic[nameWithOwner]['test'][
                entity] = test_entity_list
            # The train side keeps dicts so extra attributes (e.g. tf) fit in.
            entity_portrait_dic_list = []
            for entityKey in train_entity_list:
                train_target_dic = {
                    'key': entityKey,
                    'tf': 1 / len(train_entity_list)
                }
                entity_portrait_dic_list.append(train_target_dic)
            repo_portrait_train_test_dic[nameWithOwner]['train'][
                entity] = entity_portrait_dic_list
    # Dump of the train/test split (kept for reference):
    # to_reco_csv_file_name = kwargs.get('to_reco_csv_file').split('\\')[-1:][0]
    # out_dir = os.path.join(kwargs.get('out_dir'), os.path.splitext(to_reco_csv_file_name)[0], 'init_' + str(kwargs.get('init_input_count')))
    # dump_repo_train_test_path = os.path.join(out_dir, 'repo_portrait_init_train_test' + '_' + str(uuid1()) + '.json')
    # write_json_file(out_dir, dump_repo_train_test_path, repo_portrait_train_test_dic)
    return repo_portrait_train_test_dic
Exemple #7
0
def get_data_one_topic_repo(topic_path):
    """Return the set of repo ``full_name``s found in one topic's page files.

    Forked, private and size-0 (empty) repositories are excluded.

    :param topic_path: directory containing GitHub search result page files
    :return: set of ``full_name`` strings
    """
    res = set()
    for page_file in os.listdir(topic_path):
        # CONSISTENCY FIX: skip non-JSON files, matching the extension filter
        # used by get_data_topic_repo_set.
        if os.path.splitext(page_file)[1] != ".json":
            continue
        page_json = read_json_file(os.path.join(topic_path, page_file))
        # Pre-filter: ignore forked, private and empty repositories.
        for item in page_json["items"]:
            if item["size"] == 0 or item["fork"] or item["private"]:
                continue
            res.add(item["full_name"])
    return res
Exemple #8
0
def handle_raw_json():
    """Merge per-language byte sizes from ``languages.edges`` into
    ``languages.nodes`` of a raw repo JSON dump and return the merged
    document serialized as a JSON string.

    FIX: the serialized string was previously computed and then discarded
    (the function ended in a bare ``pass``); it is now returned.
    """
    json_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    language_edges = jsonpath.jsonpath(json_dic,
                                       "$.data.repository.languages.edges[*]")
    language_nodes = jsonpath.jsonpath(json_dic,
                                       "$.data.repository.languages.nodes[*]")
    # edges[i] carries the size for nodes[i]; copy it onto the node.
    for i in range(len(language_nodes)):
        language_nodes[i]["size"] = language_edges[i]["size"]
    json_dic["data"]["repository"]["languages"]["nodes"] = language_nodes
    return json.dumps(json_dic)
Exemple #9
0
def check_errors_repos(repo_path):
    """Return ``{file name: errors payload}`` for every repo JSON file in
    *repo_path* whose GraphQL response contains an ``errors`` entry."""
    result = {}
    for file_name in os.listdir(repo_path):
        if os.path.splitext(file_name)[1] != ".json":
            continue
        content = read_json_file(os.path.join(repo_path, file_name))
        # jsonpath returns False when "$.errors" does not match.
        found_errors = jsonpath.jsonpath(content, "$.errors")
        if found_errors is not False:
            result[file_name] = found_errors
    return result
Exemple #10
0
def get_needless_repo_set(repo_dir, raw_file_list, min_dependency_count):
    """Identify repo files that should be dropped from the data set.

    1. Files whose repo ``nameWithOwner`` was already seen — identical content
       under a different file name; only one copy is kept.
    2. Repos without any dependency — isolated nodes, useless for
       recommendation.
    3. Repos with fewer than *min_dependency_count* distinct packages.

    :param repo_dir: directory containing the repo JSON files
    :param raw_file_list: file names to inspect (non-``.json`` are skipped)
    :param min_dependency_count: minimum number of distinct packages required
    :return: ``(dup_set, no_dependency_set, less_dependency_set)`` of file names
    """
    seen_nwo_set = set()
    dup_set = set()
    no_dependency_set = set()
    less_dependency_set = set()
    for repo_file in raw_file_list:
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_dir, repo_file))
        nameWithOwner = jsonpath.jsonpath(
            json_dic, "$.data.repository.nameWithOwner")[0]
        # Later files for an already-seen repo are duplicates.
        if nameWithOwner in seen_nwo_set:
            dup_set.add(repo_file)
            continue
        seen_nwo_set.add(nameWithOwner)
        # Filter repos without any dependency manifest.
        dgm_count = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        if dgm_count == 0:
            no_dependency_set.add(repo_file)
            continue
        packageName_list = jsonpath.jsonpath(
            json_dic, "$.data.repository.dependencyGraphManifests.nodes["
            "*].dependencies.nodes[*].packageName")
        # jsonpath returns False when no package matches at all;
        # `is False` replaces the non-idiomatic `== False`.
        if packageName_list is False:
            no_dependency_set.add(repo_file)
            continue
        # Filter repos with too few distinct dependencies.
        if len(set(packageName_list)) < min_dependency_count:
            less_dependency_set.add(repo_file)
    return dup_set, no_dependency_set, less_dependency_set
Exemple #11
0
def get_file_name_not_match_nameWithOwner(repo_dir):
    """Find repo JSON files whose file name does not match the
    ``nameWithOwner`` stored inside them.

    The canonical file name is ``nameWithOwner.replace('/', '-$-') + '.json'``.

    :param repo_dir: directory containing repo JSON files
    :return: list of ``(actual_file_name, expected_file_name)`` tuples
    """
    res = []
    for repo_file in os.listdir(repo_dir):
        if os.path.splitext(repo_file)[1] != '.json':
            continue
        json_dic = read_json_file(os.path.join(repo_dir, repo_file))
        nameWithOwner = jsonpath.jsonpath(json_dic,
                                          '$.data.repository.nameWithOwner')[0]
        should_file_name = nameWithOwner.replace('/', '-$-') + '.json'
        if should_file_name != repo_file:
            res.append((repo_file, should_file_name))
    return res
Exemple #12
0
def get_data_topic_repo_set(topic_repo_path):
    """Collect repo ``full_name``s across all topic directories.

    Forked, private and size-0 (empty) repositories are excluded.

    :param topic_repo_path: directory whose sub-directories each hold the
        JSON result pages of one topic search
    :return: set of ``full_name`` strings
    """
    # Build the set directly instead of accumulating a list (with duplicates)
    # and converting at the end.
    res_set = set()
    for topic_dir in os.listdir(topic_repo_path):
        for page_file in os.listdir(os.path.join(topic_repo_path, topic_dir)):
            if os.path.splitext(page_file)[1] != ".json":
                continue
            page_json = read_json_file(
                os.path.join(topic_repo_path, topic_dir, page_file))
            # Pre-filter: ignore forked, private and empty repositories.
            for item in page_json["items"]:
                if item["size"] == 0 or item["fork"] or item["private"]:
                    continue
                res_set.add(item["full_name"])
    return res_set
Exemple #13
0
def construct_label_data(recommender_result_json_file_dic, out_csv):
    """

    :param out_csv:
    :param recommender_result_json_file_dic: key:推荐器名称 val:结果文件
    :return:
    """
    recommender_result_json_dic = {}
    for recommender in recommender_result_json_file_dic.keys():
        recommender_result_json_dic[recommender] = read_json_file(
            recommender_result_json_file_dic[recommender])
    # 这个字典需要增量更新
    repo_package_row_dic = defaultdict(dict)
    for recommender, reco_result_json_dic in recommender_result_json_dic.items(
    ):
        # for repo_i, record_list in reco_result_json_dic.items():
        for repo_i, record_dic in reco_result_json_dic.items():
            # for record in record_list:
            for record in record_dic['package']:
                # package_j = record['nameWithManager']
                package_j = record['key']
                if repo_package_row_dic[repo_i].get(package_j) is None:
                    repo_package_row_dic[repo_i][package_j] = defaultdict(dict)
                    # 默认所有 entity 的 score为 ?
                    for key in recommender_result_json_file_dic.keys():
                        repo_package_row_dic[repo_i][package_j][
                            key] = np.NAN  # '?'
                repo_package_row_dic[repo_i][package_j][recommender] = record[
                    'score']
                if record.get('repoDegree') is None:
                    print(1)
                repo_package_row_dic[repo_i][package_j]['repoDegree'] = record[
                    'repoDegree']
                repo_package_row_dic[repo_i][package_j]['hit'] = record['hit']
    # 把嵌套字典展平
    res_row_list = []
    for repo_i, val1 in repo_package_row_dic.items():
        for package_j, val2 in val1.items():
            # repo 和 package 列
            res_row_dic = {'repo_i': repo_i, 'package_j': package_j}
            # 其他属性列
            for key in val2.keys():
                res_row_dic[key] = val2[key]
            res_row_list.append(res_row_dic)
    pd.DataFrame(res_row_list).to_csv(out_csv, index=False)
    pass
Exemple #14
0
def stat_dependencyGraphManifests(repo_path):
    """Count how many repo JSON files in *repo_path* have at least one
    dependency graph manifest (``has_dfm``) versus none (``no_dfm``)."""
    counts = {
        "has_dfm": 0,
        "no_dfm": 0,
    }
    for file_name in os.listdir(repo_path):
        if os.path.splitext(file_name)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_path, file_name))
        total_count = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        bucket = "no_dfm" if total_count == 0 else "has_dfm"
        counts[bucket] += 1
    return counts
Exemple #15
0
def repo_property():
    """Exploratory helper: extract package, repository and topic properties
    from a sample raw repo JSON dump.

    FIX: the extracted collections were previously computed and discarded
    (the function implicitly returned None); they are now returned.

    :return: dict with ``packageName_set``, ``repository_set`` and ``topic_list``
    """
    json_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    packageName_list = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*].packageName"
    )
    # NOTE: the tensorflow sample contains two '"repository": null' entries,
    # so repository_list can be shorter than packageName_list.
    repository_list = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*].repository.nameWithOwner"
    )
    topic_list = jsonpath.jsonpath(
        json_dic, "$.data.repository.repositoryTopics.nodes[*].topic.name")
    return {
        "packageName_set": set(packageName_list),
        "repository_set": set(repository_list),
        "topic_list": topic_list,
    }
Exemple #16
0
def stat_dependency_count_dic(repo_dir, nameWithOwner, train_package_set):
    """Count, for one repo, how often each training-set package is depended on.

    :param repo_dir: directory containing repo JSON files
    :param nameWithOwner: repo identifier, e.g. ``"owner/name"``
    :param train_package_set: set of whitespace-free ``"manager/packageName"``
        keys that form the training set
    :return: ``(depended_count_dic, train_package_depended_count)`` — a dict
        mapping package key -> count, and the total number of hits
    """
    repo_file = nameWithOwner.replace('/', '-$-') + '.json'
    file_path = os.path.join(repo_dir, repo_file)
    json_dic = read_json_file(file_path)
    ground_truth_dependency_nodes_list = jsonpath.jsonpath(
        json_dic,
        "$.data.repository.dependencyGraphManifests.nodes[*].dependencies.nodes[*]"
    )
    depended_count_dic = defaultdict(int)
    train_package_depended_count = 0
    # Hoisted out of the loop: the pattern was previously re-compiled per node.
    whitespace_re = re.compile(r'\s+')
    for node in ground_truth_dependency_nodes_list:
        # Normalise "manager/packageName" by stripping all whitespace.
        nameWithManager = whitespace_re.sub(
            '', node.get("packageManager") + "/" + node.get("packageName"))
        if nameWithManager in train_package_set:
            train_package_depended_count += 1
            depended_count_dic[nameWithManager] += 1
    return depended_count_dic, train_package_depended_count
Exemple #17
0
def check_multi_page_100(repo_path):
    """Find repos whose GraphQL collections exceed the 100-item page limit.

    :param repo_path: directory containing repo JSON files
    :return: dict of four sets of repo names — repos with more than 100
        dependency manifests, more than 100 dependencies in one manifest,
        more than 100 languages, or more than 100 topics
    """
    over_100_dic = {
        "dependencyGraphManifests_count": set(),
        "max_dependency_count": set(),
        "language_count": set(),
        "topic_count": set()
    }
    for repo_file in os.listdir(repo_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        # Renamed from `json` to avoid shadowing the stdlib module.
        json_dic = read_json_file(os.path.join(repo_path, repo_file))
        try:
            repo = json_dic["data"]["repository"]
            repoName = repo["nameWithOwner"]
            # Files for empty/fork/locked/private repos should not be present.
            exclude = (repo["isEmpty"] or repo["isFork"] or repo["isLocked"]
                       or repo["isPrivate"])
            if exclude:
                # BUGFIX: join path components; bare concatenation dropped
                # the separator between directory and file name.
                print("should be excluded: " +
                      os.path.join(repo_path, repo_file))
            if repo["dependencyGraphManifests"]["totalCount"] > 100:
                over_100_dic["dependencyGraphManifests_count"].add(repoName)
            max_dependency_count = 0
            for node in repo["dependencyGraphManifests"]["nodes"]:
                dependency_count = node["dependencies"]["totalCount"]
                if dependency_count > max_dependency_count:
                    max_dependency_count = dependency_count
            if max_dependency_count > 100:
                over_100_dic["max_dependency_count"].add(repoName)
            if repo["languages"]["totalCount"] > 100:
                over_100_dic["language_count"].add(repoName)
            if repo["repositoryTopics"]["totalCount"] > 100:
                over_100_dic["topic_count"].add(repoName)
        except Exception as e:
            # Malformed/incomplete responses are reported but do not abort.
            print(e)
            print("exception at: " + os.path.join(repo_path, repo_file))
    return over_100_dic
Exemple #18
0
def stat_topic_set(
        repo_dir_path,
        out_csv=r"C:\Disk_Dev\Repository\github-KG\github-KG-python\tx_data\resource\paperswithcode\sorted_topic_set_hasdfm.csv"
):
    """Collect the sorted set of topics across all repos that have at least
    one dependency graph manifest, and dump it to *out_csv*.

    :param repo_dir_path: directory containing repo JSON files
    :param out_csv: output CSV path — parameterised; defaults to the original
        hard-coded location for backward compatibility
    :return: sorted list of topic names
    """
    topic_list = []
    for repo_file in os.listdir(repo_dir_path):
        if os.path.splitext(repo_file)[1] != ".json":
            continue
        json_dic = read_json_file(os.path.join(repo_dir_path, repo_file))
        dgm_count = jsonpath.jsonpath(
            json_dic,
            "$.data.repository.dependencyGraphManifests.totalCount")[0]
        # Repos without manifests contribute no topics.
        if dgm_count == 0:
            continue
        repo_topic_list = jsonpath.jsonpath(
            json_dic, "$.data.repository.repositoryTopics.nodes[*].topic.name")
        # jsonpath returns False when the repo has no topics.
        if repo_topic_list is False:
            continue
        topic_list.extend(repo_topic_list)
    # sorted() replaces the manual list()+.sort() pair.
    sorted_topic_set = sorted(set(topic_list))
    df = pd.DataFrame(columns=["topic"], data=sorted_topic_set)
    df.to_csv(out_csv, encoding='utf-8')
    return sorted_topic_set
Exemple #19
0
def jsonpath_not_exist_key():
    """Probe how jsonpath behaves on a missing key: returns the matches for
    ``$.errors``, which is ``False`` when the sample file lacks that key."""
    sample_dic = read_json_file(
        r"C:\Disk_Data\Small_Data\Neo4j\tensorflow-$-tensorflow.json")
    return jsonpath.jsonpath(sample_dic, "$.errors")
Exemple #20
0
def add_recommender_attr_and_label(reco_result_json_file, package_degree_csv,
                                   **kwargs):
    """Label one recommender's result file; the whole data set acts as the
    test set.

    For every payload repo, each recommended package record gets a
    ``repoDegree`` (popularity) attribute and a ``hit`` label (1 iff the
    package is in that repo's test set).  Test-set packages missing from the
    recommendation list are appended with score ``'?'`` and hit 1, so the
    final labelled set is the union of the recommendation list (precision)
    and the test set (recall).  The result is dumped as JSON.

    :param package_degree_csv: CSV with ``nameWithManager``/``repoDegree`` columns
    :param reco_result_json_file: recommender result JSON file to label
    :param kwargs: split/dump configuration (``topN``, ``to_reco_csv_file``,
        ``out_dir``, ``init_input_count``, ``interaction_iter_count``, ...)
    :return: None (writes the labelled JSON file as a side effect)
    """
    # if len(kwargs.get('test_ks')) != kwargs.get('split_M'):
    #     print('数据集应全作为测试集')
    #     exit(0)
    if kwargs.get('topN') < 1000:
        print('topN param error')
        exit(0)
    # train_row_list, test_row_list, train_dic, test_dic = split_train_test_dic(**kwargs)
    # total_test_repo_set = test_dic.keys()
    # payload_repo_set = sorted(total_test_repo_set)
    init_train_test_dic = split_portrait_train_test_by_count(**kwargs)
    vali_repo_list = init_train_test_dic.keys()
    payload_repo_set = set()
    for nameWithOwner in vali_repo_list:
        # Keep only repos that actually have a (package) test set.
        if len(init_train_test_dic[nameWithOwner]['test']['package']) > 0:
            payload_repo_set.add(nameWithOwner)
    payload_repo_set = sorted(payload_repo_set)
    reco_result_json_dic = read_json_file(reco_result_json_file)
    package_degree_records = pd.read_csv(package_degree_csv).to_dict(
        orient='records')
    package_degree_dic = {}
    for dic in package_degree_records:
        package_degree_dic[dic['nameWithManager']] = dic['repoDegree']
    add_label_result_json_dic = OrderedDict()
    # Check the recommendation result for every repo in the test set.
    for index, nameWithOwner in enumerate(payload_repo_set):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" + str(len(payload_repo_set)) +
              ", repo: " + str(nameWithOwner))
        # NOTE(review): despite the original intent ("modify a copy"), this
        # assignment is an alias, not a copy — the edits below also mutate
        # reco_result_json_dic's records in place.
        add_label_result_json_dic[nameWithOwner] = reco_result_json_dic[
            nameWithOwner]
        # test_package_set = set(test_dic[nameWithOwner])
        test_package_set = set(
            init_train_test_dic[nameWithOwner]['test']['package'])
        train_package_set = set([
            dic['key']
            for dic in init_train_test_dic[nameWithOwner]['train']['package']
        ])
        reco_package_set = set()
        # Check, for each recommended package, whether it hits the test set.
        # reco_result_package_record_list = reco_result_json_dic[nameWithOwner]
        reco_package_record_list = add_label_result_json_dic[nameWithOwner][
            'package']
        for record in reco_package_record_list:
            # nameWithManager = record['nameWithManager']
            nameWithManager = record['key']
            reco_package_set.add(nameWithManager)
            # Attach popularity (repo degree).
            record['repoDegree'] = package_degree_dic[nameWithManager]
            # hit=1 iff the package appears in the test set; positive/negative
            # labelling depends solely on test-set membership.
            if nameWithManager in test_package_set:
                # This mutates the original record dict in place.
                record['hit'] = 1
            else:
                record['hit'] = 0
        # Append records for test-set packages absent from both the
        # recommendation list and the train set, with score '?' and hit 1.
        # not_in_reco_but_in_test_package_set = test_package_set - reco_package_set
        not_in_reco_and_train_but_in_test_package_set = test_package_set - train_package_set - reco_package_set
        # for package in not_in_reco_but_in_test_package_set:
        for package in not_in_reco_and_train_but_in_test_package_set:
            # reco_result_json_dic[nameWithOwner].append({'score': '?',
            reco_package_record_list.append({
                'score':
                '?',
                # 'nameWithManager': package,
                'key':
                package,
                'repoDegree':
                package_degree_dic[package],
                'hit':
                1
            })

    # Dump the labelled result.
    to_reco_csv_file_name = kwargs.get('to_reco_csv_file').split('\\')[-1:][0]
    # split_dir_name = 'split_' + str(kwargs.get('split_M')) + '_test_' + "_".join(map(lambda x: str(x), kwargs.get('test_ks')))
    out_dir = os.path.join(kwargs.get('out_dir'),
                           os.path.splitext(to_reco_csv_file_name)[0],
                           'init_' + str(kwargs.get('init_input_count')),
                           'iter_' + str(kwargs.get('interaction_iter_count')),
                           'top' + str(kwargs.get('topN')), 'label')
    dump_file_name = os.path.splitext(
        reco_result_json_file.split('\\')[-1:][0])[0] + '_label_' + str(
            uuid1()) + '.json'
    write_json_file(out_dir, os.path.join(out_dir, dump_file_name),
                    add_label_result_json_dic)
    pass