Exemple #1
0
def text2feat(api, api_descriptions, w2v, idf, query_matrix, query_idf_vector):
    """Build the query-similarity features for one API and its description.

    Args:
        api: API text; converted with ``feedback.load_matrix`` and used as
            the key of the second returned dict.
        api_descriptions: Description text, or the literal string ``'null'``
            when there is no description; used as the key of the third
            returned dict.
        w2v: Forwarded unchanged to ``feedback.load_matrix``.
        idf: Forwarded unchanged to ``feedback.load_matrix``.
        query_matrix: Query representation passed to
            ``similarity.sim_doc_pair``.
        query_idf_vector: Query IDF weights passed to
            ``similarity.sim_doc_pair``.

    Returns:
        A tuple ``(sum_inf, api_inf, api_desc_inf)`` where ``sum_inf`` is
        ``[api_sim, api_desc_sim]``, ``api_inf`` is ``{api: api_sim}`` and
        ``api_desc_inf`` is ``{api_descriptions: api_desc_sim}``.
    """
    api_matrix, api_idf_vector = feedback.load_matrix(api, w2v, idf)
    # The description is converted even when it is the 'null' sentinel; the
    # result is simply unused in that case.
    desc_matrix, desc_idf_vector = feedback.load_matrix(
        api_descriptions, w2v, idf)

    # Similarity between the query and the API text itself.
    api_sim = similarity.sim_doc_pair(query_matrix, api_matrix,
                                      query_idf_vector, api_idf_vector)

    # A 'null' description contributes a similarity of 0.
    api_desc_sim = 0 if api_descriptions == 'null' else similarity.sim_doc_pair(
        query_matrix, desc_matrix, query_idf_vector, desc_idf_vector)

    # The same two values are exposed both as a flat feature list and as
    # single-entry dicts keyed by the text they were computed from.
    sum_inf = [api_sim, api_desc_sim]
    return sum_inf, {api: api_sim}, {api_descriptions: api_desc_sim}
def get_sim_query(train, test, w2v, idf):
    """Return the similarity between ``train`` and ``test``.

    Both inputs are converted with ``load_matrix`` and compared with
    ``similarity.sim_doc_pair``.

    Args:
        train: First text; passed whole to ``load_matrix``. Must support
            ``len()``.
        test: Second text; passed whole to ``load_matrix``.
        w2v: Forwarded unchanged to ``load_matrix``.
        idf: Forwarded unchanged to ``load_matrix``.

    Returns:
        ``0`` when ``train`` is empty, otherwise the value returned by
        ``similarity.sim_doc_pair``.
    """
    # An empty ``train`` has always produced 0 without touching the model.
    # ``len()`` is used rather than truthiness so array-like inputs work.
    if len(train) == 0:
        return 0

    # The previous implementation repeated these exact calls len(train)
    # times with identical arguments and kept only the last result, so a
    # single evaluation yields the same value.
    train_matrix, train_idf = load_matrix(train, w2v, idf)
    test_matrix, test_idf = load_matrix(test, w2v, idf)
    return similarity.sim_doc_pair(train_matrix, test_matrix, train_idf,
                                   test_idf)
def get_topk_questions(origin_query, query_matrix, query_idf_vector, questions,
                       topk, parent):
    """Return the questions most similar to the query.

    Args:
        origin_query: Raw query text, compared against question titles.
        query_matrix: Query representation passed to
            ``similarity.sim_doc_pair``.
        query_idf_vector: Query IDF weights passed to
            ``similarity.sim_doc_pair``.
        questions: Iterable of objects exposing ``id``, ``title``,
            ``answers`` (each with a ``score``), ``matrix`` and
            ``idf_vector``.
        topk: Number of ranked entries to keep; ranking stops at the entry
            whose 1-based rank equals ``topk``, so a value that is never
            reached keeps everything.
        parent: Mapping from question id to a group representative id. It is
            mutated: a question matching the query is registered as its own
            parent if it has no entry yet.

    Returns:
        A dict mapping question id to its similarity with the query, filled
        in descending order of similarity.
    """
    # Find the query inside the dataset (exact title match or containment in
    # either direction). The last matching question wins.
    query_id = '-1'
    for candidate in questions:
        title = candidate.title
        if title == origin_query or title in origin_query or origin_query in title:
            query_id = candidate.id
            parent.setdefault(query_id, query_id)

    scored = []
    for candidate in questions:
        # Skip questions in the same duplicate group as the query itself.
        same_group = (query_id in parent and candidate.id in parent
                      and parent[query_id] == parent[candidate.id])
        if same_group:
            continue

        # A question is usable only if some answer has a non-negative score.
        # The list is built in full so every score is converted with int().
        if not any([int(answer.score) >= 0 for answer in candidate.answers]):
            continue

        sim = similarity.sim_doc_pair(query_matrix, candidate.matrix,
                                      query_idf_vector, candidate.idf_vector)
        scored.append((candidate.id, candidate.title, sim))

    # Most similar first; the sort is stable, so ties keep dataset order.
    scored.sort(key=lambda entry: entry[2], reverse=True)

    top_questions = {}
    for rank, (question_id, _title, sim) in enumerate(scored, start=1):
        top_questions[question_id] = sim
        if rank == topk:
            break

    return top_questions
def get_feedback_api(query, answer, query_matrix, query_idf_vector, w2v, idf):
    """Collect API feedback from previously answered queries similar to this one.

    ``query`` and ``answer`` are parallel sequences: ``answer[i]`` holds the
    APIs recorded for ``query[i]``. The first row is skipped (presumably a
    header row -- confirm against the caller).

    Args:
        query: Sequence of feedback query texts, indexed like ``answer``.
        answer: Sequence of rows, each a sequence of APIs.
        query_matrix: Representation of the current query, passed to
            ``similarity.sim_doc_pair``.
        query_idf_vector: IDF weights of the current query.
        w2v: Forwarded unchanged to ``load_matrix``.
        idf: Forwarded unchanged to ``load_matrix``.

    Returns:
        A tuple ``(feeds, feed_sim)``. ``feeds`` is a list of
        ``[feedback_query, api, sim]`` entries sorted by descending ``sim``
        and padded with ``[0, 0, 0]`` up to at least five entries.
        ``feed_sim`` holds the ``sim`` values of the first five entries.
    """
    feeds = []
    # Pair rows with queries by position. Looking the row up again with
    # ``answer.index(row)`` returns the first equal row, so duplicate rows
    # were matched to the wrong query (and each lookup was a linear scan).
    for idx, row in enumerate(answer):
        if idx == 0:
            continue

        feedback_query = query[idx]
        question_matrix, question_idf_vector = load_matrix(
            feedback_query, w2v, idf)
        sim = similarity.sim_doc_pair(query_matrix, question_matrix,
                                      query_idf_vector, question_idf_vector)

        # Only sufficiently similar feedback queries contribute their APIs.
        if sim > 0.65:
            feeds.extend([feedback_query, api, sim] for api in row)

    feeds.sort(key=lambda item: item[2], reverse=True)

    # Guarantee at least five entries so callers get a fixed-size feature.
    feeds.extend([0, 0, 0] for _ in range(5 - len(feeds)))

    feed_sim = [feed[2] for feed in feeds[:5]]
    return feeds, feed_sim
def recommend_api_class(query_matrix, query_idf_vector, top_questions,
                        questions, javadoc, javadoc_dict_classes, topk):
    """Recommend API classes for a query.

    Classes mentioned in the answers of the top questions get a
    question-based score; it is combined with a documentation-based score
    through a harmonic mean.

    Args:
        query_matrix: Query representation passed to
            ``similarity.sim_doc_pair``.
        query_idf_vector: Query IDF weights passed to
            ``similarity.sim_doc_pair``.
        top_questions: Dict mapping question id to its similarity with the
            query; only questions whose id is a key are inspected.
        questions: Iterable of question objects exposing ``id`` and
            ``answers`` (each answer has ``score`` and an HTML ``body``).
        javadoc: Iterable of API class objects exposing ``package_name``,
            ``class_name``, ``methods_matrix`` and ``methods_idf_vector``.
        javadoc_dict_classes: Dict mapping the text of a ``<code>`` element
            (cut at the first ``(``) to a fully qualified class name.
        topk: Maximum number of names to return; ``-1`` returns all of them.
            For any other value the result stops growing once it holds at
            least ``topk`` names.

    Returns:
        A list of ``package.Class`` names ordered by descending score.
    """
    api_classes = {}        # class name -> summed similarity of its mentions
    api_classes_count = {}  # class name -> number of mentions

    def record_mention(class_name, weight):
        # Accumulate one mention of ``class_name`` weighted by the question.
        api_classes[class_name] = api_classes.get(class_name, 0) + weight
        api_classes_count[class_name] = api_classes_count.get(class_name, 0) + 1

    for question in questions:
        if question.id not in top_questions:
            continue
        question_sim = top_questions[question.id]

        for answer in question.answers:
            if int(answer.score) < 0:
                continue

            soup = BeautifulSoup(answer.body,
                                 'html.parser',
                                 from_encoding='utf-8')

            for anchor in soup.find_all('a'):
                # Anchors without an href (e.g. named anchors) cannot be API
                # links; indexing them with ['href'] would raise KeyError.
                href = anchor.get('href')
                if not href:
                    continue
                if ('docs.oracle.com/javase/' in href and '/api/' in href
                        and 'html' in href):
                    # pair[0] is the class name (package included, e.g.
                    # java.util.Calendar), pair[1] is the method name.
                    pair = util.parse_api_link(href)
                    record_mention(pair[0], question_sim)

            for code_tag in soup.find_all('code'):
                snippet = code_tag.get_text()
                paren = snippet.find('(')
                if paren != -1:
                    snippet = snippet[:paren]
                if snippet in javadoc_dict_classes:
                    record_mention(javadoc_dict_classes[snippet], question_sim)

    # Question-based score: mean similarity, boosted by log2 of the mention
    # count and capped at 1.0.
    so_sims = {}
    for class_name, total in api_classes.items():
        count = api_classes_count[class_name]
        so_sims[class_name] = min(
            1.0, total / count * (1.0 + math.log(count, 2) / 10))

    api_sim = {}
    for api in javadoc:
        full_name = api.package_name + '.' + api.class_name
        if full_name not in so_sims:
            continue

        # Documentation-based score: best similarity over the class methods.
        doc_sim = 0.0
        for i, method_matrix in enumerate(api.methods_matrix):
            doc_sim = max(
                doc_sim,
                similarity.sim_doc_pair(query_matrix, method_matrix,
                                        query_idf_vector,
                                        api.methods_idf_vector[i]))

        so_sim = so_sims[full_name]

        # Harmonic mean of both scores; a zero denominator (e.g. both scores
        # are 0) yields 0.0 instead of raising ZeroDivisionError.
        denominator = doc_sim + so_sim
        api_sim[full_name] = (2 * doc_sim * so_sim / denominator
                              if denominator else 0.0)

    ranked = sorted(api_sim.items(), key=lambda item: item[1], reverse=True)

    recommended_api = []
    for name, _score in ranked:
        recommended_api.append(name)
        if topk != -1 and len(recommended_api) >= topk:
            break

    return recommended_api
def recommend_api(query_matrix, query_idf_vector, top_questions, questions,
                  javadoc, javadoc_dict_methods, topk):
    """Recommend API methods for a query.

    Methods mentioned in the answers of the top questions get a
    question-based score; it is combined with the similarity between the
    query and the method documentation through a harmonic mean.

    Args:
        query_matrix: Query representation passed to
            ``similarity.sim_doc_pair``.
        query_idf_vector: Query IDF weights passed to
            ``similarity.sim_doc_pair``.
        top_questions: Dict mapping question id to its similarity with the
            query; only questions whose id is a key are inspected.
        questions: Iterable of question objects exposing ``id`` and
            ``answers`` (each answer has ``score`` and an HTML ``body``).
        javadoc: Iterable of API class objects exposing ``package_name``,
            ``class_name``, ``methods``, ``methods_matrix`` and
            ``methods_idf_vector``.
        javadoc_dict_methods: Dict mapping the text of a ``<code>`` element
            (cut at the first ``(``) to a fully qualified method name.
        topk: Maximum number of names to return; ``-1`` returns all of them.
            For any other value the result stops growing once it holds at
            least ``topk`` names.

    Returns:
        A list of ``package.Class.method`` names ordered by descending score.
    """
    api_methods = {}        # method name -> summed similarity of its mentions
    api_methods_count = {}  # method name -> number of questions mentioning it

    def record_mention(method_name, weight, seen):
        # Count a method at most once per question, whatever the number of
        # answers, links or code snippets that mention it.
        if method_name in seen:
            return
        seen.add(method_name)
        api_methods[method_name] = api_methods.get(method_name, 0) + weight
        api_methods_count[method_name] = (
            api_methods_count.get(method_name, 0.0) + 1)

    for question in questions:
        if question.id not in top_questions:
            continue
        question_sim = top_questions[question.id]
        seen = set()

        for answer in question.answers:
            if int(answer.score) < 0:
                continue

            soup = BeautifulSoup(answer.body, 'html.parser')

            for anchor in soup.find_all('a'):
                # Anchors without an href (e.g. named anchors) cannot be API
                # links; indexing them with ['href'] would raise KeyError.
                href = anchor.get('href')
                if not href:
                    continue
                if ('docs.oracle.com/javase/' in href and '/api/' in href
                        and 'html' in href):
                    # pair[0] is the class name, pair[1] is the method name.
                    pair = util.parse_api_link(href)
                    if pair[1] != '':
                        record_mention(pair[0] + '.' + pair[1], question_sim,
                                       seen)

            for code_tag in soup.find_all('code'):
                snippet = code_tag.get_text()
                paren = snippet.find('(')
                if paren != -1:
                    snippet = snippet[:paren]
                if snippet in javadoc_dict_methods:
                    record_mention(javadoc_dict_methods[snippet], question_sim,
                                   seen)

    # Question-based score: mean similarity, boosted by log2 of the mention
    # count and capped at 1.0.
    so_sims = {}
    for method_name, total in api_methods.items():
        count = api_methods_count[method_name]
        so_sims[method_name] = min(
            1.0, total / count * (1.0 + math.log(count, 2) / 10))

    api_sim = {}
    for api in javadoc:
        class_name = api.package_name + '.' + api.class_name

        for i, method in enumerate(api.methods):
            method_name = class_name + '.' + method
            if method_name not in so_sims:
                continue

            doc_sim = similarity.sim_doc_pair(query_matrix,
                                              api.methods_matrix[i],
                                              query_idf_vector,
                                              api.methods_idf_vector[i])
            so_sim = so_sims[method_name]

            # Harmonic mean of both scores; a zero denominator (e.g. both
            # scores are 0) yields 0.0 instead of raising ZeroDivisionError.
            denominator = doc_sim + so_sim
            score = 2 * doc_sim * so_sim / denominator if denominator else 0.0

            # The same name can occur several times (overloads); keep the
            # best score.
            if method_name in api_sim:
                api_sim[method_name] = max(api_sim[method_name], score)
            else:
                api_sim[method_name] = score

    ranked = sorted(api_sim.items(), key=lambda item: item[1], reverse=True)

    recommended_api = []
    for name, _score in ranked:
        recommended_api.append(name)
        if topk != -1 and len(recommended_api) >= topk:
            break

    return recommended_api