Exemple #1
0
def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a compound search model that combines two sub-models.

    Both ``*_model_config`` arguments are read positionally: element 0 is the
    ``model_type`` used to resolve the sub-model directory (and to build the
    compound model's name), elements 1 and 2 are forwarded unchanged in the
    sub-model entry.  The entry for the first config ends with ``False`` and
    the entry for the second config ends with ``True``.

    :param pro_name: project name handed to PathUtil
    :param version: version handed to PathUtil
    :param first_model_config: indexable config of the first sub-model
    :param second_model_config: indexable config of the second sub-model
    :return: the compound model directory resolved by PathUtil.sim_model
    """
    raw_collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    # One entry per sub-model, in the order (first, second).
    sub_search_model_config = [
        (PathUtil.sim_model(pro_name=pro_name,
                            version=version,
                            model_type=config[0]),
         config[1], config[2], trailing_flag)
        for config, trailing_flag in ((first_model_config, False),
                                      (second_model_config, True))
    ]

    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type=compound_model_name)
    CompoundSearchModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        sub_search_model_config=sub_search_model_config)
    return model_dir_path
Exemple #2
0
 def load_doc(project_name="android27", version="v1"):
     """
     Load the document collection stored for a project/version pair.

     :param project_name: project name passed to PathUtil.doc (e.g. jdk8, android27)
     :param version: version passed to PathUtil.doc
     :return: the result of MultiFieldDocumentCollection.load for that path
     """
     return MultiFieldDocumentCollection.load(
         PathUtil.doc(pro_name=project_name, version=version))
Exemple #3
0
    def init(self, doc_collection):
        """
        Initialise ``self.doc_collection`` from an existing document collection.

        :param doc_collection: a MultiFieldDocumentCollection instance, or a
            ``Path`` / ``str`` that is handed to MultiFieldDocumentCollection.load.
            Any other non-None value leaves ``self.doc_collection`` set to None.
        :return: None
        :raises Exception: when ``doc_collection`` is None
        """
        if doc_collection is None:
            raise Exception("init from None")

        if isinstance(doc_collection, MultiFieldDocumentCollection):
            resolved = doc_collection
        elif isinstance(doc_collection, Path):
            # load() is given the path in string form
            resolved = MultiFieldDocumentCollection.load(str(doc_collection))
        elif isinstance(doc_collection, str):
            resolved = MultiFieldDocumentCollection.load(doc_collection)
        else:
            resolved = None

        self.doc_collection = resolved
        print("init complete")
Exemple #4
0
 def __init__(self, graph_data_path, dc_file_location, concepts_path, relations_path):
     """
     Load the graph, the document collection and the two JSON lists.

     :param graph_data_path: path passed to GraphData.load
     :param dc_file_location: path passed to MultiFieldDocumentCollection.load
     :param concepts_path: JSON file whose parsed content becomes ``concepts_list``
     :param relations_path: JSON file whose parsed content becomes ``relations_list``
     """
     self.graph: GraphData = GraphData.load(graph_data_path)
     self.doc_collection: MultiFieldDocumentCollection = (
         MultiFieldDocumentCollection.load(dc_file_location))

     with open(concepts_path) as concepts_file:
         self.concepts_list = json.load(concepts_file)
     with open(relations_path) as relations_file:
         self.relations_list = json.load(relations_file)

     # starts empty; nothing in this constructor fills it
     self.concept_2_node_id = dict()
def build_doc(pro_name, version):
    """Preprocess a project's document collection with CodeDocPreprocessor and save it.

    The source collection is read from ``PathUtil.doc`` and the preprocessed
    result is written to ``PathUtil.pre_doc`` with ``pre_way="code-pre"``.

    :param pro_name: project name handed to PathUtil
    :param version: version handed to PathUtil
    :return: None
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name, version=version, pre_way="code-pre")

    raw_docs: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        source_path)
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=raw_docs)
    preprocessed.save(target_path)
Exemple #6
0
def train_model(pro_name, version):
    """Train a BM25 model on the preprocessed document collection.

    :param pro_name: project name handed to PathUtil
    :param version: version handed to PathUtil
    :return: the "bm25" model directory resolved by PathUtil.sim_model
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    preprocessed_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_docs)

    bm25_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="bm25")
    BM25Model.train(bm25_dir, doc_collection=preprocessed_docs)
    return bm25_dir
    def build_pre_doc(self, input_doc_collection_path, output_pre_doc_collection_path, preprocessor=None):
        """
        Preprocess a stored document collection and save the result.

        :param input_doc_collection_path: path passed to MultiFieldDocumentCollection.load
        :param output_pre_doc_collection_path: path passed to the preprocessed collection's save()
        :param preprocessor: preprocessor to apply; a new CodeDocPreprocessor is created when None
        :return: None
        """
        # Identity check on purpose: "== None" would dispatch to a custom
        # __eq__ if the preprocessor class defines one.
        if preprocessor is None:
            preprocessor = CodeDocPreprocessor()

        print("stat preprocess doc - for %s %r " % (input_doc_collection_path, preprocessor))
        doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(input_doc_collection_path)
        preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
            preprocessor=preprocessor, doc_collection=doc_collection)

        preprocessed_collection.save(output_pre_doc_collection_path)
        print("end preprocess doc - %r %r " % (output_pre_doc_collection_path, preprocessor))
Exemple #8
0
def train_avg_w2v_model(pro_name, version):
    """Preprocess the document collection, save it, and train the "avg_w2v" model.

    The preprocessed collection is written to ``PathUtil.pre_doc`` with
    ``pre_way="code-pre"`` before training starts.

    :param pro_name: project name handed to PathUtil
    :param version: version handed to PathUtil
    :return: the "avg_w2v" model directory resolved by PathUtil.sim_model
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    pre_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)
    pre_docs.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=model_dir, doc_collection=pre_docs)
    return model_dir
 def __init__(self, pro_name, version):
     """
     Resolve the paths and load the data used with the "svm" model.

     :param pro_name: project name handed to PathUtil
     :param version: version handed to PathUtil
     """
     # keyword pair shared by the PathUtil lookups that take keywords
     release = {"pro_name": pro_name, "version": version}

     self.model_dir_path = PathUtil.sim_model(model_type="svm", **release)
     self.model = FilterSemanticTFIDFNode2VectorModel(
         name="svm", model_dir_path=self.model_dir_path)

     self.document_collection_path = PathUtil.doc(pro_name, version)
     self.collection = MultiFieldDocumentCollection.load(
         str(self.document_collection_path))
     self.processor = Preprocessor()
     self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
         self.processor, self.collection)

     self.pretrain_node2vec_path = PathUtil.node2vec(weight="unweight", **release)
     self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
     self.doc_sim_model_path = PathUtil.sim_model(model_type="avg_w2v", **release)
Exemple #10
0
def train_model(pro_name, version, weight):
    """Train the "avg_n2v" model from the preprocessed docs and pretrained node2vec data.

    :param pro_name: project name handed to PathUtil
    :param version: version handed to PathUtil
    :param weight: forwarded to PathUtil.node2vec to pick the pretrained vectors
    :return: the "avg_n2v" model directory resolved by PathUtil.sim_model
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)

    # Resolve every input location first, then hand them all to train().
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_path = PathUtil.node2vec(
        pro_name=pro_name, version=version, weight=weight)
    name_searcher_path = PathUtil.name_searcher(
        pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(
        pro_name=pro_name, version=version, model_type="avg_n2v")

    AVGNode2VectorModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        embedding_size=100,
        pretrain_node2vec_path=node2vec_path,
        graph_data_path=graph_path,
        kg_name_searcher_path=name_searcher_path,
    )
    return model_dir_path
from sekg.graph.exporter.graph_data import NodeInfo
import json
from definitions import OUTPUT_DIR
from pathlib import Path

# Script configuration: every location below is resolved for this project.
pro_name = 'jabref'
# Document collection that the script loads (version 'v1').
dc_file_location = PathUtil.doc(pro_name=pro_name, version='v1')
# Graph data that the script loads (version 'v1.8').
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name,
                                               version='v1.8')
# Document-collection path for version 'v1.1'; presumably where the updated
# collection is written -- the write is not visible in this excerpt.
dc_file_destination = PathUtil.doc(pro_name=pro_name, version='v1.1')
# JSON input files under OUTPUT_DIR/json.
comment_json_file = Path(OUTPUT_DIR) / "json" / "mid_2_dp_comment.json"
qualified_name_json_file = Path(
    OUTPUT_DIR) / "json" / "mid_2_qualified_name.json"

if __name__ == '__main__':
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)

    # Both input files are parsed one JSON document per line.  The files are
    # opened in `with` blocks so the handles are closed as soon as the lines
    # have been read instead of being left to the garbage collector.
    with open(comment_json_file, 'r') as comment_file:
        comments = comment_file.readlines()
    comment_list = [json.loads(line) for line in comments]

    with open(qualified_name_json_file, 'r') as name_file:
        names = name_file.readlines()
    qualified_name_list = [json.loads(line) for line in names]

    missing_count = 0
    # For each qualified name, find the matching graph-data node to get its api_id, then use that api_id to find the corresponding doc in doc_collection and insert the field and its information
    for item in qualified_name_list:
Exemple #12
0
from flask_cors import CORS
from sekg.ir.doc.wrapper import MultiFieldDocumentCollection, MultiFieldDocument
from sekg.graph.exporter.graph_data import GraphData, NodeInfo

from project.knowledge_service import KnowledgeService
from project.doc_service import DocService
from project.json_service import JsonService
from project.utils.path_util import PathUtil

# Flask application; the CORS rule below applies to every route ("/*") and
# accepts any origin ("*").
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# Data for the 'jabref' project is loaded once, at import time.
pro_name = "jabref"
data_dir = PathUtil.doc(pro_name=pro_name, version="v1.2")
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1.8")
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    data_dir)
# Service objects used by the request handlers; only KnowledgeService is
# given the loaded document collection.
knowledge_service = KnowledgeService(doc_collection)
doc_service = DocService()
json_service = JsonService()


@app.route('/')
def hello():
    """Root endpoint: always answer with the fixed text 'success'."""
    response_text = 'success'
    return response_text


# search doc info according to method name
@app.route('/get_doc/', methods=["GET", "POST"])
def doc_info():
    if "qualified_name" not in request.json:
        return "qualified name need"
'''
Cluster the sample code into groups and output the result
'''

# Script configuration: every location below is resolved for this project.
pro_name = "jabref"
graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.9")
# Collection that is loaded below (v3.2) and a second collection path (v3.3);
# presumably the save target -- the save call is not visible in this excerpt.
doc_collection_path = PathUtil.doc(pro_name=pro_name, version="v3.2")
doc_collection_save_path = PathUtil.doc(pro_name=pro_name, version="v3.3")
# JSON input files under ROOT_DIR/output/json.
api_to_example_json_path = Path(
    definitions.ROOT_DIR) / "output" / "json" / "api_2_example_sorted.json"
mid_to_method_info_json_path = Path(
    definitions.ROOT_DIR
) / "output" / "json" / "mid_2_method_info_without_comment.json"

# Both data sets are loaded at import time.
graph_data: GraphData = GraphData.load(graph_data_path)
doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
    doc_collection_path)

# Read the sample-code files. api_to_mid: for each API, the mids of its sample code. methods_info: the code for each mid
with open(api_to_example_json_path, 'r') as f:
    api_to_mid = json.load(f)
# The second file holds one JSON object per line; only each object's 'method'
# value is kept.  It is read inside a `with` block so the handle is closed
# right after reading (no explicit close() is needed after a `with`).
with open(mid_to_method_info_json_path, 'r') as f:
    methods = f.readlines()
methods_info = [json.loads(method)['method'] for method in methods]


# Look up the doc by its qualified name
def find_doc(qualified_name):
    node: NodeInfo = graph_data.find_one_node_by_property(
        property_name='qualified_name', property_value=qualified_name)
Exemple #14
0
 def __init__(self):
     """
     Load the 'jabref' document collection (version "v3.3") into ``self.doc_collection``.

     NOTE(review): only these lines are visible in this excerpt; the
     constructor may continue beyond them.
     """
     pro_name = "jabref"
     data_dir = PathUtil.doc(pro_name=pro_name, version="v3.3")
     self.doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
         data_dir)