def build_v2_1_graph_for_pro(pro_name):
    """Build the "v2_1" graph of a project from its "v2" graph.

    Resolves the "v2" graph path (and prints it), the "v2_1" graph path and
    the "v2" domain-concept directory through ``PathUtil``, then hands all of
    them to ``CodeGraphBuilder.build_v2_graph``.

    Args:
        pro_name: Project name used to resolve every path.
    """
    graph_builder = CodeGraphBuilder()

    source_graph_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    print(source_graph_path)

    # Dict values are evaluated top to bottom, so the remaining paths are
    # resolved in the same order as before the call.
    build_kwargs = {
        "pro_name": pro_name,
        "input_graph_data_path": source_graph_path,
        "output_graph_data_path": PathUtil.graph_data(pro_name=pro_name,
                                                      version="v2_1"),
        "domain_concept_output_dir": PathUtil.domain_concept_dir(pro_name=pro_name,
                                                                 version="v2"),
    }
    graph_builder.build_v2_graph(**build_kwargs)
# Example no. 2
# 0
def build_v3_graph_for_pro(pro_name):
    """Run the cache-based "v3" graph build for a project and print the graph info.

    The input and the output graph paths are both resolved for version "v3".
    The word2vec model path is the "avg_w2v" similarity model of version "v3".
    No generic title-search cache is supplied (``None`` is passed).

    Args:
        pro_name: Project name used to resolve every path.
    """
    graph_builder = CodeGraphBuilder()

    source_graph_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    w2v_model_path = PathUtil.sim_model(pro_name=pro_name, version="v3", model_type="avg_w2v")
    target_graph_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    wikidata_item_cache_path = PathUtil.generic_wikidata_item_cache()
    fusion_tmp_dir = PathUtil.wikidata_fusion_temp_result_dir(pro_name)

    build_kwargs = {
        "pro_name": pro_name,
        "input_graph_data_path": source_graph_path,
        "word2vec_model_path": w2v_model_path,
        "output_graph_data_path": target_graph_path,
        "generic_title_search_cache_path": None,
        "generic_wikidata_item_cache_path": wikidata_item_cache_path,
        "fusion_temp_result_dir": fusion_tmp_dir,
    }
    built_graph = graph_builder.build_v3_graph_from_cache_simple(**build_kwargs)
    built_graph.print_graph_info()
# Example no. 3
# 0
def train_name_searcher(pro_name, version):
    """Train a ``KGNameSearcher`` on a project's graph data file and save it.

    Args:
        pro_name: Project name used to resolve the graph and searcher paths.
        version: Graph version used to resolve the graph and searcher paths.
    """
    print(f"train graph name searcher for {pro_name} at version {version}")

    save_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)

    name_searcher = KGNameSearcher.train_from_graph_data_file(
        graph_data_path=graph_path,
        node_info_factory=ProjectKGNodeInfoFactory(),
    )
    name_searcher.save(save_path)

    print(f"finish... save to {save_path}")
def build_v2_graph_for_pro(pro_name):
    """Build the "v2" graph of a project by attaching classified sentences to its "v1" graph.

    For every node, the ``short_description`` property is stripped of
    ``<...>`` tags, split into sentences and whitespace-normalised. Each
    sentence is cleaned with the preprocessor of the module-level
    ``classifier`` and then classified:

    * label ``"0"``: the sentence is printed and appended to
      ``<OUTPUT_DIR>/graph/<pro_name>/filter_data/filter_sentence.txt``;
    * any other label: the sentence is registered on the node through
      ``add_sentence_relation`` together with ``int(label)``.

    Finally ``save_new_graph_data`` is called on the importer, which was
    created with the "v2" graph path.

    Args:
        pro_name: Project name. It resolves the graph paths and the directory
            receiving the filtered sentences (previously hard-coded to
            "jdk8" regardless of the project being built).
    """
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    graph_data: GraphData = GraphData.load(graph_data_path)
    new_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    # NOTE(review): the meaning of the trailing ``2`` is defined by
    # ExtractResultImport and is not visible here -- confirm.
    res = ExtractResultImport(graph_data, new_graph_data_path, 2)

    # Keep the rejected sentences under the directory of the project being
    # processed, so that runs for different projects do not share one file.
    data_dir = Path(OUTPUT_DIR) / "graph" / pro_name / "filter_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    filter_sentence_path = str(data_dir / "filter_sentence.txt")

    html_tag_pattern = re.compile(r'<[^>]+>', re.S)
    preprocessor = classifier.preprocessor

    print("start to add sentences...")
    # Open the output once (append mode keeps the results of earlier runs)
    # instead of re-opening it for every rejected sentence.
    with open(filter_sentence_path, "a", encoding='utf-8') as filtered_file:
        for node_id in graph_data.get_node_ids():
            node_info = graph_data.get_node_info_dict(node_id)
            short_description = node_info["properties"].get(
                "short_description", "")
            if not short_description:
                continue

            plain_description = html_tag_pattern.sub('', short_description)
            for sentence in sent_tokenize(plain_description):
                # Collapse every run of whitespace into a single space.
                sentence = " ".join(sentence.split())
                # Only the classifier sees the cleaned text; the graph and
                # the filter file receive the readable sentence.
                text = preprocessor.remove_stop_words(
                    preprocessor.remove_sign(sentence))
                label = list(classifier.predict(text))[0]
                if label == "0":
                    print(sentence)
                    filtered_file.write(sentence)
                    filtered_file.write("\n")
                else:
                    res.add_sentence_relation(sentence, node_id, int(label))
    res.save_new_graph_data()
# Example no. 5
# 0
def train_model(pro_name, version, weight):
    """Train the "avg_n2v" similarity model of a project and return its directory.

    The project's document collection is loaded and preprocessed with a
    ``CodeDocPreprocessor``; training is delegated to
    ``AVGNode2VectorModel.train`` with an embedding size of 100.

    Args:
        pro_name: Project name used to resolve every path.
        version: Data version used to resolve every path.
        weight: Passed to ``PathUtil.node2vec`` to pick the pretrained
            node2vec file.

    Returns:
        The model directory path that was handed to the trainer.
    """
    raw_collection = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    preprocessed_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")

    train_kwargs = {
        "model_dir_path": model_dir_path,
        "doc_collection": preprocessed_docs,
        "embedding_size": 100,
        "pretrain_node2vec_path": node2vec_path,
        "graph_data_path": graph_path,
        "kg_name_searcher_path": searcher_path,
    }
    # The trained model object itself is not needed by the caller; only the
    # directory path is returned.
    AVGNode2VectorModel.train(**train_kwargs)
    return model_dir_path
 def __init__(self, pro_name, version, model_dir):
     """Load the project's graph data and create the search model.

     Args:
         pro_name: Project name used to resolve the graph data path.
         version: Graph version used to resolve the graph data path.
         model_dir: Forwarded to ``create_search_model``.
     """
     self.graph_data: GraphData = GraphData.load(
         PathUtil.graph_data(pro_name=pro_name, version=version)
     )
     self.model = self.create_search_model(pro_name, version, model_dir)
     print("It's ok for init!")
# Example no. 7
# 0
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel

from util.path_util import PathUtil

# Interactive console search over the "avg_n2v" model of jdk8 / v3.
if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3"
    # Load the previously trained model from its directory.
    model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                        version=version,
                                        model_type="avg_n2v")
    model = AVGNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    graph_data: GraphData = GraphData.load(graph_data_path)
    # Candidate ids per node kind. The `-` and `.update` calls below presume
    # get_node_ids_by_label returns a set -- TODO confirm.
    # Classes: nodes labelled "class" minus those also labelled "class type".
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    # Methods: nodes labelled "method" plus those labelled
    # "base override method".
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    # Endless prompt loop: there is no exit branch in the visible code, and a
    # non-numeric answer to either int(...) prompt raises ValueError.
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        # Default to an empty result when no branch matches.
        result = []
        if select == 1:
            # Restrict the search to class nodes.
            result = model.search(query=query,
                                  top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)
        elif select == 2:
            # NOTE(review): the body of this branch is missing from this excerpt.
# Example no. 8
# 0
def build_doc(pro_name, version):
    """Build the document collection of a project from its graph data.

    Resolves the graph data path and the document collection path through
    ``PathUtil`` and delegates to ``CodeGraphBuilder.build_doc``.

    Args:
        pro_name: Project name used to resolve both paths.
        version: Data version used to resolve both paths.
    """
    source_graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    doc_collection_path = PathUtil.doc(pro_name=pro_name, version=version)

    CodeGraphBuilder().build_doc(
        graph_data_path=source_graph_path,
        output_doc_collection_path=doc_collection_path,
    )
# Example no. 9
# 0
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.filter_semantic_tfidf_n2v import FilterSemanticTFIDFNode2VectorModel

from util.path_util import PathUtil

if __name__ == '__main__':
    model_dir_path = PathUtil.sim_model(pro_name="jdk8",
                                        version="v3",
                                        model_type="svm")
    model = FilterSemanticTFIDFNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name="jdk8", version="v3")
    graph_data: GraphData = GraphData.load(graph_data_path)
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        result = []
        if select == 1:
            result = model.search(query=query,
                                  top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)
        elif select == 2:
            result = model.search(query=query,
                                  top_num=top_num,