def build_v2_1_graph_for_pro(pro_name):
    # derive the v2_1 graph from the v2 graph (domain concept extraction)
    builder = CodeGraphBuilder()
    input_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    print(input_graph_data_path)
    output_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2_1")
    domain_concept_output_dir = PathUtil.domain_concept_dir(pro_name=pro_name, version="v2")
    builder.build_v2_graph(pro_name=pro_name,
                           input_graph_data_path=input_graph_data_path,
                           output_graph_data_path=output_graph_data_path,
                           domain_concept_output_dir=domain_concept_output_dir)
def build_v3_graph_for_pro(pro_name):
    # fuse generic Wikidata items into the v3 graph; input and output paths are
    # identical, so the v3 graph data is updated in place
    builder = CodeGraphBuilder()
    input_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version="v3", model_type="avg_w2v")
    output_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    generic_wikidata_item_cache_path = PathUtil.generic_wikidata_item_cache()
    wikidata_fusion_temp_result_dir = PathUtil.wikidata_fusion_temp_result_dir(pro_name)
    graph_data = builder.build_v3_graph_from_cache_simple(
        pro_name=pro_name,
        input_graph_data_path=input_graph_data_path,
        word2vec_model_path=word2vec_model_path,
        output_graph_data_path=output_graph_data_path,
        generic_title_search_cache_path=None,
        generic_wikidata_item_cache_path=generic_wikidata_item_cache_path,
        fusion_temp_result_dir=wikidata_fusion_temp_result_dir,
    )
    graph_data.print_graph_info()
def train_name_searcher(pro_name, version):
    # train a name searcher over the graph so nodes can be looked up by name
    print("train graph name searcher for %s at version %s" % (pro_name, version))
    name_searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    searcher = KGNameSearcher.train_from_graph_data_file(graph_data_path=graph_data_path,
                                                         node_info_factory=ProjectKGNodeInfoFactory())
    searcher.save(name_searcher_path)
    print("finished, saved to %s" % name_searcher_path)
def build_v2_graph_for_pro(pro_name):
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    graph_data: GraphData = GraphData.load(graph_data_path)
    new_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    res = ExtractResultImport(graph_data, new_graph_data_path, 2)

    data_dir = Path(OUTPUT_DIR) / "graph" / "jdk8" / "filter_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    filter_sentence_path = str(data_dir / "filter_sentence.txt")

    # strip HTML tags from the short descriptions before sentence splitting
    pat = re.compile('<[^>]+>', re.S)
    print("start to add sentences...")
    for node_id in graph_data.get_node_ids():
        node_info = graph_data.get_node_info_dict(node_id)
        short_description = node_info["properties"].get("short_description", "")
        if not short_description:
            continue
        short_description = pat.sub('', short_description)
        short_descs = sent_tokenize(short_description)
        for short_desc in short_descs:
            short_desc = " ".join(short_desc.split())
            # `classifier` is a module-level sentence classifier; sentences
            # predicted as label "0" are logged to filter_sentence.txt and skipped,
            # all other labels are attached to the node as sentence relations
            str_rm_sign = classifier.preprocessor.remove_sign(short_desc)
            text = classifier.preprocessor.remove_stop_words(str_rm_sign)
            label = list(classifier.predict(text))[0]
            if label == "0":
                print(short_desc)
                with open(filter_sentence_path, "a", encoding='utf-8') as f:
                    f.write(short_desc)
                    f.write("\n")
                continue
            else:
                res.add_sentence_relation(short_desc, node_id, int(label))
    res.save_new_graph_data()
def train_model(pro_name, version, weight):
    document_collection_path = PathUtil.doc(pro_name, version)
    collection = MultiFieldDocumentCollection.load(str(document_collection_path))
    processor = CodeDocPreprocessor()
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(processor, collection)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version, weight=weight)
    embedding_size = 100
    kg_name_searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")
    model = AVGNode2VectorModel.train(model_dir_path=model_dir_path,
                                      doc_collection=doc_collection,
                                      embedding_size=embedding_size,
                                      pretrain_node2vec_path=pretrain_node2vec_path,
                                      graph_data_path=graph_data_path,
                                      kg_name_searcher_path=kg_name_searcher_path,
                                      )
    return model_dir_path
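train_model writes the trained model to disk and returns the model directory path rather than the model object, so a caller that needs the model in memory reloads it from that path. A minimal sketch, assuming the same jdk8/v3 naming used elsewhere in this section; the weight value is a placeholder, not taken from these snippets:

from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel

# reload the model that train_model just wrote to disk;
# "unweighted" is a hypothetical weight value used only for illustration
model_dir_path = train_model("jdk8", "v3", weight="unweighted")
model = AVGNode2VectorModel.load(model_dir_path)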
def __init__(self, pro_name, version, model_dir):
    # load the project graph and the search model for the given version
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    self.graph_data: GraphData = GraphData.load(graph_data_path)
    self.model = self.create_search_model(pro_name, version, model_dir)
    print("init finished")
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel

from util.path_util import PathUtil

if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3"
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v")
    model = AVGNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    graph_data: GraphData = GraphData.load(graph_data_path)

    # candidate document sets used to restrict the search to one node type
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label("class type")
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")

    while True:
        query = input("please input query: ")
        select = int(input("1. class; 2. method; 3. sentence: "))
        top_num = int(input("please input top num: "))
        result = []
        if select == 1:
            result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids)
        elif select == 2:
            result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_method_ids)
        elif select == 3:
            result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_sentence_ids)
        # print the retrieved entries
        for item in result:
            print(item)
def build_doc(pro_name, version):
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    document_collection_path = PathUtil.doc(pro_name=pro_name, version=version)
    builder = CodeGraphBuilder()
    builder.build_doc(graph_data_path=graph_data_path,
                      output_doc_collection_path=document_collection_path)
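Taken together, these helpers suggest a staged pipeline: the graph is grown through successive versions, a name searcher is trained on the finished graph, the document collection is exported, and the retrieval model is trained last. The driver below is only a sketch of that ordering, not code from the project; the step that produces the initial v3 graph from v2_1 is not shown in this section, and the weight value is a placeholder.

if __name__ == '__main__':
    pro_name = "jdk8"

    # graph construction: v1 -> v2 (classified sentences), v2 -> v2_1 (domain concepts),
    # then Wikidata fusion on v3 (the v2_1 -> v3 step is not shown in this section)
    build_v2_graph_for_pro(pro_name)
    build_v2_1_graph_for_pro(pro_name)
    build_v3_graph_for_pro(pro_name)

    # downstream artifacts built on the finished graph
    train_name_searcher(pro_name, version="v3")
    build_doc(pro_name, version="v3")
    model_dir = train_model(pro_name, version="v3", weight="unweighted")  # placeholder weight
    print("model saved to %s" % model_dir)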
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.n2v.svm.filter_semantic_tfidf_n2v import FilterSemanticTFIDFNode2VectorModel

from util.path_util import PathUtil

if __name__ == '__main__':
    model_dir_path = PathUtil.sim_model(pro_name="jdk8", version="v3", model_type="svm")
    model = FilterSemanticTFIDFNode2VectorModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name="jdk8", version="v3")
    graph_data: GraphData = GraphData.load(graph_data_path)

    # candidate document sets used to restrict the search to one node type
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label("class type")
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")

    while True:
        query = input("please input query: ")
        select = int(input("1. class; 2. method; 3. sentence: "))
        top_num = int(input("please input top num: "))
        result = []
        if select == 1:
            result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids)
        elif select == 2:
            result = model.search(query=query, top_num=top_num,
                                  valid_doc_id_set=valid_method_ids)