def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a CompoundSearchModel that fuses two pre-trained sub-models.

    Each ``*_model_config`` is indexable as ``(model_type, param1, param2)``;
    the meaning of ``param1``/``param2`` is defined by
    ``CompoundSearchModel.train`` (weights/options for the sub-model).
    The first config is registered with flag ``False``, the second with
    ``True`` (presumably marking the "extra" model — TODO confirm against
    ``CompoundSearchModel``).

    Returns the directory path the compound model was written to.
    """
    doc_path = PathUtil.doc(pro_name, version)
    raw_collection = MultiFieldDocumentCollection.load(str(doc_path))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    def _sub_entry(config, flag):
        # Resolve where this sub-model was saved and carry its two params through.
        sub_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                            model_type=config[0])
        return sub_model_path, config[1], config[2], flag

    sub_search_model_config = [
        _sub_entry(first_model_config, False),
        _sub_entry(second_model_config, True),
    ]

    compound_model_name = f"compound_{first_model_config[0]}+{second_model_config[0]}"
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type=compound_model_name)
    CompoundSearchModel.train(model_dir_path=model_dir_path,
                              doc_collection=doc_collection,
                              sub_search_model_config=sub_search_model_config)
    return model_dir_path
def create_search_model(pro_name, version, model_dir):
    """Load a CompoundSearchModel after relocating its sub-model paths.

    The pickled ``submodel.config`` stores the sub-model directories recorded
    at training time; those paths may be stale (e.g. the model was trained on
    another machine).  Rewrite entry 0 to the local "avg_w2v" model dir and
    entry 1 to the local "svm" model dir, persist the patched config back to
    disk, then load the compound model.

    :param pro_name: project name used to resolve local model paths
    :param version: project version used to resolve local model paths
    :param model_dir: pathlib.Path of the compound model directory
        (must support the ``/`` operator)
    :return: the loaded CompoundSearchModel
    """
    sub_search_model_config_path = model_dir / "submodel.config"
    # NOTE(review): pickle must only be used on trusted files — this config is
    # produced by our own training step, so that holds here.
    with open(sub_search_model_config_path, 'rb') as config_file:
        sub_search_model_config = pickle.load(config_file)

    model_1 = PathUtil.sim_model(pro_name, version, "avg_w2v")
    model_2 = PathUtil.sim_model(pro_name, version, "svm")
    # Keep every field except the path (index 0) of each sub-model entry.
    new_sub_search_model_config = [
        (model_1, sub_search_model_config[0][1],
         sub_search_model_config[0][2], sub_search_model_config[0][3]),
        (model_2, sub_search_model_config[1][1],
         sub_search_model_config[1][2], sub_search_model_config[1][3]),
    ]
    with open(sub_search_model_config_path, 'wb') as config_file:
        pickle.dump(new_sub_search_model_config, config_file)

    return CompoundSearchModel.load(model_dir)
def __init__(self, pro_name, version):
    """Load the "svm" search model plus every resource it depends on.

    :param pro_name: project name used to resolve all on-disk paths
    :param version: project version used to resolve all on-disk paths
    """
    # Directory of the trained "svm" similarity model, and the model object
    # bound to it.
    self.model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                             model_type="svm")
    self.model = FilterSemanticTFIDFNode2VectorModel(
        name="svm", model_dir_path=self.model_dir_path)

    # Document collection, preprocessed for retrieval.
    self.document_collection_path = PathUtil.doc(pro_name, version)
    self.collection = MultiFieldDocumentCollection.load(
        str(self.document_collection_path))
    self.processor = Preprocessor()
    self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        self.processor, self.collection)

    # Auxiliary resources: unweighted node2vec embeddings, the KG name
    # searcher, and the avg_w2v doc-similarity model.
    self.pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name,
                                                    version=version,
                                                    weight="unweight")
    self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
    self.doc_sim_model_path = PathUtil.sim_model(pro_name, version, "avg_w2v")
def train_model(pro_name, version):
    """Train a BM25 ranking model over the project's document collection.

    :param pro_name: project name used to resolve the doc collection / model dir
    :param version: project version
    :return: the directory path the BM25 model was written to
    """
    collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), collection)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="bm25")
    BM25Model.train(model_dir_path, doc_collection=doc_collection)
    return model_dir_path
def train_avg_w2v_model(pro_name, version):
    """Train the averaged-word2vec ("avg_w2v") similarity model.

    Preprocesses the document collection with CodeDocPreprocessor, caches the
    preprocessed collection to the "code-pre" location for reuse, then trains
    AVGW2VFLModel on it.

    :return: the directory path the avg_w2v model was written to
    """
    collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    pre_doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), collection)
    # Persist the preprocessed docs so later pipeline steps can load them
    # instead of re-running preprocessing.
    pre_doc_collection.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                             model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=word2vec_model_path,
                        doc_collection=pre_doc_collection)
    return word2vec_model_path
def build_extra_model_and_doc(pro_name, version_list):
    """For each version: build docs, preprocess them, train the name searcher
    and the avg_w2v similarity model.

    :param pro_name: project name
    :param version_list: iterable of version identifiers to process
    """
    pre_way = "code-pre"
    for version in version_list:
        build_doc(pro_name=pro_name, version=version)
        # A fresh preprocessor per version, mirroring the per-version build.
        for preprocessor in (CodeDocPreprocessor(),):
            build_pre_doc(pro_name=pro_name, version=version,
                          preprocessor=preprocessor)
        train_name_searcher(pro_name=pro_name, version=version)

        pre_doc_collection_path = PathUtil.pre_doc(pro_name=pro_name,
                                                   version=version,
                                                   pre_way=pre_way)
        preprocess_doc_collection: PreprocessMultiFieldDocumentCollection = \
            PreprocessMultiFieldDocumentCollection.load(pre_doc_collection_path)
        word2vec_model_path = PathUtil.sim_model(pro_name=pro_name,
                                                 version=version,
                                                 model_type="avg_w2v")
        AVGW2VFLModel.train(model_dir_path=word2vec_model_path,
                            doc_collection=preprocess_doc_collection)
def build_v3_graph_for_pro(pro_name):
    """Build the v3 graph for a project from cached wikidata fusion results.

    The graph is updated in place: the same "v3" graph-data path is used as
    both input and output of the build.  Prints graph stats when done.
    """
    version = "v3"
    builder = CodeGraphBuilder()
    # Input and output are the same location — the build rewrites the graph.
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    word2vec_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                             model_type="avg_w2v")
    graph_data = builder.build_v3_graph_from_cache_simple(
        pro_name=pro_name,
        input_graph_data_path=graph_data_path,
        word2vec_model_path=word2vec_model_path,
        output_graph_data_path=graph_data_path,
        generic_title_search_cache_path=None,
        generic_wikidata_item_cache_path=PathUtil.generic_wikidata_item_cache(),
        fusion_temp_result_dir=PathUtil.wikidata_fusion_temp_result_dir(pro_name),
    )
    graph_data.print_graph_info()
def train_model(pro_name, version, weight):
    """Train the averaged-node2vec ("avg_n2v") similarity model.

    :param pro_name: project name
    :param version: project version
    :param weight: node2vec weighting scheme used to pick the pretrained
        embedding file (passed through to PathUtil.node2vec)
    :return: the directory path the avg_n2v model was written to
    """
    collection = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), collection)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="avg_n2v")
    AVGNode2VectorModel.train(
        model_dir_path=model_dir_path,
        doc_collection=doc_collection,
        embedding_size=100,
        pretrain_node2vec_path=PathUtil.node2vec(pro_name=pro_name,
                                                 version=version,
                                                 weight=weight),
        graph_data_path=PathUtil.graph_data(pro_name=pro_name, version=version),
        kg_name_searcher_path=PathUtil.name_searcher(pro_name=pro_name,
                                                     version=version),
    )
    return model_dir_path
from script.summary.generate_summary import Summary
from util.path_util import PathUtil

if __name__ == '__main__':
    # Interactive demo: query the compound (avg_w2v + svm) model for jdk8
    # and print per-class summaries.
    pro_name = "jdk8"
    version = "v3_1"
    compound_model_name = "compound_avg_w2v+svm"
    model_dir = PathUtil.sim_model(pro_name=pro_name, version=version,
                                   model_type=compound_model_name)
    summary = Summary(pro_name, version, model_dir)
    while True:
        query = input("please input query:")
        class_name = input("please input qualified class name")
        for index, item in summary.get_summary(query, class_name).items():
            print(index, item)
from sekg.graph.exporter.graph_data import GraphData from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel from util.path_util import PathUtil if __name__ == '__main__': pro_name = "jdk8" version = "v3" model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v") model = AVGNode2VectorModel.load(model_dir_path) graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version) graph_data: GraphData = GraphData.load(graph_data_path) valid_class_ids = graph_data.get_node_ids_by_label("class") valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label( "class type") valid_method_ids = graph_data.get_node_ids_by_label("method") valid_method_ids.update( graph_data.get_node_ids_by_label("base override method")) valid_sentence_ids = graph_data.get_node_ids_by_label("sentence") while True: query = input("please input query: ") select = int(input("1、class; 2、methos; 3、sentence")) top_num = int(input("please input top num")) result = [] if select == 1: result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids) elif select == 2:
from sekg.graph.exporter.graph_data import GraphData from sekg.ir.models.n2v.svm.filter_semantic_tfidf_n2v import FilterSemanticTFIDFNode2VectorModel from util.path_util import PathUtil if __name__ == '__main__': model_dir_path = PathUtil.sim_model(pro_name="jdk8", version="v3", model_type="svm") model = FilterSemanticTFIDFNode2VectorModel.load(model_dir_path) graph_data_path = PathUtil.graph_data(pro_name="jdk8", version="v3") graph_data: GraphData = GraphData.load(graph_data_path) valid_class_ids = graph_data.get_node_ids_by_label("class") valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label( "class type") valid_method_ids = graph_data.get_node_ids_by_label("method") valid_method_ids.update( graph_data.get_node_ids_by_label("base override method")) valid_sentence_ids = graph_data.get_node_ids_by_label("sentence") while True: query = input("please input query: ") select = int(input("1、class; 2、methos; 3、sentence")) top_num = int(input("please input top num")) result = [] if select == 1: result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids) elif select == 2: result = model.search(query=query, top_num=top_num,
from sekg.graph.exporter.graph_data import GraphData from sekg.ir.models.avg_w2v import AVGW2VFLModel from sekg.ir.models.bm25 import BM25Model from util.path_util import PathUtil if __name__ == '__main__': pro_name = "jdk8" version = "v3_1" model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="bm25") model = BM25Model.load(model_dir_path) graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version) graph_data: GraphData = GraphData.load(graph_data_path) valid_class_ids = graph_data.get_node_ids_by_label("class") valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label( "class type") valid_method_ids = graph_data.get_node_ids_by_label("method") valid_method_ids.update( graph_data.get_node_ids_by_label("base override method")) valid_sentence_ids = graph_data.get_node_ids_by_label("sentence") while True: query = input("please input query: ") select = int(input("1、class; 2、methos; 3、sentence")) top_num = int(input("please input top num")) result = [] if select == 1: result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids)