def __init__(self, filter_score=DEFAULT_FILTER_CONTEXT_SCORE, proxy_server=DEFAULT_PROXY_SERVER):
    """Set up the fusion pipeline: NLP helpers, wiki fetcher, graph, and empty caches."""
    # NLP / lemmatization helpers
    self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline()
    # Remote-data access (the searcher goes through the given proxy)
    self.fetcher = AsyncWikiSearcher(proxy_server)
    self.wikidata_property_table = WikiDataPropertyTable.get_instance()
    # Graph plus in-memory caches, all start empty
    self.graph_data = GraphData()
    self.wikipedia_cache = {}
    self.embedding = {}
    self.all_domain_vector = {}
    self.filter_score = filter_score
def __init__(self, graph_data):
    """Accept a GraphData instance, a Path, or a path string; anything else leaves the graph unset."""
    if isinstance(graph_data, GraphData):
        resolved = graph_data
    elif isinstance(graph_data, (Path, str)):
        resolved = GraphData.load(str(graph_data))
    else:
        resolved = None
    self.graph_data = resolved
    self.graph_data_reader = GraphDataReader(graph_data=resolved,
                                             node_info_factory=ProjectKGNodeInfoFactory())
    self.doc_collection = MultiFieldDocumentCollection()
def load_graph_data(is_jdk=True, version="v1"):
    """Load the JDK graph (default) or the Android graph for the given version."""
    path = PathUtil.jdk_graph_data(version) if is_jdk else PathUtil.android_graph_data(version)
    return GraphData.load(path)
def kg_impoter(path):
    """Import a saved GraphData file into the "SOSampleCodeKG" Neo4j server.

    NOTE(review): the misspelled name ("impoter") is kept because external
    callers may reference it.
    """
    neo4j_client = GRAPH_FACTORY.create_py2neo_graph_by_server_name(server_name="SOSampleCodeKG")
    importer = Neo4jImporter(GraphAccessor(neo4j_client))
    importer.import_all_graph_data(GraphData.load(path))
def __init__(self, doc_collection):
    """Bind a document collection to the jabref v3.7 graph and reset entity bookkeeping."""
    self.graph_data = GraphData.load(PathUtil.graph_data(pro_name="jabref", version="v3.7"))
    self.doc_collection = doc_collection
    # Accumulators for extracted entities and their scores
    self.entity_words = set()
    self.entity_2_score = {}
    self.counter = 0
    self.entity_path = str(Path(OUTPUT_DIR) / "entity.json")
def __init__(self, graph_data_path, dc_file_location, concepts_path, relations_path):
    """Load the graph, the multi-field document collection, and the concept/relation JSON lists."""
    self.graph: GraphData = GraphData.load(graph_data_path)
    self.doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(dc_file_location)
    with open(concepts_path) as concepts_file:
        self.concepts_list = json.load(concepts_file)
    with open(relations_path) as relations_file:
        self.relations_list = json.load(relations_file)
    # Filled in later when concepts are matched to graph nodes
    self.concept_2_node_id = {}
def __init__(self, doc_collection, graph_data_path=None):
    """Bind a document collection to a graph.

    :param doc_collection: the document collection to use.
    :param graph_data_path: a GraphData instance, a path to a saved graph,
        or None to load the default jabref v3.10 graph.
    """
    # FIX: the default used to be `graph_data_path=PathUtil.graph_data(...)`,
    # which Python evaluates once at function-definition (import) time.
    # Resolve it lazily instead so importing the module has no side effects.
    if graph_data_path is None:
        graph_data_path = PathUtil.graph_data(pro_name="jabref", version="v3.10")
    if isinstance(graph_data_path, GraphData):
        self.graph_data: GraphData = graph_data_path
    else:
        self.graph_data: GraphData = GraphData.load(graph_data_path)
    self.doc_collection = doc_collection
    self.functionClassifier = FastTextClassifier()
    # Undirected view of the graph for networkx-based traversal
    self.G = nx.Graph(self.graph_data.graph)
def train_weight_graph_data(graph_data_output_dir, node2vec_output_dir, pro_name, version):
    """Train a weighted node2vec embedding for one project/version graph."""
    prefix = "{pro}.{version}".format(pro=pro_name, version=version)
    random_walk_path = str(node2vec_output_dir / (prefix + ".weight.rwp"))
    # Load the saved graph and walk it with edge weights enabled
    trainer = GraphNode2VecTrainer(GraphData.load(str(graph_data_output_dir / (prefix + ".graph"))))
    trainer.init_weight_graph(weight=True)
    trainer.generate_random_path(rw_path_store_path=random_walk_path)
    model_path = str(node2vec_output_dir / (prefix + ".weight.node2vec"))
    GraphNode2VecTrainer.train(rw_path_store_path=random_walk_path,
                               model_path=model_path,
                               dimensions=100)
def train_node2vec(pro_name, version):
    """Train an unweighted node2vec embedding for one project/version graph."""
    print("train node2vec for %s at version %s" % (pro_name, version))
    graph_dir = Path(OUTPUT_DIR) / "graph" / pro_name
    graph_dir.mkdir(exist_ok=True, parents=True)
    embedding_dir = graph_dir / "GraphEmbedding"
    embedding_dir.mkdir(exist_ok=True, parents=True)
    prefix = "{pro}.{version}".format(pro=pro_name, version=version)
    random_walk_path = str(embedding_dir / (prefix + ".unweight.rwp"))
    # Load the saved graph and walk it without edge weights
    trainer = GraphNode2VecTrainer(GraphData.load(str(graph_dir / (prefix + ".graph"))))
    trainer.init_unweight_graph()
    trainer.generate_random_path(rw_path_store_path=random_walk_path)
    model_path = str(embedding_dir / (prefix + ".unweight.node2vec"))
    GraphNode2VecTrainer.train(rw_path_store_path=random_walk_path,
                               model_path=model_path,
                               dimensions=100)
def __init__(self, input_graph_version):
    """Load the record cache and the jabref graph for the given version, and
    prepare the relation/category groupings used during expansion."""
    output_root = Path(OUTPUT_DIR)
    self.save_expand_res_path = str(output_root / "prefix_suffix_relations.pickle")
    self.api_id_2_record_text_path = str(output_root / "api_id_2_record.pickle")
    self.api_id_2_record_text = Tool.load_pickle(self.api_id_2_record_text_path)
    self.graph_data = GraphData.load(PathUtil.graph_data(pro_name="jabref",
                                                         version=input_graph_version))
    # Relation-name groups used when classifying record relations
    self.func_relation_set = {
        RelationNameConstant.has_Functionality_Relation,
        RelationNameConstant.Functionality_Compare_Relation,
        RelationNameConstant.has_Behavior_Relation,
    }
    self.concept_classification = {RelationNameConstant.Ontology_IS_A_Relation}
    self.membership = {RelationNameConstant.Ontology_Derive_Relation}
    self.characteristic = {
        RelationNameConstant.has_Feature_Relation,
        RelationNameConstant.has_Constraint_Relation,
    }
    self.category_name_2_id = {}
    # Entity categories treated as "class-like" and "method-like"
    self.type_of_class = {
        CodeEntityCategory.CATEGORY_CLASS,
        CodeEntityCategory.CATEGORY_INTERFACE,
        CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
        CodeEntityCategory.CATEGORY_ERROR_CLASS,
        CodeEntityCategory.CATEGORY_ENUM_CLASS,
        CodeEntityCategory.CATEGORY_ANNOTATION_CLASS,
    }
    self.type_of_method = {
        CodeEntityCategory.CATEGORY_METHOD,
        CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD,
        CodeEntityCategory.CATEGORY_BASE_OVERRIDE_METHOD,
    }
    self.CODE_NAME_UTIL = CodeElementNameUtil()
def build_v2_graph_for_pro(pro_name):
    """Build the v2 graph for a project by classifying the sentences of every
    node's short description and attaching the accepted ones as relations.

    Rejected sentences (label "0") are appended to a filter log.
    NOTE(review): relies on a module-level `classifier` — confirm it is
    initialized before this runs.

    :param pro_name: project name used to resolve the v1/v2 graph paths.
    """
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    graph_data: GraphData = GraphData.load(graph_data_path)
    new_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    res = ExtractResultImport(graph_data, new_graph_data_path, 2)
    data_dir = Path(OUTPUT_DIR) / "graph" / "jdk8" / "filter_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    filter_sentence_path = str(data_dir / "filter_sentence.txt")
    html_tag_pattern = re.compile('<[^>]+>', re.S)  # strips HTML tags
    print("start to add sentences...")
    # FIX: open the reject log once instead of re-opening it in append mode
    # for every single filtered sentence.
    with open(filter_sentence_path, "a", encoding='utf-8') as filter_file:
        # FIX: loop variable renamed from `id`, which shadowed the builtin.
        for node_id in graph_data.get_node_ids():
            node_info = graph_data.get_node_info_dict(node_id)
            short_description = node_info["properties"].get("short_description", "")
            if not short_description:
                continue
            short_description = html_tag_pattern.sub('', short_description)
            for short_desc in sent_tokenize(short_description):
                short_desc = " ".join(short_desc.split())  # normalize whitespace
                str_rm_sign = classifier.preprocessor.remove_sign(short_desc)
                text = classifier.preprocessor.remove_stop_words(str_rm_sign)
                label = list(classifier.predict(text))[0]
                if label == "0":
                    # Rejected by the classifier: log it and move on.
                    print(short_desc)
                    filter_file.write(short_desc)
                    filter_file.write("\n")
                    continue
                res.add_sentence_relation(short_desc, node_id, int(label))
    res.save_new_graph_data()
def init_graph_data(self, graph_data_path):
    """Load graph data from disk and rebind the code-element builder to it."""
    loaded_graph = GraphData.load(graph_data_path)
    self.graph_data = loaded_graph
    self.code_element_kg_builder = CodeElementGraphDataBuilder(loaded_graph)
""" 对方法进行分类 将方法分为: accessor, mutator, creational, constructor, undefined五类 """ from sekg.graph.exporter.graph_data import GraphData, NodeInfo from project.utils.path_util import PathUtil from nltk.corpus import wordnet as wn pro_name = "jabref" graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v3.4") graph_data: GraphData = GraphData.load(graph_data_path) accessor_key_word = ("get", "toString", "find", "search", "test", "contains", "is", "has", "show") mutator_key_word = ("set", "add", "delete", "move", "remove", "parse", "insert", "extract", "open") creational_key_word = ("copy", "construct", "create") nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} verbs = {x.name().split('.', 1)[0] for x in wn.all_synsets('v')} def get_pure_method_name_without_parameter(qualified_name=None): if qualified_name is None or qualified_name is "": raise ValueError("qualified name needed") qualified_name = qualified_name[:qualified_name.find("(")] result = qualified_name[qualified_name.rfind(".")+1:] return result # 根据一系列的key word去做最基本的划分
class JDKKGBuilder:
    """
    Build the skeleton KG from the JavaParser analysis result for the
    project source code. It will include the package, class, interface,
    method, plus value entities (parameters, return values, exception
    conditions) imported from the relational API tables.
    """

    def __init__(self):
        self.graph_data = GraphData()
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def init_graph_data(self, graph_data_path):
        """Replace the current graph with one loaded from disk and rebind the builder."""
        self.graph_data = GraphData.load(graph_data_path)
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def import_primary_type(self):
        """Add one node per Java primary type (int, boolean, ...)."""
        type_list = CodeEntityCategory.java_primary_types()
        for item in type_list:
            code_element = {
                "qualified_name": item["name"],
                "api_type": CodeEntityCategory.CATEGORY_PRIMARY_TYPE,
                "short_description": item["description"]
            }
            self.add_primary_type(item["name"], **code_element)
        print(self.graph_data)
        # FIX: removed unused local `a = self.graph_data`
        self.graph_data.print_label_count()

    def add_primary_type(self, primary_type_name, **properties):
        """Add a primary-type node and return its node id."""
        properties["qualified_name"] = primary_type_name
        cate_labels = CodeEntityCategory.to_str_list(CodeEntityCategory.CATEGORY_PRIMARY_TYPE)
        builder = NodeBuilder()
        builder = builder.add_property(**properties).add_entity_label().add_labels(
            "code_element", *cate_labels)
        node_id = self.graph_data.add_node(
            node_id=GraphData.UNASSIGNED_NODE_ID,
            node_labels=builder.get_labels(),
            node_properties=builder.get_properties(),
            primary_property_name="qualified_name")
        return node_id

    def build_aliases(self):
        """Build alias names for all code-element nodes."""
        self.code_element_kg_builder.build_aliases_for_code_element()

    def infer_extra_relation(self):
        """Derive extra structural relations from the imported entities."""
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_abstract_overloading_relation()
        # self.code_element_kg_builder.build_value_subclass_relation()
        # FIX: build_belong_to_relation() was called a second time here;
        # the duplicate call has been removed.
        self.code_element_kg_builder.build_override_relation()

    def save(self, graph_data_path):
        """Persist the graph to disk."""
        self.graph_data.save(graph_data_path)

    def import_normal_entity(self, api_entity_json):
        """Import a class/interface/method/package-style entity; return its node id."""
        format_qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        if not format_qualified_name:
            # FIX: was a bare `return` (None). None is not equal to
            # GraphData.UNASSIGNED_NODE_ID, so callers' failure check was
            # bypassed and None leaked into api_id_to_node_id_map.
            return GraphData.UNASSIGNED_NODE_ID
        api_entity_json.pop("qualified_name")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            format_qualified_name, api_entity_json["api_type"], **api_entity_json)
        return node_id

    def import_parameter_entity(self, api_entity_json):
        """Import a "<type> <name>" parameter entity; return its node id."""
        extra_properties = {}
        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]
        value_type = qualified_name.split(" ")[0].strip()
        value_name = qualified_name.split(" ")[1].strip()
        ## todo: add all class node first, in case adding the parameter node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name=value_name,
            short_description=short_description,
            entity_category=CodeEntityCategory.CATEGORY_PARAMETER,
            **extra_properties)
        if node_id == GraphData.UNASSIGNED_NODE_ID:
            print("fail to add parameter node %r" % (api_entity_json))
        return node_id

    def import_return_value_entity(self, api_entity_json):
        """Import a return-value entity ("<type> ..."); return its node id."""
        extra_properties = {}
        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]
        value_type = qualified_name.split(" ")[0].strip()
        ## todo: add all class node first, in case adding the parameter node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name="<R>",
            short_description=short_description,
            entity_category=CodeEntityCategory.CATEGORY_RETURN_VALUE,
            **extra_properties)
        if node_id == GraphData.UNASSIGNED_NODE_ID:
            # FIX: message previously said "parameter node" (copy-paste).
            print("fail to add return value node %r" % (api_entity_json))
        return node_id

    def import_exception_condition_entity(self, api_entity_json):
        """Import an exception-condition entity; return its node id."""
        extra_properties = {}
        qualified_name = api_entity_json["qualified_name"]
        short_description = api_entity_json["short_description"]
        value_type = qualified_name.split(" ")[0].strip()
        ## todo: add all class node first, in case adding the parameter node without type info
        node_id = self.code_element_kg_builder.add_base_value_entity_node(
            value_type=value_type,
            value_name="<E>",
            short_description=short_description,
            # FIX: was CATEGORY_RETURN_VALUE (copy-paste from
            # import_return_value_entity), which mislabelled every
            # exception-condition node in the graph.
            entity_category=CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION,
            **extra_properties)
        if node_id == GraphData.UNASSIGNED_NODE_ID:
            # FIX: message previously said "parameter node" (copy-paste).
            print("fail to add exception condition node %r" % (api_entity_json))
        return node_id

    def import_construct_method_entity(self, api_entity_json):
        """Import a constructor entity, converting its name to JavaParser style."""
        format_qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        method_name = self.code_element_kg_builder.parse_construct_to_javaparser_style(
            format_qualified_name)
        if not method_name:
            return GraphData.UNASSIGNED_NODE_ID
        api_entity_json.pop("qualified_name")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            method_name, api_entity_json["api_type"], **api_entity_json)
        return node_id

    def import_qualified_field_entity(self, api_entity_json):
        """Import a class-field entity; return its node id."""
        qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        if not qualified_name:
            print("import_qualified_field_entity %r %r" %
                  (api_entity_json["qualified_name"], api_entity_json))
            return GraphData.UNASSIGNED_NODE_ID
        api_entity_json.pop("qualified_name")
        api_entity_json.pop("api_type")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            qualified_name, CodeEntityCategory.CATEGORY_FIELD_OF_CLASS, **api_entity_json)
        return node_id

    def import_qualified_enum_constants_entity(self, api_entity_json):
        """Import an enum-constant entity; return its node id."""
        qualified_name = self.code_element_kg_builder.format_qualified_name(
            api_entity_json["qualified_name"])
        if not qualified_name:
            # FIX: message previously said "import_qualified_field_entity" (copy-paste).
            print("import_qualified_enum_constants_entity %r %r" %
                  (api_entity_json["qualified_name"], api_entity_json))
            return GraphData.UNASSIGNED_NODE_ID
        api_entity_json.pop("qualified_name")
        api_entity_json.pop("api_type")
        node_id = self.code_element_kg_builder.add_normal_code_element_entity(
            qualified_name, CodeEntityCategory.CATEGORY_ENUM_CONSTANTS, **api_entity_json)
        return node_id

    def _import_entity_by_type(self, api_entity_json, api_type, qualified_name):
        """Dispatch one API-table row to the importer for its entity category.

        Returns GraphData.UNASSIGNED_NODE_ID for unknown categories or on failure.
        (Extracted: this dispatch was duplicated verbatim in both
        import_jdk_from_api_table and import_android_from_api_table.)
        """
        normal_entity_types = {
            CodeEntityCategory.CATEGORY_CLASS,
            CodeEntityCategory.CATEGORY_PACKAGE,
            CodeEntityCategory.CATEGORY_METHOD,
            CodeEntityCategory.CATEGORY_INTERFACE,
            CodeEntityCategory.CATEGORY_EXCEPTION_CLASS,
            CodeEntityCategory.CATEGORY_ENUM_CLASS,
            CodeEntityCategory.CATEGORY_ERROR_CLASS,
            CodeEntityCategory.CATEGORY_ANNOTATION_CLASS,
        }
        if api_type in normal_entity_types:
            return self.import_normal_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_CONSTRUCT_METHOD:
            return self.import_construct_method_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_FIELD_OF_CLASS:
            return self.import_qualified_field_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_ENUM_CONSTANTS:
            return self.import_qualified_enum_constants_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_PRIMARY_TYPE:
            return self.add_primary_type(primary_type_name=qualified_name, **api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_PARAMETER:
            return self.import_parameter_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_RETURN_VALUE:
            return self.import_return_value_entity(api_entity_json)
        if api_type == CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION:
            return self.import_exception_condition_entity(api_entity_json)
        return GraphData.UNASSIGNED_NODE_ID

    def import_jdk_from_api_table(self, session):
        """Import every JDK API entity row from the relational API table.

        :param session: SQLAlchemy session for the API database.
        :return: a map from API-table id to graph node id.
        """
        print("start import_jdk_from_api_table ")
        # api_entity_list = session.query(APIEntity).filter(APIEntity.id > 85000).limit(1000).all()
        api_entity_list = session.query(APIEntity).all()
        api_id_to_node_id_map = {}
        for entity_info_row in api_entity_list:
            api_entity_json = dict(entity_info_row.__dict__)
            api_entity_json.pop('_sa_instance_state', None)  # drop SQLAlchemy bookkeeping
            api_id = api_entity_json["id"]
            qualified_name = api_entity_json["qualified_name"]
            api_type = api_entity_json["api_type"]
            if not self.is_jdk_api(qualified_name):
                # Android entities are skipped silently; anything else is reported.
                if self.is_android_support(qualified_name):
                    continue
                if self.is_android_core_api(qualified_name):
                    continue
                print("Not jdk %d %s %r " %
                      (api_id, qualified_name, CodeEntityCategory.to_str(api_type)))
                continue
            node_id = self._import_entity_by_type(api_entity_json, api_type, qualified_name)
            if node_id == GraphData.UNASSIGNED_NODE_ID:
                print("Adding fail %d %s %r " %
                      (api_id, qualified_name, CodeEntityCategory.to_str(api_type)))
                continue
            api_id_to_node_id_map[api_id] = node_id
        self.graph_data.print_graph_info()
        print("end import_jdk_from_api_table ")
        return api_id_to_node_id_map

    def import_android_from_api_table(self, session):
        """Import every Android-core / JDK API entity row from the relational API table.

        :param session: SQLAlchemy session for the API database.
        :return: a map from API-table id to graph node id.
        """
        print("start import android api from jdk table")
        # api_entity_list = session.query(APIEntity).filter(APIEntity.id > 85000).limit(1000).all()
        api_entity_list = session.query(APIEntity).all()
        api_id_to_node_id_map = {}
        for entity_info_row in api_entity_list:
            api_entity_json = dict(entity_info_row.__dict__)
            api_entity_json.pop('_sa_instance_state', None)  # drop SQLAlchemy bookkeeping
            api_id = api_entity_json["id"]
            qualified_name = api_entity_json["qualified_name"]
            api_type = api_entity_json["api_type"]
            if self.is_android_support(qualified_name):
                continue  # support/androidx libraries are out of scope
            if not self.is_jdk_api(qualified_name) and not self.is_android_core_api(qualified_name):
                print("Not android or JDK API %d %s %r " %
                      (api_id, qualified_name, CodeEntityCategory.to_str(api_type)))
                continue
            node_id = self._import_entity_by_type(api_entity_json, api_type, qualified_name)
            if node_id == GraphData.UNASSIGNED_NODE_ID:
                print("Adding fail %d %s %r " %
                      (api_id, qualified_name, CodeEntityCategory.to_str(api_type)))
                continue
            api_id_to_node_id_map[api_id] = node_id
        self.graph_data.print_graph_info()
        # FIX: end-of-run message previously said "end import_jdk_from_api_table".
        print("end import_android_from_api_table ")
        return api_id_to_node_id_map

    def import_relation_from_jdk_table(self, session, api_id_to_node_id_map):
        """Import typed relations between already-imported entities.

        Relations whose endpoints were not imported are reported and skipped.
        """
        print("start import jdk relation")
        self.graph_data.print_graph_info()
        valid_api_types = CodeEntityRelationCategory.relation_set()
        for relation_type in valid_api_types:
            relation_str = CodeEntityRelationCategory.to_str(relation_type)
            print("start import relation %s" % (relation_str))
            api_relation_list = session.query(APIRelation).filter(
                APIRelation.relation_type == relation_type).all()
            for relation in api_relation_list:
                if relation.start_api_id not in api_id_to_node_id_map:
                    print("start_id %d can't found its node id" % (relation.start_api_id))
                    continue
                if relation.end_api_id not in api_id_to_node_id_map:
                    print("end_id %d can't found its node id" % (relation.end_api_id))
                    continue
                self.graph_data.add_relation(
                    startId=api_id_to_node_id_map[relation.start_api_id],
                    endId=api_id_to_node_id_map[relation.end_api_id],
                    relationType=relation_str)
        print("end import jdk relation")
        self.graph_data.print_graph_info()

    def is_jdk_api(self, qualified_name):
        """True if the qualified name belongs to the JDK (or is a generic type parameter)."""
        if qualified_name.startswith(("java.", "javax.", "org.w3c.dom",
                                      "org.xml.sax", "org.ietf", "org.omg")):
            return True
        for primary in CodeEntityCategory.JAVA_PRIMARY_TYPE_SET:
            if qualified_name.startswith(primary):
                return True
        # the Generic type parameter, eg. T element, T[]
        if len(qualified_name.strip("[]").split(" ")[0]) == 1:
            return True
        return False

    def is_android_support(self, qualified_name):
        """True for androidx / android.support packages."""
        return qualified_name.startswith(("androidx", "android.support"))

    def is_android_core_api(self, qualified_name):
        """True for Android platform packages (support libraries excluded)."""
        if self.is_android_support(qualified_name):
            return False
        return qualified_name.startswith((
            "android", "com.android.internal.util", "dalvik.", "junit.",
            "org.xmlpull", "org.json", "org.apache"))

    def add_source_label(self, source_label):
        """Attach a provenance label to every code element."""
        self.code_element_kg_builder.add_source_label(source_label)
def __init__(self):
    """Start from an empty graph with a code-element builder bound to it."""
    empty_graph = GraphData()
    self.graph_data = empty_graph
    self.code_element_kg_builder = CodeElementGraphDataBuilder(empty_graph)
def build_doc(self, graph_data_path, output_doc_collection_path=None):
    """Load a graph and build a document collection from its nodes.

    :param graph_data_path: path to a saved GraphData file.
    :param output_doc_collection_path: optional path to persist the collection.
    :return: the built document collection.
    """
    loaded_graph = GraphData.load(str(graph_data_path))
    doc_builder = GraphNodeDocumentBuilder(graph_data=loaded_graph)
    return doc_builder.build_doc_for_kg(output_doc_collection_path)
"Description": api_sample_code["Description"] }, primary_property_name="Description") # api_node_id = graph_data.find_one_node_by_property(property_name="qualified_name", # property_value=api_sample_code["API"])["properties"][ # "id"] # code_node_id = graph_data.find_one_node_by_property(property_name="Code", property_value=api_sample_code["Code"])["_node_id"] # description_node_id = graph_data.find_one_node_by_property(property_name="Description", property_value=api_sample_code["Description"])["_node_id"] graph_data.add_relation(startId=api_node_id, relationType="has sample code", endId=code_node_id) graph_data.add_relation(startId=code_node_id, relationType="has description", endId=description_node_id) else: print(api_sample_code["Id"]) graph_data.save(output_graph_data_path) if __name__ == "__main__": # graph_data_path = str(Path(GRAPH_DATA_DIR) / 'jdk8_sample_code.v1.graph') # kg_impoter(graph_data_path) graph_data = GraphData.load( str(Path(GRAPH_DATA_DIR) / 'jdk8_sample_code.v1.graph')) # # ids = (489,) # # results = graph_data.find_one_node_by_property(property_name="id", property_value=1869) # # print(results["id"]) graph_data.print_graph_info() # create_sample_code_kg(graph_data)
class GenericKGFusion: INVALID_TEXTS = { "scientific article", "wikimedia template", "wikimedia list article", "wikipedia template", "wikibase wikis", "wikimedia", "wikibase", "wikidata" } INVALID_SUBCLASS_ITEM_ID = set([ "Q11424", # film "Q15138389", # wiki "Q7187", # gene ]) DEFAULT_FILTER_CONTEXT_SCORE = 0.8 DEFAULT_FILTER_TOPIC_SCORE = 0.9 DEFAULT_ACCEPTABLE_TOPIC_SCORE = 0.95 DEFAULT_ACCEPTABLE_CONTEXT_SCORE = 0.85 DEFAULT_PROXY_SERVER = "http://127.0.0.1:1080" def __init__(self, filter_score=DEFAULT_FILTER_CONTEXT_SCORE, proxy_server=DEFAULT_PROXY_SERVER): self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) self.wikipedia_cache = {} self.fetcher = AsyncWikiSearcher(proxy_server) self.graph_data = GraphData() self.wikidata_property_table = WikiDataPropertyTable.get_instance() self.embedding = {} self.filter_score = filter_score self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline() self.all_domain_vector = {} def init_wd_from_cache(self, title_save_path=None, item_save_path=None): self.fetcher.init_from_cache(title_save_path=title_save_path, item_save_path=item_save_path) print("Init from cache...") def init_wikipedia_contex(self, wikipedia_context_path=None): # TODO 将wikipedia的内容加到wikisearcher这个类里,就不用在GenericKGFusion中load了 if wikipedia_context_path is not None and Path( wikipedia_context_path).exists(): with open(wikipedia_context_path, "rb") as f: self.wikipedia_cache = pickle.load(f) else: print('no such wikipedia_context_path {}'.format( wikipedia_context_path)) def export_wd_cache(self, title_save_path, item_save_path): self.fetcher.save(item_save_path=item_save_path, title_save_path=title_save_path) def load_word_embedding(self, emb_path): wv = KeyedVectors.load(emb_path) self.embedding = {k: wv[k] for k in wv.vocab.keys()} def load_w2v_model(self, w2v_path): self.w2v_model = AVGW2VFLModel.load(w2v_path) def init_graph_data(self, graph_data_path): self.graph_data = GraphData.load(graph_data_path) def fetch_wikidata_by_name(self, terms, 
title_save_path=None, item_save_path=None): """ search with some terms and find the candidate wikidata item list for the term, and cache all the possible wikidata item for the item. eg. for term: "apple", we will search it in wikidata.org by API and get the returned search result list(maybe 10 result). the search result for keywords will be cached. And we we retrieve all 10 candidate wikidata item info. :param item_save_path: the wikidata item info cache path :param title_save_path: the search result by title saving path :param terms: a list of str or a set of str standing for concepts. :return: """ self.fetcher.init_from_cache(title_save_path=title_save_path, item_save_path=item_save_path) terms = {self.lemmatizer.noun(term)[0].lower() for term in terms} print( "need to fetch %r term wiki titles, %r are already cache, actual %r need to fetch" % (len(terms), len(self.fetcher.title_cache.keys() & terms), len(terms) - len(self.fetcher.title_cache.keys() & terms))) term_titles = self.fetcher.search_title(terms) if title_save_path is not None: self.fetcher.save(title_save_path=title_save_path) ids = self.get_valid_wikidata_item(term_titles) term_wikiitems = self.fetch_wikidata_by_id(ids, item_save_path) return term_titles, term_wikiitems @staticmethod def is_need_to_fetch_wikidata_item(item): INVALID_TEXTS = [ "scientific article", "wikimedia template", "wikimedia list article", "wikipedia template", "wikibase wikis", "wikimedia" ] snippet = item["snippet"].lower() for invalid_text in INVALID_TEXTS: if invalid_text in snippet: return False return True @staticmethod def get_valid_wikidata_item(term_titles): """ some search results for wikidata are not need to search, for example, the item has "scientific article" in description. 
:param term_titles: :return: """ valid_wikidata_ids = set([]) for v in term_titles.values(): for item in v: if GenericKGFusion.is_need_to_fetch_wikidata_item( item) == False: continue valid_wikidata_ids.add(item["title"]) return valid_wikidata_ids def fetch_wikidata_by_id(self, ids, item_save_path=None): print( "need to fetch wikidata items num=%r, %r are already cache, actual %r need to fetch" % (len(ids), len(self.fetcher.item_cache.keys() & ids), len(ids) - len(self.fetcher.item_cache.keys() & ids))) term_wikiitems = self.fetcher.fetch_item(ids) if item_save_path is not None: self.fetcher.save(item_save_path=item_save_path) return term_wikiitems # def compute_topic_vector(self): # topic_words = [] # for node_id in self.graph_data.get_node_ids_by_label(DomainConstant.LABEL_DOMAIN_TERM): # try: # node_json = self.graph_data.get_node_info_dict(node_id=node_id) # if not node_json: # continue # node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES] # lemma = node_properties[PropertyConstant.LEMMA] # aliases = node_properties.get(PropertyConstant.ALIAS, []) # aliases_en = node_properties.get("aliases_en", []) # description_en = node_properties.get("descriptions_en", "") # name = node_properties.get("name", "") # topic_words.append(lemma) # topic_words.extend(aliases) # topic_words.extend(aliases_en) # topic_words.append(description_en) # topic_words.append(name) # except: # traceback.print_exc() # topic_text = " ".join(topic_words).lower() # # if len(topic_text) == 0: # return None # words = [w for w in topic_text.split() if w] # if len(words) == 0: # return None # vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words) # # return vec_des # # def compute_wikidata_vector(self, wikidata_item, term_wikiitems, node_json): # relation_text = self.generate_relations_text(wikidata_item, term_wikiitems) # description = wikidata_item.get_en_description() # en_name = wikidata_item.get_en_name() # en_aliases = wikidata_item.get_en_aliases() 
    # NOTE(review): the following region is dead, commented-out code kept for
    # reference (embedding-based scoring helpers that were replaced by the
    # avg-w2v scoring in simple_fuse below). Consider deleting it outright.
    #     description = " ".join([en_name, " ".join(en_aliases), description, relation_text])
    #
    #     words = [token.lemma_.lower() for token in self.NLP(description) if
    #              token.is_digit == False and token.is_stop == False and token.is_punct == False]
    #
    #     domain_term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.LEMMA]
    #
    #     removal_words = set(domain_term_name.lower().split())
    #     words = [w for w in words if w not in removal_words]
    #
    #     if len(words) == 0:
    #         return None
    #     # todo: the size of vector should be adjust
    #     vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words)
    #
    #     return vec_des
    #
    # def __score_topic(self, topic_vector, wikidata_item, term_wikiitems, node_json):
    #     wikidata_vector = self.compute_wikidata_vector(wikidata_item, term_wikiitems, node_json)
    #     return self.compute_sim_for_two_vectors(wikidata_vector, topic_vector)
    #
    # def __score_context(self, node_json, wikidata_item, term_wikiitems):
    #     relation_text = self.generate_relations_text(wikidata_item, term_wikiitems)
    #     description = wikidata_item.get_en_description()
    #     en_name = wikidata_item.get_en_name()
    #     en_aliases = wikidata_item.get_en_aliases()
    #     description = " ".join([en_name, " ".join(en_aliases), description, relation_text])
    #     domain_term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.LEMMA]
    #     name = self.get_compare_name_for_domain_term(node_json)
    #     removal_words = set(domain_term_name.lower().split())
    #     if len(description) == 0 or len(name) == 0:
    #         return 0
    #     words = [token.lemma_.lower() for token in self.NLP(description) if
    #              token.is_digit == False and token.is_stop == False and token.is_punct == False]
    #     words = [w for w in words if w not in removal_words]
    #     if len(words) == 0:
    #         return 0
    #     vec_des = sum([self.embedding.get(w, np.zeros([100])) for w in words]) / len(words)
    #     name_words = [token.lemma_.lower() for token in self.NLP(name) if
    #                   token.is_digit == False and token.is_stop == False]
    #     if len(name_words) == 0:
    #         return 0
    #     vec_term = sum([self.embedding.get(w, np.zeros([100])) for w in name_words]) / len(name_words)
    #     return self.compute_sim_for_two_vectors(vec_des, vec_term)
    #
    # def compute_sim_for_two_vectors(self, vec_des, vec_term):
    #     norm_des = np.linalg.norm(vec_des)
    #     norm_term = np.linalg.norm(vec_term)
    #     if norm_des == 0 or norm_term == 0:
    #         return 0
    #     return 0.5 + vec_des.dot(vec_term) / (norm_des * norm_term) / 2
    #
    # def get_compare_name_for_domain_term(self, node_json):
    #     domain_term_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
    #     name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.LEMMA, "")
    #     aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, [])
    #     aliases_en = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get("aliases_en", [])
    #     other_names = [name]
    #     other_names.extend(aliases)
    #     other_names.extend(aliases_en)
    #     out_relations = self.graph_data.get_all_out_relations(node_id=domain_term_id)
    #     in_relations = self.graph_data.get_all_in_relations(node_id=domain_term_id)
    #     domain_term_node_ids = self.graph_data.label_to_ids_map[DomainConstant.LABEL_DOMAIN_TERM]
    #     id_set = set([])
    #     for (start_id, r, end_id) in out_relations:
    #         if end_id in domain_term_node_ids:
    #             id_set.add(end_id)
    #     for (start_id, r, end_id) in in_relations:
    #         if start_id in domain_term_node_ids:
    #             id_set.add(start_id)
    #     id_set.add(domain_term_id)
    #     for id in id_set:
    #         temp_node_json = self.graph_data.get_node_info_dict(node_id=id)
    #         other_names.append(temp_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.LEMMA, ""))
    #     name = " ".join(other_names)
    #     return name

    def add_wikidata_item(self, item: WikiDataItem):
        """
        Add a single WikiData node to the graph; no relations are added here.

        If a node with the same wd_item_id already exists, its node id is
        returned and nothing is created.

        :param item: the WikiDataItem to add to the GraphData
        :return: the node id of the (existing or newly added) wikidata node
        """
        ori_node_json = self.graph_data.find_one_node_by_property(
            WikiDataConstance.PRIMARY_PROPERTY_NAME, item.wd_item_id)
        if ori_node_json:
            # Node already fused into the graph — reuse it.
            # print(ori_node_json)
            # print('no new wiki node!! node %d has fused wiki_node %s' % (ori_node_json["id"], item.wd_item_id))
            return ori_node_json["id"]
        # print("add new wikinode %s" % (item.wd_item_id))
        node_labels = [WikiDataConstance.LABEL_WIKIDATA]
        node_properties = {
            WikiDataConstance.PRIMARY_PROPERTY_NAME: item.wd_item_id,
            WikiDataConstance.NAME: item.get_en_name(),
            PropertyConstant.ALIAS: set(item.get_en_aliases()),
        }
        # Called for its side effect: populates item.relation_property_name_list.
        item.get_relation_property_name_list()
        relation_property_set = set(item.relation_property_name_list)
        pure_property_set = set(item.get_non_relation_property_name_list())
        # Keep only non-relation properties; rename pure property ids
        # (e.g. "P31") to their human-readable names where possible.
        valid_property_dict = {}
        for p, v in item.data_dict.items():
            if p in relation_property_set:
                continue
            if p in pure_property_set:
                p = self.wikidata_property_table.property_id_2_name(p)
                if p == None:
                    continue
            valid_property_dict[p] = v
        wikidata_node_id = self.graph_data.add_node(
            node_labels=node_labels,
            node_properties=dict(valid_property_dict, **node_properties),
            primary_property_name=WikiDataConstance.PRIMARY_PROPERTY_NAME)
        return wikidata_node_id

    def add_all_wiki_nodes(self):
        """Add every cached wikidata item from the fetcher into the graph."""
        print("start add all wiki nodes.......")
        self.graph_data.create_index_on_property(
            WikiDataConstance.PRIMARY_PROPERTY_NAME)
        term_wikiitems = self.fetcher.item_cache
        wikiiterms_ids = term_wikiitems.keys()
        self.add_wikidata_items(wikiiterms_ids)
        self.graph_data.refresh_indexer()

    def simple_fuse(self, ):
        """
        Simple fuse of wiki data: the graph already contains all wikidata
        nodes; compute avg-w2v similarity between each domain term and the
        wikidata docs, link the top matches (score > 0.8, at most 5), and
        finally drop wikidata nodes left without any relation.

        :return: a list of record dicts describing every created link
        """
        record = []
        valid_domain_id_set = self.graph_data.get_node_ids_by_label(
            DomainConstant.LABEL_DOMAIN_TERM)
        # NOTE(review): `i` is printed below but never incremented.
        i = 0
        valid_wiki_id_set = self.graph_data.get_node_ids_by_label("wikidata")
        # Map wikidata node ids to their doc indices in the w2v doc collection.
        valid_wiki_index = np.array(
            list(
                self.w2v_model.preprocess_doc_collection.
                doc_id_set_2_doc_index_set(valid_wiki_id_set)))
        print("valid_wiki_index size: ", valid_wiki_index.size)
        doc_model = self.w2v_model.avg_w2v_model_field_map["doc"]
        for node_id in valid_domain_id_set:
            try:
                node_json = self.graph_data.get_node_info_dict(node_id=node_id)
                if not node_json:
                    continue
                node_properties = node_json[
                    GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                lemma = node_properties[PropertyConstant.LEMMA]
                alias_set = node_properties[PropertyConstant.ALIAS]
                term_name = node_properties["term_name"]
                # NOTE(review): this mutates the alias set stored on the node.
                alias_set.add(lemma)
                alias_set.add(term_name)
                text = " ".join(list(alias_set))
                domain_words = self.w2v_model.preprocessor.clean(text)
                domain_vec = self.w2v_model.get_avg_w2v_vec(domain_words)
                # similar_by_vector(..., topn=None) returns cosine scores for
                # all docs; map from [-1, 1] to [0, 1].
                score_vector = (
                    doc_model.similar_by_vector(domain_vec, topn=None) + 1) / 2
                sort_index = np.argsort(-score_vector)
                score_vector = score_vector[sort_index]
                over_thred = np.where(score_vector > 0.8)
                # NOTE(review): over_thred holds positions in the SORTED score
                # vector, while valid_wiki_index holds original doc indices —
                # intersecting the two looks inconsistent; confirm intent.
                top_wiki_valid = np.intersect1d(over_thred, valid_wiki_index)
                if top_wiki_valid.size:
                    print("number {}:{} ,Done!".format(i, node_id))
                    # score_vector = score_vector[top_wiki_valid]
                    sorted_index_scores = np.array(
                        (top_wiki_valid, score_vector[top_wiki_valid])).T
                    retrieval_results = []
                    rank = 0
                    # Keep at most the top-5 resolvable documents.
                    for (doc_index, score) in sorted_index_scores:
                        entity_document = self.w2v_model.doc_index2doc(doc_index)
                        if rank >= 5:
                            break
                        if entity_document is None:
                            continue
                        wiki_id = entity_document.get_document_id()
                        rank += 1
                        retrieval_results.append((wiki_id, score))
                    for wiki_id, score in retrieval_results:
                        wiki_node_json = self.graph_data.get_node_info_dict(
                            wiki_id)
                        record.append({
                            "name":
                            wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                            ["wikidata_name"],
                            "alias":
                            wiki_node_json[
                                GraphData.DEFAULT_KEY_NODE_PROPERTIES]["alias_en"],
                            "description":
                            wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                            ["description_en"],
                            "domain term":
                            node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                            ["qualified_name"],
                            "score": score,
                            "link": True,
                            "domain_id": node_id,
                            "wd_item_id":
                            wiki_node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                            ["wd_item_id"]
                        })
                        self.graph_data.add_relation(startId=node_id,
                                                     endId=wiki_id,
                                                     relationType="related to")
            except Exception:
                # Best-effort per node: log the failure and keep fusing.
                traceback.print_exc()
        self.delete_isolated_nodes_by_label(WikiDataConstance.LABEL_WIKIDATA)
        self.graph_data.refresh_indexer()
        return record

    @staticmethod
    def get_wikidata_item_ids_by_relation(wikidata_item: WikiDataItem, r):
        """Return the set of target item ids for relation `r` (value may be scalar or list)."""
        id_set = set([])
        end = wikidata_item.data_dict.get(r, [])
        if type(end) == list:
            for e in end:
                id_set.add(e)
        else:
            id_set.add(end)
        return id_set

    def generate_relations_text(self, wikidata_item, term_wikiitems):
        """
        Build a bag-of-words style text from the item's relations:
        neighbour names/descriptions plus the relation names.
        """
        text = []
        for r in wikidata_item.relation_property_name_list:
            relation_name = self.wikidata_property_table.property_id_2_name(r)
            if relation_name == None:
                relation_name = r
            end = wikidata_item.data_dict[r]
            if type(end) == list:
                for e_wd_item_id in end:
                    if self.is_valid_wikidata_item_id(e_wd_item_id):
                        neibour_item = term_wikiitems.get(e_wd_item_id, None)
                        if neibour_item != None:
                            text.append(neibour_item.get_en_name())
                            # if relation_name in {"subclass of", "instance of", "part of"}:
                            #     text.append(neibour_item.get_en_description())
                            text.append(neibour_item.get_en_description())
                    else:
                        # NOTE(review): appends the whole list `end`, not the
                        # current element — " ".join would fail on a list;
                        # probably meant e_wd_item_id. Confirm before changing.
                        text.append(end)
                    text.append(relation_name)
            else:
                if self.is_valid_wikidata_item_id(end):
                    neibour_item = term_wikiitems.get(end, None)
                    if neibour_item != None:
                        text.append(neibour_item.get_en_name())
                        # if relation_name in {"subclass of", "instance of", "part of"}:
                        #     text.append(neibour_item.get_en_description())
                        text.append(neibour_item.get_en_description())
                else:
                    text.append(end)
                text.append(relation_name)
        return " ".join(text)

    def is_valid_wikidata_item_id(self, wd_item_id):
        """True when the value looks like a wikidata entity id ("Q" + digits)."""
        try:
            if wd_item_id.startswith("Q") and wd_item_id[1:].isdigit():
                return True
            return False
        except:
            # Non-string input (no startswith) — treat as invalid.
            return False

    def get_all_neighbours_id(self, item):
        """Collect all Q*/P* ids referenced by the item's relation values."""
        neighbours = set()
        for r in item.relation_property_name_list:
            end = item.data_dict[r]
            if type(end) == list:
                for e in end:
                    if e[0] == "Q" or e[0] == "P":
                        neighbours.add(e)
            else:
                if end[0] == "Q" or end[0] == "P":
                    neighbours.add(end)
        return neighbours

    def get_all_neighbours_id_by_item_id(self, item_id):
        """Like get_all_neighbours_id, but resolves the item from the fetcher cache."""
        neighbours = set()
        item = self.fetcher.item_cache.get(item_id, None)
        if item == None:
            return set()
        neighbours = self.get_all_neighbours_id(item)
        return neighbours

    def fetch_valid_wikidata_item_neibours_from_all_term_titles(
            self, item_save_path):
        """
        some search results for wikidata are not need to search, for example,
        the item has "scientific article" in description.

        Fetch the neighbours of every valid cached item and save them.
        :param item_save_path: where fetched items are persisted
        """
        term_titles = self.fetcher.title_cache
        valid_wikidata_ids = GenericKGFusion.get_valid_wikidata_item(
            term_titles)
        nerbours = set([])
        for valid_id in valid_wikidata_ids:
            nerbours.update(self.get_all_neighbours_id_by_item_id(valid_id))
        return self.fetch_wikidata_by_id(nerbours, item_save_path)

    def add_wikidata_items(self, wd_item_ids):
        """Add the given cached items as nodes, then wire up wikidata-to-wikidata relations."""
        term_wikiitems = self.fetcher.item_cache
        self.graph_data.refresh_indexer()
        i = 0
        for wd_item_id in wd_item_ids:
            i += 1
            print(i, ": ", wd_item_id)
            self.add_wikidata_item(term_wikiitems[wd_item_id])
        self.build_relation_between_wikidata_node_in_graph(term_wikiitems)

    def build_relation_between_wikidata_node_in_graph(self, term_wikiitems):
        """Create relations between wikidata nodes that are both present in the graph."""
        wikidata_node_ids = self.graph_data.get_node_ids_by_label(
            WikiDataConstance.LABEL_WIKIDATA)
        wd_item_id_2_node_id_map = {}
        node_id_2_wd_item_id_map = {}
        for node_id in wikidata_node_ids:
            wikidata_node = self.graph_data.get_node_info_dict(node_id)
            wd_item_id = wikidata_node[GraphData.DEFAULT_KEY_NODE_PROPERTIES][
                WikiDataConstance.PRIMARY_PROPERTY_NAME]
            wd_item_id_2_node_id_map[wd_item_id] = node_id
            node_id_2_wd_item_id_map[node_id] = wd_item_id
        for start_wd_item_id, start_node_id in wd_item_id_2_node_id_map.items(
        ):
            start_wikidata_item = term_wikiitems.get(start_wd_item_id, None)
            if start_wikidata_item == None:
                continue
            for r_id in start_wikidata_item.relation_property_name_list:
                end_wd_ids = self.get_wikidata_item_ids_by_relation(
                    start_wikidata_item, r_id)
                relation_name = self.wikidata_property_table.property_id_2_name(
                    r_id)
                if relation_name == None:
                    # Unknown property id — skip rather than link with a raw id.
                    continue
                for end_wd_id in end_wd_ids:
                    end_node_id = wd_item_id_2_node_id_map.get(end_wd_id, None)
                    if end_node_id == None:
                        continue
                    if start_node_id == end_node_id:
                        # No self loops.
                        continue
                    self.graph_data.add_relation(start_node_id, relation_name,
                                                 end_node_id)

    def save(self, graph_data_path):
        """Persist the graph to disk."""
        self.graph_data.save(graph_data_path)
        print("save ", type(self.graph_data))

    def is_valid_wikidata_item(self, item):
        """
        Reject items whose english name contains any blacklist text, or whose
        "instance of" (P31) target is a blacklisted class.
        """
        for text in self.INVALID_TEXTS:
            en_name = item.get_en_name().lower()
            if text in en_name:
                return False
        end_wd_ids = self.get_wikidata_item_ids_by_relation(item, "P31")
        for end_wd in end_wd_ids:
            if end_wd in self.INVALID_SUBCLASS_ITEM_ID:
                return False
        return True

    def fetch_wikidata_by_name_and_cache_neibours(self, terms,
                                                  title_save_path,
                                                  item_save_path):
        """Fetch items by term names, then fetch and cache their neighbours."""
        self.fetch_wikidata_by_name(terms,
                                    item_save_path=item_save_path,
                                    title_save_path=title_save_path)
        self.fetch_valid_wikidata_item_neibours_from_all_term_titles(
            item_save_path=item_save_path)

    def delete_isolated_nodes_by_label(self, label):
        """Remove every node carrying `label` that has no in- or out-relations."""
        label_ids = self.graph_data.get_node_ids_by_label(label)
        remove_id = set()
        for id in label_ids:
            in_relations = self.graph_data.get_all_in_relations(id)
            out_relations = self.graph_data.get_all_out_relations(id)
            if not in_relations and not out_relations:
                remove_id.add(id)
                print("remove {}: {}".format(label, id))
        for id in remove_id:
            self.graph_data.remove_node(id)
        print("remove {} wiki nodes".format(len(remove_id)))
def init_graph_data(self, graph_data_path): self.graph_data = GraphData.load(graph_data_path)
class SkeletonKGBuilder:
    """
    Build the skeleton KG from the JavaParser analysis result for the
    Project Source Code. It will include the package, class, interface,
    method.
    """

    def __init__(self):
        # Fresh empty graph; replaced when init_graph_data is called.
        self.graph_data = GraphData()
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def init_graph_data(self, graph_data_path):
        """Load an existing graph and rebind the element builder to it."""
        self.graph_data = GraphData.load(graph_data_path)
        self.code_element_kg_builder = CodeElementGraphDataBuilder(self.graph_data)

    def import_primary_type(self):
        """Add one node per Java primary type (int, boolean, ...)."""
        type_list = CodeEntityCategory.java_primary_types()
        for item in type_list:
            code_element = {
                "qualified_name": item["name"],
                "api_type": CodeEntityCategory.CATEGORY_PRIMARY_TYPE,
                "short_description": item["description"]
            }
            cate_labels = CodeEntityCategory.to_str_list(code_element["api_type"])
            builder = NodeBuilder()
            builder = builder.add_property(**code_element).add_entity_label().add_labels("code_element", *cate_labels)
            self.graph_data.add_node(
                node_id=GraphData.UNASSIGNED_NODE_ID,
                node_labels=builder.get_labels(),
                node_properties=builder.get_properties(),
                primary_property_name="qualified_name")
        self.graph_data.print_graph_info()

    def import_normal_entity_json(self, entity_json_path):
        """
        Import code-element entities from a JSON list; entries whose qualified
        name cannot be normalized are counted as failures, duplicates skipped.
        """
        print("start import normal entity json")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete size=%d" % record_num)
        fail_num = 0
        name_mark = set([])
        for index, code_element in enumerate(code_list):
            format_qualified_name = self.code_element_kg_builder.format_qualified_name(code_element["qualified_name"])
            if not format_qualified_name:
                print("not __valid name %r" % code_element["qualified_name"])
                fail_num += 1
                continue
            code_element["qualified_name"] = format_qualified_name
            if code_element["qualified_name"] in name_mark:
                continue
            name_mark.add(code_element["qualified_name"])
            # Drop the key so it is not passed twice via **code_element below.
            code_element.pop("qualified_name")
            node_id = self.code_element_kg_builder.add_normal_code_element_entity(format_qualified_name,
                                                                                 code_element["type"],
                                                                                 **code_element)
        print("total=%d fail_num=%d success_num=%d" % (record_num, fail_num, record_num - fail_num))
        self.graph_data.print_graph_info()
        print("end import normal entity json")

    def import_normal_entity_relation_json(self, entity_relation_json_path):
        """Import code-element relations, dispatching on the relation category."""
        print("start import normal entity relations json")
        print(self.graph_data)
        self.graph_data.print_label_count()
        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            code_relation_list = json.load(f)
        record_num = len(code_relation_list)
        print("load json complete size=%d" % record_num)
        fail_num = 0
        for relation_json in code_relation_list:
            relation_type = relation_json["relation_type"]
            if relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_METHOD_IMPLEMENT_CODE_CALL_METHOD:
                success = self.code_element_kg_builder.add_method_call_relation(relation_json["start_name"],
                                                                                relation_json["end_name"])
                if success == False:
                    fail_num = fail_num + 1
                continue
            # NOTE(review): `relation_type == relation_type == ...` chains to
            # `(relation_type == relation_type) and (relation_type == ...)`,
            # i.e. it behaves like a plain equality check — the first
            # comparison is redundant. USE_CLASS relations are ignored.
            if relation_type == relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_METHOD_IMPLEMENT_CODE_USE_CLASS:
                continue
            if relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_BELONG_TO or relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_EXTENDS or relation_type == CodeEntityRelationCategory.RELATION_CATEGORY_IMPLEMENTS:
                success = self.code_element_kg_builder.add_relation_by_creating_not_exist_entity(
                    relation_json["start_name"],
                    relation_json["end_name"],
                    relation_type=relation_type
                )
                if success == False:
                    fail_num = fail_num + 1
                continue
            success = self.code_element_kg_builder.add_relation_by_not_creating_entity(relation_json["start_name"],
                                                                                      relation_json["end_name"],
                                                                                      relation_type)
            if success == False:
                fail_num = fail_num + 1
        print("fail num=%d" % fail_num)
        self.graph_data.print_graph_info()
        print("end import normal entity relations json")

    def import_field_entity(self, entity_json_path, entity_relation_json_path):
        """Import field entities and attach each field to its owning class/interface."""
        print("start import field entity json")
        print(self.graph_data)
        self.graph_data.print_label_count()
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete size=%d" % record_num)
        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
        relation_num = len(relation_list)
        print("load json complete entity relation size=%d" % relation_num)
        old_id_to_new_node_id_map = {}
        for index, code_element in enumerate(code_list):
            field_id = code_element["id"]
            field_type = code_element["field_type"]
            field_name = code_element["field_name"]
            # short_description = code_element["description"]
            short_description = ""  # the field json has not description
            new_field_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=field_type,
                                                                                       value_name=field_name,
                                                                                       short_description=short_description,
                                                                                       entity_category=CodeEntityCategory.CATEGORY_FIELD)
            old_id_to_new_node_id_map[field_id] = new_field_node_id
        for r in relation_list:
            field_node_id = old_id_to_new_node_id_map[r["field_id"]]
            class_qualified_name = self.code_element_kg_builder.format_qualified_name(r["belong_class_interface_name"])
            node_json = self.graph_data.find_one_node_by_property("qualified_name", class_qualified_name)
            if node_json is None:
                parent_node_id = self.code_element_kg_builder.add_type_node(class_qualified_name)
            else:
                parent_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
            # NOTE(review): this prints a marker when the relation ALREADY
            # exists, then adds it regardless — possibly the condition was
            # meant to be negated; confirm before changing.
            if self.graph_data.exist_relation(parent_node_id,
                                              CodeEntityRelationCategory.to_str(
                                                  CodeEntityRelationCategory.RELATION_CATEGORY_HAS_FIELD),
                                              field_node_id):
                print("------")
                print(r, field_node_id, node_json)
            self.graph_data.add_relation(parent_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_FIELD),
                                         field_node_id)
        self.graph_data.print_graph_info()
        print("end import field entity json")

    def import_parameter_entity(self, entity_json_path, entity_relation_json_path):
        """Import method parameters and attach each to its method (created if missing)."""
        print("start import parameter entity")
        self.graph_data.print_graph_info()
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete entity size=%d" % record_num)
        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
        record_num = len(relation_list)
        print("load json complete entity relation size=%d" % record_num)
        old_id_to_new_node_id_map = {}
        for index, code_element in enumerate(code_list):
            parameter_id = code_element["id"]
            parameter_type = code_element["parameter_type"]
            parameter_name = code_element["parameter_name"]
            short_description = code_element["description"]
            parameter_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=parameter_type,
                                                                                       value_name=parameter_name,
                                                                                       short_description=short_description,
                                                                                       entity_category=CodeEntityCategory.CATEGORY_PARAMETER)
            old_id_to_new_node_id_map[parameter_id] = parameter_node_id
        for r in relation_list:
            parameter_node_id = old_id_to_new_node_id_map[r["parameter_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_name"])
            if not method_qualified_name:
                print("not __valid method name %r" % method_qualified_name)
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)
            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_PARAMETER),
                                         parameter_node_id)
        print("end import parameter entity json")
        self.graph_data.print_graph_info()

    def import_method_local_variable_entity(self, entity_json_path):
        """Import local variables and link them to the method that uses them."""
        self.graph_data.print_graph_info()
        print("start import method local variable entity")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete entity size=%d" % record_num)
        for index, variable_infos in enumerate(code_list):
            method_qualified_name = variable_infos["method_name"]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(method_qualified_name)
            if not method_qualified_name:
                print("not __valid method name %r" % method_qualified_name)
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)
            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
            for variable in variable_infos["variable_model_list"]:
                variable_type = variable["type"]
                variable_name = variable["name"]
                variable_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=variable_type,
                                                                                          value_name=variable_name,
                                                                                          short_description="",
                                                                                          entity_category=CodeEntityCategory.CATEGORY_LOCAL_VARIABLE)
                if variable_node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("add variable node fail for %r" % variable)
                    continue
                self.graph_data.add_relation(method_node_id,
                                             CodeEntityRelationCategory.to_str(
                                                 CodeEntityRelationCategory.RELATION_CATEGORY_USE_LOCAL_VARIABLE),
                                             variable_node_id)
        print("end import local variable entity json")
        self.graph_data.print_graph_info()

    def import_return_value_entity(self, entity_json_path, entity_relation_json_path):
        """Import return-value entities and link them to their methods."""
        print("start import return value entity")
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete size=%d" % record_num)
        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
        record_num = len(relation_list)
        print("load json complete entity relation size=%d" % record_num)
        old_id_to_new_node_id_map = {}
        for index, code_element in enumerate(code_list):
            return_value_id = code_element["id"]
            return_value_type = code_element["return_value_type"]
            # Return values have no source-level name; use a placeholder.
            return_value_name = "<R>"
            short_description = code_element["description"]
            return_value_node_id = self.code_element_kg_builder.add_base_value_entity_node(value_type=return_value_type,
                                                                                          value_name=return_value_name,
                                                                                          short_description=short_description,
                                                                                          entity_category=CodeEntityCategory.CATEGORY_RETURN_VALUE)
            old_id_to_new_node_id_map[return_value_id] = return_value_node_id
        for r in relation_list:
            return_value_node_id = old_id_to_new_node_id_map[r["type_return_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_qualified_name"])
            if not method_qualified_name:
                print("not __valid method name %r" % method_qualified_name)
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)
            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_RETURN_VALUE),
                                         return_value_node_id)
        self.graph_data.print_graph_info()
        print("end import return value entity json")

    def import_thrown_exceptions(self, entity_json_path, entity_relation_json_path):
        """Import thrown-exception entities and link them to their methods."""
        print("start import thrown exceptions entity")
        print(self.graph_data)
        self.graph_data.print_label_count()
        with open(entity_json_path, "r", encoding='UTF-8') as f:
            code_list = json.load(f)
        record_num = len(code_list)
        print("load json complete size=%d" % record_num)
        with open(entity_relation_json_path, "r", encoding='UTF-8') as f:
            relation_list = json.load(f)
        record_num = len(relation_list)
        print("load json complete entity relation size=%d" % record_num)
        old_id_to_new_node_id_map = {}
        for index, code_element in enumerate(code_list):
            thrown_exception_id = code_element["id"]
            exception_type = code_element["exception_type"]
            # Exceptions have no source-level value name; use a placeholder.
            exception_name = "<E>"
            short_description = code_element["description"]
            exception_condition_node_id = self.code_element_kg_builder.add_base_value_entity_node(
                value_type=exception_type,
                value_name=exception_name,
                short_description=short_description,
                entity_category=CodeEntityCategory.CATEGORY_EXCEPTION_CONDITION)
            old_id_to_new_node_id_map[thrown_exception_id] = exception_condition_node_id
        for r in relation_list:
            exception_condition_node_id = old_id_to_new_node_id_map[r["code_exception_id"]]
            method_qualified_name = self.code_element_kg_builder.format_qualified_name(r["method_qualified_name"])
            if not method_qualified_name:
                print("not __valid method name %r" % method_qualified_name)
                continue
            node_json = self.graph_data.find_one_node_by_property("qualified_name", method_qualified_name)
            if not node_json:
                print("can't find %r, creating" % method_qualified_name)
                method_node_id = self.code_element_kg_builder.add_method_node(
                    method_qualified_name=method_qualified_name)
            else:
                method_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
            self.graph_data.add_relation(method_node_id,
                                         CodeEntityRelationCategory.to_str(
                                             CodeEntityRelationCategory.RELATION_CATEGORY_HAS_EXCEPTION_CONDITION),
                                         exception_condition_node_id)
        self.graph_data.print_graph_info()
        print("end import thrown exceptions entity json")

    def infer_extra_relation(self):
        """Derive additional relations (belong-to, overloading, override) from the imported graph."""
        # NOTE(review): build_belong_to_relation is invoked twice — the second
        # call looks redundant; confirm whether it is intentional.
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_abstract_overloading_relation()
        # self.code_element_kg_builder.build_value_subclass_relation()
        self.code_element_kg_builder.build_belong_to_relation()
        self.code_element_kg_builder.build_override_relation()

    def add_source_label(self, source_label):
        """Tag every code element with the given source label."""
        self.code_element_kg_builder.add_source_label(source_label)

    def build_aliases(self):
        """Generate alias names for all code elements."""
        self.code_element_kg_builder.build_aliases_for_code_element()

    def save(self, graph_data_path):
        """Persist the graph to disk."""
        self.graph_data.save(graph_data_path)

    def save_as_simple_graph(self, output_path):
        """Save a copy of the graph with the bulky "code" property stripped from every node."""
        graph_data = copy.deepcopy(self.graph_data)
        for node_id in graph_data.get_node_ids():
            node_json = graph_data.get_node_info_dict(node_id=node_id)
            properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
            if "code" in properties:
                properties.pop("code")
        graph_data.save(output_path)

    def build_method_code_use_constant_field_relation(self):
        """Link method code to the JDK constant fields it uses, via the exported doc collection."""
        collection = self.export_code_document_collection()
        self.code_element_kg_builder.build_use_jdk_constant_field_relation_from_code_doc(collection)

    def export_code_document_collection(self, code_doc_collection_path=None):
        """Export (and optionally persist) the code document collection."""
        collection = self.code_element_kg_builder.export_code_document_collection(code_doc_collection_path)
        return collection
    def __init__(self):
        """Initialize with an empty graph and the text/identifier extraction helpers."""
        # Target knowledge graph; empty until a graph is loaded/built.
        self.graph_data = GraphData()
        # Extracts entities from free text (e.g. comments) — see extract_from_comment usage elsewhere.
        self.text_extractor = EntityExtractor()
        # Presumably detects relations between extracted entities — TODO confirm.
        self.detector = RelationDetector()
        # Presumably pulls information out of code identifiers — TODO confirm.
        self.identifier_info_extractor = IdentifierInfoExtractor()
from definitions import OUTPUT_DIR
from pathlib import Path

# Project/version selectors for the jabref document and graph artifacts.
pro_name = 'jabref'
dc_file_location = PathUtil.doc(pro_name=pro_name, version='v1')
graph_data_file_location = PathUtil.graph_data(pro_name=pro_name, version='v1.8')
dc_file_destination = PathUtil.doc(pro_name=pro_name, version='v1.1')
# JSON-lines inputs: one JSON object per line.
comment_json_file = Path(OUTPUT_DIR) / "json" / "mid_2_dp_comment.json"
qualified_name_json_file = Path(
    OUTPUT_DIR) / "json" / "mid_2_qualified_name.json"

if __name__ == '__main__':
    doc_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(
        dc_file_location)
    graph_data: GraphData = GraphData.load(graph_data_file_location)
    comment_list = []
    comments = open(comment_json_file, 'r').readlines()
    for line in comments:
        comment_list.append(json.loads(line))
    qualified_name_list = []
    names = open(qualified_name_json_file, 'r').readlines()
    for line in names:
        qualified_name_list.append(json.loads(line))
    missing_count = 0
    # Use the qualified name to find the node's api_id in graph_data, then use
    # that api_id to locate the corresponding doc in doc_collection and insert
    # the field and related information.
    # NOTE(review): this script continues beyond this excerpt.
    for item in qualified_name_list:
        qualified_name = item['qname']
def __init__(self, pro_name, version, model_dir): graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version) self.graph_data: GraphData = GraphData.load(graph_data_path) self.model = self.create_search_model(pro_name, version, model_dir) print("It's ok for init!")
class DomainKGFusion:
    """Extract domain terms/operations from a code KG and fuse them back into it.

    Walks the code-entity graph (methods, classes, variables), pulls domain
    terms and operations out of comments, descriptions and identifier names,
    fuses synonymous terms, and adds the resulting concept nodes and
    relations to ``self.graph_data``.

    NOTE(review): the original class docstring described "skeleton KG"
    building; the code below clearly performs domain-term extraction and
    fusion, so the docstring was rewritten — confirm against project docs.
    """
    STOPLIST = set(stopwords.words('english'))
    # Label sets used to classify graph nodes by code-entity kind.
    METHOD_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_METHOD),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_BASE_OVERRIDE_METHOD),
    }
    CLASS_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_CLASS),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_PACKAGE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_INTERFACE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_ENUM_CONSTANTS),
    }
    VARIABLE_LABELS = {
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_FIELD),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_LOCAL_VARIABLE),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_PARAMETER),
        CodeEntityCategory.to_str(CodeEntityCategory.CATEGORY_FIELD_OF_CLASS),
    }

    def __init__(self):
        """Start from an empty graph plus the extractor/detector collaborators."""
        self.graph_data = GraphData()
        self.text_extractor = EntityExtractor()
        self.detector = RelationDetector()
        self.identifier_info_extractor = IdentifierInfoExtractor()

    def init_graph_data(self, graph_data_path):
        """Replace the current graph with one loaded from ``graph_data_path``."""
        self.graph_data = GraphData.load(graph_data_path)

    def add_code_relation(self, start_node_id, relation_name, code_element):
        """Link ``start_node_id`` to the node whose alias matches ``code_element``.

        The element string is stripped of generics, package prefix and
        modifiers to obtain a simple name; if no node carries that alias,
        nothing is added.
        """
        # e.g. "public List<String> a.b.C" -> "C"
        name = code_element.split("<")[0].split(".")[-1].split(" ")[-1]
        if len(name) == 0:
            return
        node_json = self.graph_data.find_one_node_by_property(PropertyConstant.ALIAS, name)
        if node_json is None:
            return
        end_node_id = node_json[GraphData.DEFAULT_KEY_NODE_ID]
        self.graph_data.add_relation(startId=start_node_id, relationType=relation_name, endId=end_node_id)

    def handle_comment_in_class(self, node_id, node_properties):
        """Extract domain terms from a class-level comment.

        :return: (set of terms, set of (node_id, relation, term) linkages)
        """
        terms = set()
        linkages = set()
        comment = node_properties.get(PropertyConstant.COMMENT, "")
        # NOTE(review): code_elements returned by the extractor are ignored
        # here, unlike handle_text_in_method — confirm this is intentional.
        domain_terms, code_elements = self.text_extractor.extract_from_comment(comment)
        for term in domain_terms:
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_COMENT.value, term))
        return terms, linkages

    def handle_text_in_method(self, node_id, node_properties):
        """Extract terms from a method's comment, inside comments and string literals.

        Code elements found in each text source are linked immediately via
        :meth:`add_code_relation`; domain terms are returned for fusion.

        :return: (set of terms, set of (node_id, relation, term) linkages)
        """
        terms = set()
        linkages = set()
        comment = node_properties.get(PropertyConstant.COMMENT, "")
        domain_terms, code_elements = self.text_extractor.extract_from_comment(comment)
        for term in domain_terms:
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_COMENT.value, term))
        for element in code_elements:
            self.add_code_relation(node_id, RelationType.MENTION_IN_COMENT.value, element)
        for inside_comment in node_properties.get(PropertyConstant.INSIDE_COMMENT, []):
            domain_terms, code_elements = self.text_extractor.extract_from_sentence(inside_comment)
            for term in domain_terms:
                terms.add(term)
                linkages.add((node_id, RelationType.MENTION_IN_INSIDE_COMENT.value, term))
            for element in code_elements:
                self.add_code_relation(node_id, RelationType.MENTION_IN_INSIDE_COMENT.value, element)
        for literal_expr in node_properties.get(PropertyConstant.STRING_LITERAL_EXPR, []):
            domain_terms, code_elements = self.text_extractor.extract_from_comment(literal_expr)
            for term in domain_terms:
                terms.add(term)
                linkages.add((node_id, RelationType.MENTION_IN_STRING_LITERAL.value, term))
            for element in code_elements:
                self.add_code_relation(node_id, RelationType.MENTION_IN_STRING_LITERAL.value, element)
        return terms, linkages

    def handle_description(self, node_id, description):
        """Extract domain terms from a short description sentence.

        :return: (set of terms, set of (node_id, relation, term) linkages)
        """
        terms = set()
        linkages = set()
        domain_terms, code_elements = self.text_extractor.extract_from_sentence(description)
        for term in domain_terms:
            # Fix: the original never populated ``terms`` and so always
            # returned an empty set, unlike the sibling handlers — the
            # terms mentioned in descriptions were silently lost.
            terms.add(term)
            linkages.add((node_id, RelationType.MENTION_IN_SHORT_DESCRIPTION.value, term))
        for element in code_elements:
            self.add_code_relation(node_id, RelationType.MENTION_IN_SHORT_DESCRIPTION.value, element)
        return terms, linkages

    def handle_method_name(self, node_id, name):
        """Extract terms/operations from a method name; attach operations to the owning class.

        :return: (terms, operations, relations, linkages) from the identifier extractor,
                 with extra (class_id, HAS_OPERATION, op) linkages added.
        """
        terms, operations, relations, linkages = self.identifier_info_extractor.extract_from_method_name(
            name, mark_for_identifier_in_relation=node_id)
        belong_to_relations = self.graph_data.get_relations(
            node_id,
            CodeEntityRelationCategory.to_str(CodeEntityRelationCategory.RELATION_CATEGORY_BELONG_TO))
        if len(belong_to_relations) > 0:
            # A relation triple is (start, type, end); index 2 is the class id.
            class_id = belong_to_relations.pop()[2]
            for op in operations:
                linkages.add((class_id, RelationType.HAS_OPERATION.value, op))
        return terms, operations, relations, linkages

    def handle_class_name(self, node_id, name):
        """Extract terms/relations from a class qualified name."""
        terms, relations, linkages = self.identifier_info_extractor.extract_from_class_name(
            name, mark_for_identifier_in_relation=node_id)
        return terms, relations, linkages

    def handle_variable_name(self, node_id, name):
        """Extract terms/relations from a variable/field/parameter name."""
        terms, relations, linkages = self.identifier_info_extractor.extract_from_variable(
            name, mark_for_identifier_in_relation=node_id)
        return terms, relations, linkages

    def extract_term_and_relation(self, term_save_path=None, operation_save_path=None, term_relation_save_path=None,
                                  linkage_save_path=None, term_aliases_save_path=None, not_fused_term_save_path=None):
        """Walk every graph node, extract terms/operations/relations, and fuse synonyms.

        Optionally persists the intermediate artifacts to the given paths.

        :return: (fused_terms, operations, relations, linkages, fused_term_to_aliases_map)
        """
        print("start extract term and relation from graph data")
        self.graph_data.print_graph_info()
        # cache the map for adding relation
        not_fused_terms = set()
        operations = set()
        relations = set()
        linkages = set()
        i = 0
        for node_id in list(self.graph_data.get_node_ids()):
            try:
                i = i + 1
                if (i % 100) == 0:
                    # Progress log: "%d node lookups executed so far"
                    print("已经执行了%d次节点检索" % i)
                node_json = self.graph_data.get_node_info_dict(node_id=node_id)
                if not node_json:
                    continue
                node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
                node_labels = node_json[GraphData.DEFAULT_KEY_NODE_LABELS]
                if 'sentence' in node_labels:
                    terms_, linkages_ = self.handle_description(node_id, node_properties["sentence_name"])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    continue
                if len(node_labels & self.METHOD_LABELS) > 0:
                    terms_, linkages_ = self.handle_text_in_method(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    terms_, operations_, relations_, linkages_ = self.handle_method_name(
                        node_id, node_properties[GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    operations.update(operations_)
                    relations.update(relations_)
                    linkages.update(linkages_)
                    # NOTE(review): in the flattened original the description
                    # handling follows the method-name handling; it is nested
                    # under the method branch here — confirm against history.
                    description = node_properties.get(PropertyConstant.DESCRIPTION, "")
                    if description is not None and len(description) > 0:
                        terms_, linkages_ = self.handle_description(
                            node_id, node_properties[PropertyConstant.DESCRIPTION])
                        not_fused_terms.update(terms_)
                        linkages.update(linkages_)
                if len(node_labels & self.CLASS_LABELS) > 0:
                    terms_, linkages_ = self.handle_comment_in_class(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    terms_, relations_, linkages_ = self.handle_class_name(
                        node_id, node_properties[GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    relations.update(relations_)
                if len(node_labels & self.VARIABLE_LABELS) > 0:
                    terms_, linkages_ = self.handle_comment_in_class(node_id, node_properties)
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    terms_, relations_, linkages_ = self.handle_variable_name(
                        node_id, node_properties[GraphData.DEFAULT_KEY_PROPERTY_QUALIFIED_NAME])
                    not_fused_terms.update(terms_)
                    linkages.update(linkages_)
                    relations.update(relations_)
            except Exception:
                # Was a bare ``except:`` — narrowed so Ctrl-C still works;
                # extraction stays best-effort per node.
                traceback.print_exc()
        # Drop terms that fail the validity heuristics before fusion.
        not_fused_terms = {term for term in not_fused_terms if self.valid_term(term)}
        relations_ = self.detector.detect_relation_by_starfix(not_fused_terms)
        relations.update(relations_)
        print("complete domain extraction")
        term_fusion = Fusion()
        synsets = term_fusion.fuse_by_synonym(not_fused_terms)
        print("complete synonym fusion")
        fused_term_to_aliases_map = {}
        for synset in synsets:
            fused_term_to_aliases_map[synset.key] = list(synset.terms)
        fused_terms = fused_term_to_aliases_map.keys()
        # Rewrite relation/linkage endpoints from raw terms to fused terms.
        new_relations = set()
        new_linkages = set()
        for start_e, relation_name, end_e in relations:
            if relation_name == "has operation":
                continue
            if relation_name == "can be operated":
                continue
            new_start_e_list = set()
            new_end_e_list = set()
            for fused_term, aliases in fused_term_to_aliases_map.items():
                if end_e in aliases:
                    new_end_e_list.add(fused_term)
                if start_e in aliases:
                    new_start_e_list.add(fused_term)
            # Fix: these fallbacks were added AFTER the product loop below,
            # so any relation with an unfused endpoint was silently dropped.
            # The linkage loop below always did this first; now consistent.
            if len(new_start_e_list) == 0:
                new_start_e_list.add(start_e)
            if len(new_end_e_list) == 0:
                new_end_e_list.add(end_e)
            for new_start_e in new_start_e_list:
                for new_end_e in new_end_e_list:
                    new_relations.add((new_start_e, relation_name, new_end_e))
        relations = new_relations
        for start_e, relation_name, end_e in linkages:
            if relation_name == "has operation":
                continue
            if relation_name == "can be operated":
                continue
            new_start_e_list = set()
            new_end_e_list = set()
            for fused_term, aliases in fused_term_to_aliases_map.items():
                if end_e in aliases:
                    new_end_e_list.add(fused_term)
                if start_e in aliases:
                    new_start_e_list.add(fused_term)
            if len(new_start_e_list) == 0:
                new_start_e_list.add(start_e)
            if len(new_end_e_list) == 0:
                new_end_e_list.add(end_e)
            for new_start_e in new_start_e_list:
                for new_end_e in new_end_e_list:
                    new_linkages.add((new_start_e, relation_name, new_end_e))
        linkages = new_linkages
        print("length of new_linkages %d" % (len(linkages)))
        import json
        if term_save_path is not None:
            with Path(term_save_path).open("w") as f:
                f.write("\n".join(sorted(fused_terms)))
        if not_fused_term_save_path is not None:
            with Path(not_fused_term_save_path).open("w") as f:
                f.write("\n".join(sorted(not_fused_terms)))
        if operation_save_path is not None:
            with Path(operation_save_path).open("w") as f:
                f.write("\n".join(sorted(operations)))
        if term_relation_save_path is not None:
            with Path(term_relation_save_path).open("w") as f:
                json.dump(
                    [(r[0], str(r[1]), r[2]) for r in relations if self.valid_term(r[0]) and self.valid_term(r[2])],
                    f, indent=4)
        if linkage_save_path is not None:
            with Path(linkage_save_path).open("w") as f:
                # Linkage starts are often node ids (never valid terms), so
                # a single valid endpoint is enough to keep the linkage.
                json.dump([(r[0], str(r[1]), r[2]) for r in linkages
                           if self.valid_term(r[0]) or self.valid_term(r[2])], f, indent=4)
        if term_aliases_save_path is not None:
            with Path(term_aliases_save_path).open("w") as f:
                json.dump(fused_term_to_aliases_map, f, indent=4)
        return fused_terms, operations, relations, linkages, fused_term_to_aliases_map

    def select_name(self, terms):
        """Pick the shortest alias as the display name."""
        return min(terms, key=lambda x: len(x))

    def valid_term(self, term):
        """Heuristically decide whether ``term`` is a plausible domain term.

        Rejects very short strings, pure numbers, overly long multi-word
        phrases, and phrases starting with an English stopword.
        """
        term = str(term)
        if len(term) <= 2 or term.isdigit() or (len(term) > 30 and len(term.split()) > 4):
            return False
        parts = term.split()
        if not parts:
            # Robustness: whitespace-only input would previously raise on unpacking.
            return False
        if parts[0] in self.STOPLIST:
            return False
        return True

    def add_domain_term(self, term, lemma, aliases):
        """Add a domain-term node to the graph.

        :param term: primary term name
        :param lemma: normalized lemma, also stored as an alias
        :param aliases: optional extra aliases (iterable) or None
        :return: the node_id of the added term node
        """
        if aliases is None:
            aliases = set([])
        else:
            aliases = set(list(aliases))
        aliases.add(lemma)
        aliases.add(term)
        node_labels = [DomainConstant.LABEL_DOMAIN_TERM]
        node_properties = {
            DomainConstant.PRIMARY_PROPERTY_NAME: term,
            PropertyConstant.ALIAS: aliases,
            PropertyConstant.LEMMA: lemma
        }
        domain_term_node_id = self.graph_data.add_node(
            node_labels=node_labels, node_properties=node_properties,
            primary_property_name=DomainConstant.PRIMARY_PROPERTY_NAME)
        return domain_term_node_id

    def add_operation(self, op, lemma):
        """Add an operation node to the graph and return its node_id."""
        node_labels = [OperationConstance.LABEL_OPERATION]
        node_properties = {
            OperationConstance.PRIMARY_PROPERTY_NAME: op,
            PropertyConstant.ALIAS: {op},
            PropertyConstant.LEMMA: lemma
        }
        operation_node_id = self.graph_data.add_node(
            node_labels=node_labels, node_properties=node_properties,
            primary_property_name=OperationConstance.PRIMARY_PROPERTY_NAME)
        return operation_node_id

    def update_domain_node_alias(self, node_id, term):
        """Register ``term`` as an alias of an existing domain-term node and refresh its name."""
        node_json = self.graph_data.get_node_info_dict(node_id=node_id)
        if not node_json:
            return
        node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
        alias = node_properties[PropertyConstant.ALIAS]
        alias.add(term)
        name = self.select_name(alias)
        # Fix: the original wrote the new name onto the node wrapper dict
        # (node_json) instead of its properties dict, where add_domain_term
        # stores the primary property — the rename never took effect.
        node_properties[DomainConstant.PRIMARY_PROPERTY_NAME] = name
        self.graph_data.update_node_index(node_id=node_id)

    def update_operation_node_alias(self, node_id, term):
        """Register ``term`` as an alias of an existing operation node and refresh its name."""
        node_json = self.graph_data.get_node_info_dict(node_id=node_id)
        if not node_json:
            return
        node_properties = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES]
        alias = node_properties[PropertyConstant.ALIAS]
        alias.add(term)
        name = self.select_name(alias)
        # Fix: same wrapper-vs-properties bug as update_domain_node_alias.
        node_properties[OperationConstance.PRIMARY_PROPERTY_NAME] = name
        self.graph_data.update_node_index(node_id=node_id)

    def add_relation_for_same_name_operation_and_domain_term(self):
        """Cross-link each operation node with the domain term of the same name."""
        operation_node_ids = self.graph_data.get_node_ids_by_label(OperationConstance.LABEL_OPERATION)
        for operation_id in operation_node_ids:
            operation_node = self.graph_data.get_node_info_dict(operation_id)
            operation_name = operation_node[GraphData.DEFAULT_KEY_NODE_PROPERTIES][
                OperationConstance.PRIMARY_PROPERTY_NAME]
            domain_term_node = self.graph_data.find_one_node_by_property(
                property_name=DomainConstant.PRIMARY_PROPERTY_NAME, property_value=operation_name)
            if domain_term_node is None:
                continue
            domain_term_node_id = domain_term_node[GraphData.DEFAULT_KEY_NODE_ID]
            self.graph_data.add_relation(operation_id, "corresponding concept", domain_term_node_id)
            self.graph_data.add_relation(domain_term_node_id, "corresponding operation", operation_id)

    def save(self, graph_data_path):
        """Persist the graph to ``graph_data_path``."""
        self.graph_data.save(graph_data_path)

    def fuse(self, terms, operations, relations, linkages, aliases_map):
        """Import the extracted terms/operations and their relations into the graph.

        :param terms: fused domain terms to add as nodes
        :param operations: operations to add as nodes
        :param relations: (start, type, end) triples between terms/operations
        :param linkages: (node_id_or_term, type, term) triples linking code nodes to terms
        :param aliases_map: term -> aliases, applied when creating term nodes
        """
        self.graph_data.create_index_on_property(DomainConstant.PRIMARY_PROPERTY_NAME)
        self.graph_data.create_index_on_property(OperationConstance.PRIMARY_PROPERTY_NAME)
        self.graph_data.create_index_on_property(PropertyConstant.ALIAS)
        # todo:update the index when add
        print("start fuse with domain knowledge")
        self.graph_data.print_graph_info()
        term_lemma2id = {}
        term_name2id = {}
        op_lemma2id = {}
        op_name2id = {}

        def __add_or_update(name, is_op=False):
            # Create (or, for operations, merge-by-lemma into) a node and
            # record its id in the name/lemma lookup tables.
            if is_op:
                lemma = name.lower()
                if lemma in op_lemma2id:
                    node_id = op_lemma2id[lemma]
                    self.update_operation_node_alias(node_id, name)
                else:
                    node_id = self.add_operation(name, lemma)
                    if node_id == GraphData.UNASSIGNED_NODE_ID:
                        print("adding operation %r fail" % name)
                        return node_id
                op_lemma2id[lemma] = node_id
                op_name2id[name] = node_id
            else:
                # NOTE(review): ``.replace(" ", " ")`` is a no-op as written
                # in the original; possibly intended to collapse doubled
                # spaces — kept verbatim to preserve behavior.
                lemma = name.replace("-", " ").replace(" ", " ").lower()
                # Split trailing digits off words ("java8" -> "java 8"),
                # except after 'v' (version markers like "v2").
                lemma = re.sub('([^v])([0-9]+)', r'\1 \2', lemma)
                # NOTE(review): unlike the operation branch, no lemma-based
                # dedup happens here — presumably add_node dedups by the
                # primary property; confirm.
                node_id = self.add_domain_term(name, lemma, aliases=aliases_map.get(name, None))
                if node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("adding domain term %r fail" % name)
                    return node_id
                term_lemma2id[lemma] = node_id
                term_name2id[name] = node_id
            return node_id

        # Shorter (fewer-word) terms first, so compound terms meet existing bases.
        for term in sorted(terms, key=lambda x: len(x.split())):
            __add_or_update(term)
        for op in operations:
            __add_or_update(op, is_op=True)

        def __add_relation(start_term, relation_name, end_term):
            # Resolve each endpoint to a node id (terms by default; the end
            # side is an operation for the relation types below). Integer
            # endpoints are already node ids.
            start_name2id = term_name2id
            end_name2id = term_name2id
            start_term_is_op = False
            end_term_is_op = False
            if relation_name in ("has operation", "instance of", "can be operated"):
                end_name2id = op_name2id
                end_term_is_op = True
            if isinstance(start_term, int):
                start_node_id = start_term
            else:
                if start_term in start_name2id:
                    start_node_id = start_name2id[start_term]
                else:
                    start_node_id = __add_or_update(start_term, is_op=start_term_is_op)
                if start_node_id == GraphData.UNASSIGNED_NODE_ID:
                    print("adding start_domain term %r fail for relation %r" % (
                        start_term, (start_term, relation_name, end_term)))
                    return
            if isinstance(end_term, int):
                end_node_id = end_term
            else:
                if end_term in end_name2id:
                    end_node_id = end_name2id[end_term]
                else:
                    end_node_id = __add_or_update(end_term, is_op=end_term_is_op)
                if end_node_id == GraphData.UNASSIGNED_NODE_ID:
                    # Fix: this copy-pasted message reported the START term
                    # for an END-term failure.
                    print("adding end_domain term %r fail for relation %r" % (
                        end_term, (start_term, relation_name, end_term)))
                    return
            self.graph_data.add_relation(startId=start_node_id, relationType=relation_name, endId=end_node_id)

        for (start_term, relation_name, end_term) in relations:
            __add_relation(start_term, relation_name, end_term)
        for (start_term, relation_name, end_term) in linkages:
            __add_relation(start_term, relation_name, end_term)
        # Lift code-level EXTENDS relations to IS-A between the domain terms
        # each class REPRESENTs.
        isA_relations = set()
        for (start_id, _, end_id) in self.graph_data.get_relations(
                relation_type=CodeEntityRelationCategory.to_str(
                    CodeEntityRelationCategory.RELATION_CATEGORY_EXTENDS)):
            start_domain_ids = {e for _, _, e in self.graph_data.get_relations(
                start_id=start_id, relation_type=RelationType.REPRESENT.value)}
            # Fix: the original queried start_id here too (copy-paste), so
            # every IS-A relation was term -> itself instead of term -> the
            # extended class's term.
            end_domain_ids = {e for _, _, e in self.graph_data.get_relations(
                start_id=end_id, relation_type=RelationType.REPRESENT.value)}
            for s in start_domain_ids:
                for e in end_domain_ids:
                    isA_relations.add((s, RelationType.IS_A.value, e))
        for r in isA_relations:
            __add_relation(*r)
        self.add_relation_for_same_name_operation_and_domain_term()
        print("end fuse with domain knowledge")
        self.graph_data.refresh_indexer()
        self.graph_data.print_graph_info()

    def build_aliases_for_domain_term_and_operations(self, new_all_aliases_save_path=None):
        """Generate and attach aliases for every domain term and operation node.

        Term aliases come from ConceptElementNameUtil against the full term
        vocabulary; operation aliases come from WordNet verb synsets.

        :param new_all_aliases_save_path: optional JSON path for the term->aliases map
        """
        name_util = ConceptElementNameUtil()
        domain_term_ids = self.graph_data.get_node_ids_by_label(DomainConstant.LABEL_DOMAIN_TERM)
        term_name_list = []
        fused_term_to_aliases_map = {}
        for domain_term_id in domain_term_ids:
            node_json = self.graph_data.get_node_info_dict(domain_term_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][DomainConstant.PRIMARY_PROPERTY_NAME]
            term_name_list.append(term_name)
        for domain_term_id in domain_term_ids:
            node_json = self.graph_data.get_node_info_dict(domain_term_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][DomainConstant.PRIMARY_PROPERTY_NAME]
            all_aliases_list = set([])
            # (The original computed generate_aliases twice per term with
            # identical arguments; the duplicate call was removed.)
            generated_aliases = name_util.generate_aliases(term_name, vocabulary=term_name_list)
            all_aliases_list = all_aliases_list | set(generated_aliases)
            exist_aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, set([]))
            for alias in exist_aliases:
                all_aliases_list.add(alias)
            node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.ALIAS] = all_aliases_list
            fused_term_to_aliases_map[term_name] = list(all_aliases_list)
        operation_ids = self.graph_data.get_node_ids_by_label(OperationConstance.LABEL_OPERATION)
        # todo: build the relation between operation and domain term
        for operation_id in operation_ids:
            node_json = self.graph_data.get_node_info_dict(operation_id)
            term_name = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][OperationConstance.PRIMARY_PROPERTY_NAME]
            # Verb synsets only; synset names look like "run.v.01" -> "run".
            synsets = wn.synsets(term_name, pos="v")
            generated_aliases = [synset.name().split(".")[0] for synset in synsets]
            exist_aliases = node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES].get(PropertyConstant.ALIAS, set([]))
            for alias in generated_aliases:
                exist_aliases.add(alias)
            node_json[GraphData.DEFAULT_KEY_NODE_PROPERTIES][PropertyConstant.ALIAS] = exist_aliases
        if new_all_aliases_save_path is not None:
            with Path(new_all_aliases_save_path).open("w") as f:
                json.dump(fused_term_to_aliases_map, f, indent=4)
        self.graph_data.refresh_indexer()

    def delete_islocated_nodes_by_label(self, label):
        """Remove every node with ``label`` that has no in- or out-relations.

        :return: the (mutated) graph data
        """
        domain_node_ids = self.graph_data.get_node_ids_by_label(label)
        remove_ids = []
        for domain_id in domain_node_ids:
            out_ids = self.graph_data.get_all_out_relations(domain_id)
            in_ids = self.graph_data.get_all_in_relations(domain_id)
            if not out_ids and not in_ids:
                remove_ids.append(domain_id)
        print("delete %d islocated domain term" % (len(remove_ids)))
        for remove_id in remove_ids:  # renamed from ``id`` (shadowed builtin)
            self.graph_data.remove_node(remove_id)
        return self.graph_data

    def delete_nodes_and_relations(self, name_list):
        """Delete the named domain-term nodes together with all their relations.

        :param name_list: primary names of nodes to remove
        :return: the (mutated) graph data
        """
        for name in name_list:
            node_info = self.graph_data.find_one_node_by_property(DomainConstant.PRIMARY_PROPERTY_NAME, name)
            if not node_info:
                print("can't find node for %s" % (name))
                continue
            # NOTE(review): literal "id" key — elsewhere the code uses
            # GraphData.DEFAULT_KEY_NODE_ID; kept verbatim, confirm they match.
            node_id = node_info["id"]
            out_relations = self.graph_data.get_all_out_relations(node_id)
            in_relations = self.graph_data.get_all_in_relations(node_id)
            for s, r, e in out_relations.union(in_relations):
                self.graph_data.remove_relation(s, r, e)
            self.graph_data.remove_node(node_id)
        return self.graph_data