Example No. 1
    def init(self, path="word2vec_api_software_wiki.txt", binary=True):
        self.session = EngineFactory.create_session()
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))

        self.entity_vector_model = EntityVectorComputeModel()
        self.entity_vector_model.init_word2vec_model(path=path, binary=binary)
        print("init complete")
Example No. 2
    def get_all_nodes(self, step=5000, labels=None):
        total_node_list = []
        self.graph_accessor = DefaultGraphAccessor(self._graph)
        if labels is None:
            labels = []

        if labels:
            max_id = self.graph_accessor.get_max_id_for_labels(*labels)
            min_id = self.graph_accessor.get_min_id_for_labels(*labels)
        else:
            max_id = self.graph_accessor.get_max_id_for_node()
            min_id = 0

        iteration = range(min_id, max_id, step)
        for start_id in iteration:
            try:
                end_id = min(start_id + step, max_id)
                nodes_in_scope = self.get_nodes_in_scope(start_id, end_id, labels)
                _logger.info("start id=%s,end_id=%s", str(start_id), str(end_id))
                if nodes_in_scope is not None:
                    _logger.info("get nodes in scope successfully")
                    total_node_list.extend(nodes_in_scope)
                else:
                    _logger.info("get nodes in scope failed")
            except Exception as error:
                _logger.exception("failed to get nodes in scope: %s", error)
        return total_node_list
    def test_expand_nodes_with_filter_nodes(self):
        graphClient = DefaultGraphAccessor(GraphClient())

        # test_case = [(55730, True, 50, 50), (15555, True, 4, 4), (93008, True, 10, 11), (1708, True, 8, 7)]
        test_case = [
            (55730, True, 50, 50),
        ]
        graphJsonParser = GraphJsonParser()
        for node_id, is_valid, node_num, relation_num in test_case:
            print("test case=", node_id, is_valid, node_num, relation_num)
            subgraph = graphClient.expand_node_for_adjacent_nodes_to_subgraph(
                node_id)
            subgraph_json = graphJsonParser.parse_subgraph_to_public_json(
                subgraph)
            print(subgraph_json)
            if is_valid:
                self.assertNotEqual(subgraph_json, {
                    "nodes": [],
                    "relations": []
                })
            else:
                self.assertEqual(subgraph_json, {"nodes": [], "relations": []})
                continue
            self.assertEqual(node_num, len(subgraph_json["nodes"]))
            self.assertEqual(relation_num, len(subgraph_json["relations"]))

            for n in subgraph_json["nodes"]:
                print(n)
            for r in subgraph_json["relations"]:
                print(r)
 def __init__(self):
     self.question_preprossor = QuestionPreprossor()
     self.question_analyzer = QuestionAnalyzer()
     self.candidate_answer_generator = CandidateAnswerSetGenerator()
     # self.answer_generator = AnswerGenerator()
     self.answer_generator = None
     self.client = DefaultGraphAccessor(GraphClient())
class TestNodeCleaner(TestCase):
    def test_clean_labels(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=1))

        node = self.graphClient.find_node_by_id(16)
        self.assertEqual(NodeCleaner.clean_labels(node), [u'software', u'background knowledge', u'WikiData'])
        node = self.graphClient.find_node_by_id(177777)
        print node
        self.assertEqual(NodeCleaner.clean_labels(node), [u'background knowledge', u'WikiData'])

    def test_construct_property_set(self):
        node_list = [{"aaa": 1, "ccc": 6}, {"aaa": 2, "bbb": 3, "ccc": 6}, {"ccc": 4, "ddd": 5}]
        result = {"aaa", "ccc"}
        self.assertEqual(construct_property_set(node_list), result)
        print construct_property_set(node_list)

    def test_rename_property(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=0))
        node_list = self.graphClient.find_by_name_property("awesome item", "acl9")
        result = rename_property(node_list)
        print result

    def test_public_labels_name(self):
        t = PUBLIC_LABELS
        self.assertIsNotNone(t)
        print(t)
    def test_clean_labels(self):
        self.graphClient = DefaultGraphAccessor(GraphClient(server_number=1))

        node = self.graphClient.find_node_by_id(16)
        self.assertEqual(NodeCleaner.clean_labels(node), [u'software', u'background knowledge', u'WikiData'])
        node = self.graphClient.find_node_by_id(177777)
        print node
        self.assertEqual(NodeCleaner.clean_labels(node), [u'background knowledge', u'WikiData'])
    def test_sort_nodes_by_quality(self):
        graphClient = DefaultGraphAccessor(GraphClient(server_number=1))
        graphJson = GraphJsonParser()
        keyword = "java"
        top_number = 10
        subgraph = graphClient.search_nodes_by_name_in_subgraph(
            keyword, top_number)
        print subgraph
        nodes = graphJson.parse_nodes_in_subgraph_to_public_json(subgraph)

        print nodes
    def init(self, vector_dir_path="./model/"):
        self.kg_models = KnowledgeGraphFeafureModels()
        self.kg_models.init(vector_dir_path=vector_dir_path)

        self._session = EngineFactory.create_session(echo=False)
        self._entity_extractor = EntityExtractor()

        # self._tf_idf_model = TFIDFModel()
        # self._tf_idf_model.load(dict_type=2)

        self.qa_searcher = QAEntitySearcher()
        client = GraphClient(server_number=4)
        self.semanticSearchAccessor = SemanticSearchAccessor(client)
        self.defaultAccessor = DefaultGraphAccessor(client)
        self._logger = Logger("QAResultSearch").get_log()
    def build_aliases_for_domain_entity(self):

        EntityForQA.delete_names_by_source(session=self.session, source="domain entity")

        client = GraphClient(server_number=4)
        accessor = DomainEntityAccessor(client)
        default_accessor = DefaultGraphAccessor(client)
        domain_entity_list = accessor.get_all_domain_entity()
        for domain_entity in domain_entity_list:
            entity = EntityForQA(kg_id=default_accessor.get_id_for_node(node=domain_entity),
                                 entity_id=domain_entity['domain_entity_id'], source="domain entity",
                                 attr='domain_entity_id', attr_value=domain_entity['domain_entity:name'])

            self.session.add(entity)
        self.session.commit()
Example No. 10
class NodeCollection:

    def __init__(self, graph):
        '''
        init with a Graph object
        :param graph: a GraphAccessor, GraphClient, or Graph instance
        '''
        if isinstance(graph, GraphAccessor):
            self._graph = graph.graph
        elif isinstance(graph, GraphClient):
            self._graph = graph.graph
        elif isinstance(graph, Graph):
            self._graph = graph
        else:
            self._graph = None
        self.graph_accessor = None

    def get_all_nodes(self, step=5000, labels=None):
        total_node_list = []
        self.graph_accessor = DefaultGraphAccessor(self._graph)
        if labels is None:
            labels = []

        if labels:
            max_id = self.graph_accessor.get_max_id_for_labels(*labels)
            min_id = self.graph_accessor.get_min_id_for_labels(*labels)
        else:
            max_id = self.graph_accessor.get_max_id_for_node()
            min_id = 0

        iteration = range(min_id, max_id, step)
        for start_id in iteration:
            try:
                end_id = min(start_id + step, max_id)
                nodes_in_scope = self.get_nodes_in_scope(start_id, end_id, labels)
                _logger.info("start id=%s,end_id=%s", str(start_id), str(end_id))
                if nodes_in_scope is not None:
                    _logger.info("get nodes in scope successfully")
                    total_node_list.extend(nodes_in_scope)
                else:
                    _logger.info("get nodes in scope failed")
            except Exception as error:
                _logger.exception("failed to get nodes in scope: %s", error)
        return total_node_list
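
A brief usage sketch for the class above, assuming a reachable Neo4j instance behind GraphClient; the server_number value and the "api" label are illustrative:

collection = NodeCollection(GraphClient(server_number=1))
# page through node ids 2000 at a time and collect every node carrying the "api" label
api_nodes = collection.get_all_nodes(step=2000, labels=["api"])
print("fetched %d nodes" % len(api_nodes))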
Example No. 11
 def walk(self):
     '''
     Run self.generationNumber random walks of self.stepNumber steps each, starting from
     node id 1, and print for every visited node a score (averaged over the walks)
     weighted by how early the node was first reached.
     '''
     file = open('log1.txt', 'w')
     client = DefaultGraphAccessor()
     iterator = 0
     possibility_list = [0] * RandomWalk.max_id
     adjacent_node_list = [0] * RandomWalk.max_id
     file.writelines("begin:  " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + '\n')
     file.flush()
     while iterator < self.generationNumber:
         file.writelines("iteration:   " + str(iterator) + "   " + time.strftime('%Y-%m-%d %H:%M:%S',
                                                                                 time.localtime(time.time())) + '\n')
         file.flush()
         # print str(iterator) + "   " + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
         current_node_index = 1
         step_length_list = [0] * RandomWalk.max_id
         for i in range(0, self.stepNumber):
             if (current_node_index > 0 and current_node_index < 7520) or current_node_index > 7524:
                 if adjacent_node_list[current_node_index - 1] == 0:
                     adjacent_node = client.get_adjacent_node_id_list(current_node_index)
                     adjacent_node_list[current_node_index - 1] = adjacent_node
                 else:
                     adjacent_node = adjacent_node_list[current_node_index - 1]
                 next_node_index = adjacent_node[random.randint(0, len(adjacent_node) - 1)]
                 # print str(iterator) + "   " + str(i) + "   " + str(len(adjacent_node)) + "    " + str(
                 # next_node_index)
                 if (next_node_index > 1 and next_node_index < 7520) or next_node_index > 7524:
                     current_node_index = next_node_index
                 if step_length_list[next_node_index - 1] == 0 and next_node_index != 1:
                     step_length_list[next_node_index - 1] = i + 1
                     possibility_list[next_node_index - 1] += 1 - (
                     step_length_list[next_node_index - 1] / float(self.stepNumber))
         iterator += 1
     file.writelines("end:  " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + '\n')
     file.flush()
     file.close()
     for i in range(0, RandomWalk.max_id):
         possibility_list[i] /= self.generationNumber
         if possibility_list[i] > 0:
             print str(i) + "  " + str(possibility_list[i])
Example No. 12
    def init(self, vector_dir_path="./model/"):
        time_start = time.time()
        print("start init the model=%d" % time_start)
        client = GraphClient(server_number=4)

        self.defaultAccessor = DefaultGraphAccessor(client)

        self._api_wv = EntityVectorModel.load(vector_dir_path + self.WORD2VEC_FILE_LIST["api"], binary=True)
        self._domain_entity_wv = EntityVectorModel.load(vector_dir_path + self.WORD2VEC_FILE_LIST["domain entity"],
                                                        binary=True)
        self._wiki_wv = EntityVectorModel.load(vector_dir_path + self.WORD2VEC_FILE_LIST["wikidata"], binary=True)
        self._sentence_wv = EntityVectorModel.load(vector_dir_path + self.WORD2VEC_FILE_LIST["sentence"], binary=True)
        self._graph_wv = EntityVectorModel.load(vector_dir_path + self.WORD2VEC_FILE_LIST["graph"], binary=True)
        self._entity_vector_compute_model = EntityVectorComputeModel()
        self._entity_vector_compute_model.init_word2vec_model(vector_dir_path + self.WORD2VEC_FILE_LIST["word2vec"],
                                                              binary=True)
        self.NP_VECTOR_NOT_EXIST = np.zeros(128)
        self.NP_VECTOR_NOT_EXIST[1] = 1e-07

        time_end = time.time()
        print("init complete in %d" % (time_end - time_start))
Example No. 13
    def start_import_for_api_entity(self, linking_result_file):
        graph_client = GraphClient(server_number=4)
        default_graph_client = DefaultGraphAccessor(graph_client)
        api_entity_graph_client = DomainEntityAccessor(graph_client)
        api_entity_graph_client.delete_all_api_entity_to_wikipedia_relation()
        print("delete all old may link relation complete")

        with open(linking_result_file, 'r') as f:
            link_relation_list = json.load(f)

        for each in link_relation_list:
            api_entity_id = each['api_entity_id']
            wikipedia_entity_id = each['wikipedia_entity_id']
            if api_entity_id is None or wikipedia_entity_id is None:
                continue
            api_entity = api_entity_graph_client.find_api_entity_node_by_id(api_entity_id)
            if api_entity is None:
                continue
            wikipedia_entity = default_graph_client.find_node_by_id(wikipedia_entity_id)
            if wikipedia_entity is None:
                continue
            api_entity_graph_client.create_entity_to_general_concept_relation(api_entity, wikipedia_entity)
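
For reference, the linking result file consumed above is a JSON list whose entries carry the two id fields read in the loop; a minimal hypothetical example (the ids are made up):

# hypothetical contents of linking_result_file after json.load; only these two keys are used
link_relation_list = [
    {"api_entity_id": 123, "wikipedia_entity_id": 456},
    {"api_entity_id": 124, "wikipedia_entity_id": None},  # entries missing either id are skipped
]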
Example No. 14
import MySQLdb

from skgraph.graph.accessor.factory import NodeBuilder
from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from skgraph.util.code_text_process import clean_html_text

# MySQL connection
conn = MySQLdb.connect(host='10.141.221.73',
                       port=3306,
                       user='******',
                       passwd='root',
                       db='fdroid',
                       charset="utf8")
cur = conn.cursor()

# neo4j connect
connect_graph = DefaultGraphAccessor(GraphClient(server_number=1)).graph


# read from mysql
def mySQLReader(start, end):
    node1lib = connect_graph.find_one(label="schema",
                                      property_key="wd_item_id",
                                      property_value="Q21127166")

    # get all-version library
    cur.execute(
        "select * from jdk_library where library_id >= %s and library_id < %s",
        (start, end))
    lib_sql_data_list = cur.fetchall()

    for lib_node_mysql_data_index in range(0, len(lib_sql_data_list)):
Example No. 15
from py2neo import Node

from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from skgraph.graph.operation.generateDataPpi import DataCreateUtil
from skgraph.graph.operation.random_walk_restart import Walker

log_file = open('log.txt', 'w')
input_file_name = 'data.ppi'
max_id = 1665370
client_1 = DefaultGraphAccessor(GraphClient(server_number=1))
client_2 = DefaultGraphAccessor(GraphClient(server_number=2))
data_util = DataCreateUtil(client_1)
for id in range(23, max_id + 1):
    log_file.writelines('begin  %d\n' % (id))
    if data_util.createData(id) == 1:
        log_file.writelines('generate ppi end\n')
        walker = Walker(input_file_name, client_2)
        walker.run_exp(id, 0.15, log_file)
        log_file.flush()
    else:
        node_1 = Node(link_id=id)
        client_2.merge(node_1)
        log_file.writelines('add node end\n')
        log_file.flush()
log_file.close()
class SentenceLevelSemanticSearch:
    SORT_FUNCTION_ENTITIES_BRIDGE = 3
    SORT_FUNCTION_AVERAGE_ENTITY_GRAPH_SIMILAR = 4

    SORT_FUNCTION_AVERAGE_VECTOR = 2

    SORT_FUNCTION_NOT_AVERAGE_GRAPH_VECTOR = 1
    SORT_FUNCTION_SELECT_PART_ENTITY_LINK = 5

    def __init__(self):
        self._session = None
        self.kg_models = None
        self._entity_extractor = None

        # self._tf_idf_model = None

        self.qa_searcher = None
        self.semanticSearchAccessor = None
        self.defaultAccessor = None
        self._logger = None

    def init(self, vector_dir_path="./model/"):
        self.kg_models = KnowledgeGraphFeafureModels()
        self.kg_models.init(vector_dir_path=vector_dir_path)

        self._session = EngineFactory.create_session(echo=False)
        self._entity_extractor = EntityExtractor()

        # self._tf_idf_model = TFIDFModel()
        # self._tf_idf_model.load(dict_type=2)

        self.qa_searcher = QAEntitySearcher()
        client = GraphClient(server_number=4)
        self.semanticSearchAccessor = SemanticSearchAccessor(client)
        self.defaultAccessor = DefaultGraphAccessor(client)
        self._logger = Logger("QAResultSearch").get_log()

    def semantic_search(self,
                        query_text,
                        each_np_candidate_entity_num=50,
                        sort_function=SORT_FUNCTION_SELECT_PART_ENTITY_LINK,
                        sentence_limit=20,
                        weight_context_sim=0.6,
                        weight_graph_sim=0.4):
        try:
            qa_info_manager = self.get_candidate_sentences(query_text,
                                                           each_np_candidate_entity_num=each_np_candidate_entity_num)

            # sentence_list=qa_info_manager.get_candidate_sentence_list()
            #
            # entity_for_qa_set
            # entity_for_qa_set.print_informat()
            # entity_list = entity_for_qa_set.get_entity_node_list()
            # chunk_to_related_entity_list_map = entity_for_qa_set.keyword_2_entitynodemap

            self._logger.info("entity_list =%d sentence_list=%d" % (
                qa_info_manager.get_entity_size(), qa_info_manager.get_sentence_size()))
            # for n in entity_list:
            #     print("entity", n)
            new_sentence_list = []
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_NOT_AVERAGE_GRAPH_VECTOR:
            #     new_sentence_list = self.sort_sentence_by_build_graph_vector_for_query_in_semantic_weight(query_text,
            #                                                                                               sentence_list=sentence_list,
            #                                                                                               entity_list=entity_list,
            #                                                                                               weight_context_sim=weight_context_sim,
            #                                                                                               weight_graph_sim=weight_graph_sim)
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_AVERAGE_VECTOR:
            #     new_sentence_list = self.sort_sentence_by_build_average_graph_vector_for_query(query_text,
            #                                                                                    sentence_list=sentence_list,
            #                                                                                    entity_list=entity_list,
            #                                                                                    weight_context_sim=weight_context_sim,
            #                                                                                    weight_graph_sim=weight_graph_sim
            #                                                                                    )
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_ENTITIES_BRIDGE:
            #     new_sentence_list = self.sort_sentence_by_entities_as_bridge(query_text,
            #                                                                  sentence_list=sentence_list,
            #                                                                  entity_list=entity_list,
            #                                                                  weight_context_sim=weight_context_sim,
            #                                                                  weight_graph_sim=weight_graph_sim)
            #
            # if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_AVERAGE_ENTITY_GRAPH_SIMILAR:
            #     new_sentence_list = self.sort_sentence_by_entities_for_graph_similarity_as_bridge(query_text,
            #                                                                                       sentence_list=sentence_list,
            #                                                                                       entity_list=entity_list,
            #                                                                                       weight_context_sim=weight_context_sim,
            #                                                                                       weight_graph_sim=weight_graph_sim)

            if sort_function == SentenceLevelSemanticSearch.SORT_FUNCTION_SELECT_PART_ENTITY_LINK:
                new_sentence_list = self.sort_sentence_by_select_part_entity_as_bridge(query_text,
                                                                                       qa_info_manager=qa_info_manager,
                                                                                       weight_context_sim=weight_context_sim,
                                                                                       weight_graph_sim=weight_graph_sim,
                                                                                       )


            result_list = qa_info_manager.fill_api_id_in_result_list(new_sentence_list[:sentence_limit])

            self._logger.info("result_list =%d " % len(result_list))

            return result_list
        except Exception:
            self._logger.exception("----qaexception----")
            traceback.print_exc()
            return []

    def get_candidate_sentences(self, query_text, each_np_candidate_entity_num=20):

        chunk_list = self.get_chunk_from_text(query_text)
        print("chunk num=%d %s" % (len(chunk_list), ",".join(chunk_list)))

        qa_info_manager = self.search_entity_by_fulltext(chunk_list, each_np_candidate_entity_num)
        qa_info_manager.start_create_node_info_collection()

        print("related entity for qa", qa_info_manager)

        entity_for_qa_list = qa_info_manager.get_all_entity_for_qa_list()
        print("entity_for_qa_list num=%d" % len(entity_for_qa_list))

        sentence_list = self.search_sentence_by_entity_for_qa_list(entity_for_qa_list)
        print("sentence_list num=%d" % len(sentence_list))
        qa_info_manager.add_sentence_node_list(sentence_list)

        return qa_info_manager

    def expand_the_chunk_by_words(self, final_chunk_list):
        final_set = []
        for chunk in final_chunk_list:
            final_set.append(chunk)
            for word in chunk.split(" "):
                final_set.append(word)
        print("word set", final_set)
        return list(set(final_set))
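    # Illustration of expand_the_chunk_by_words (hypothetical input): ["binary tree", "java collection"]
    # yields the phrases plus their individual words, e.g.
    # ["binary tree", "binary", "tree", "java collection", "java", "collection"],
    # in arbitrary order because duplicates are removed via set().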

    def get_chunk_from_text(self, text):

        final_chunk_list = self._entity_extractor.get_all_possible_key_word_from_text(text)

        return final_chunk_list

    def search_entity_by_fulltext(self, chunk_list, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)
        for chunk in chunk_list:
            related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_list)
            related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk, each_np_candidate_entity_num)
            qa_info_manager.add(chunk, related_entity_for_api)
        return qa_info_manager

    def search_all_entity_by_fulltext_by_half(self, chunk, each_np_candidate_entity_num=20):
        qa_info_manager = QACacheInfoManager(semanticSearchAccessor=self.semanticSearchAccessor,
                                             defaultSearchAccessor=self.defaultAccessor,
                                             kg_models=self.kg_models)

        related_entity_for_api = self.qa_searcher.search_related_entity_for_api(chunk, each_np_candidate_entity_num/2)
        qa_info_manager.add(chunk, related_entity_for_api)

        related_entity_list = self.qa_searcher.search_related_entity(chunk, each_np_candidate_entity_num/2)
        qa_info_manager.add(chunk, related_entity_list)

        return qa_info_manager

    def search_sentence_by_entity_for_qa_list(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.search_sentence_by_entity_list(entity_id_string_list=entity_id_string_list)

    def get_relation_by_nodes(self, node_list):
        return self.semanticSearchAccessor.get_nodes_relation(node_list)

    def sort_sentence_by_entities_as_bridge(self, question,
                                            sentence_list,
                                            entity_list,
                                            weight_context_sim=0.5,
                                            weight_graph_sim=0.5
                                            ):
        self._logger.info("run sort_sentence_by_entities_as_bridge get result=%d" % len(sentence_list))

        question_vec = self.kg_models.get_question_entity_vector(question)

        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)

        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_vec, entity_vec_list)
        qe_sim_np = qe_sim_np / qe_sim_np.sum()

        kg_context_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_vec_list,
                                                                                         sentence_vec_list)
        kg_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_graph_vec_list,
                                                                                       sentence_graph_vec_list)

        qs_context_sim = weight_context_sim * qe_sim_np * kg_context_sim
        qs_graph_sim = weight_graph_sim * qe_sim_np * kg_graph_sim

        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]

        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim

        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]

            })
        self._logger.info("run sort_sentence_by_entities_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)

        return result

    def sort_sentence_by_entities_for_graph_similarity_as_bridge(self, question,
                                                                 sentence_list,
                                                                 entity_list,
                                                                 weight_context_sim=0.5,
                                                                 weight_graph_sim=0.5
                                                                 ):
        self._logger.info(
            "run sort_sentence_by_entities_for_graph_similarity_as_bridge get result=%d" % len(sentence_list))

        question_context_vec = self.kg_models.get_question_entity_vector(question)

        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)

        qe_sim_np = np.ones((1, len(entity_list)))
        qe_sim_np = qe_sim_np / qe_sim_np.sum()
        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)

        kg_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(entity_graph_vec_list,
                                                                                       sentence_graph_vec_list)

        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qe_sim_np * kg_graph_sim

        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]

        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim

        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]

            })
        self._logger.info("run sort_sentence_by_entities_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)

        print("sorted result")
        for t in result:
            print("test sort", t)
        print(result[:100])

        return result

    def sort_sentence_by_build_average_graph_vector_for_query(self, question, sentence_list, entity_list,
                                                              weight_context_sim=0.5, weight_graph_sim=0.5
                                                              ):
        self._logger.info(
            "run sort_sentence_by_build_average_graph_vector_for_query get sentence_list=%d" % len(sentence_list))

        kg_models = self.kg_models
        question_context_vec = kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)

        entity_list, entity_vec_list, entity_graph_vec_list = self.remove_the_not_related_entity(entity_graph_vec_list,
                                                                                                 entity_list,
                                                                                                 entity_vec_list,
                                                                                                 question_context_vec)

        query_graph_vector = kg_models.get_question_graph_vector_by_average_all_entities(
            question=question,
            entity_graph_vec_list=entity_graph_vec_list)

        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)

        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim

        qs_graph_sim = weight_graph_sim * qs_graph_sim

        qs_sim = qs_context_sim + qs_graph_sim

        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]

        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim

        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "text": sentence["sentence_text"],
                "sentence_type": sentence["sentence_type_code"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]

            })
        self._logger.info("run sort_sentence_by_build_average_graph_vector_for_query get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)

        return result

    def sort_sentence_by_select_part_entity_as_bridge(self, question,
                                                      qa_info_manager,
                                                      # sentence_list,
                                                      # entity_list,
                                                      weight_context_sim=0.6,
                                                      weight_graph_sim=0.4,
                                                      # chunk_to_related_entity_list_map=None,
                                                      ):
        self._logger.info(
            "run sort part entity result=%d" % qa_info_manager.get_sentence_size())

        print("entity for node")
        qa_info_manager.print_entities()
        print("sentence for node")
        # qa_info_manager.print_sentences()

        entity_info_collection = qa_info_manager.get_entity_info_collection()
        sentence_info_collection = qa_info_manager.get_sentence_info_collection()
        entity_info_collection.init_vectors(self.kg_models)
        sentence_info_collection.init_vectors(self.kg_models)
        sentence_list = sentence_info_collection.get_entity_list()
        entity_vec_list = entity_info_collection.get_entity_context_list()
        entity_graph_vec_list = entity_info_collection.get_entity_graph_list()
        entity_list = entity_info_collection.get_entity_list()
        sentence_vec_list = sentence_info_collection.get_entity_context_list()
        sentence_graph_vec_list = sentence_info_collection.get_entity_graph_list()

        question_context_vec = self.kg_models.get_question_entity_vector(question)

        entity_list, entity_vec_list, entity_graph_vec_list = self.get_top_related_entity_info_list(
            question_context_vec=question_context_vec, qa_info_manager=qa_info_manager)

        # entity_list, entity_vec_list, entity_graph_vec_list = self.remove_the_not_related_entity_by_only_save_one_for_each(
        #     entity_graph_vec_list=entity_graph_vec_list, entity_vec_list=entity_vec_list, entity_list=entity_list,
        #     question_context_vec=question_context_vec,
        #     qa_info_manager=qa_info_manager
        #
        # )

        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)
        # todo:change to the average graph similarity
        # qs_graph_sim = self.get_graph_similarity_by_average_entity_graph_vector(entity_graph_vec_list, question,
        #                                                                         sentence_graph_vec_list)

        qs_graph_sim = self.get_query_to_sentence_graph_sim_by_select_top_enttity(entity_graph_vec_list, entity_list,
                                                                                  entity_vec_list,
                                                                                  sentence_graph_vec_list,
                                                                                  sentence_vec_list)

        qs_context_sim = weight_context_sim * qs_context_sim
        qs_graph_sim = weight_graph_sim * qs_graph_sim

        qs_sim = qs_context_sim + qs_graph_sim
        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]

        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim

        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "sentence_type": sentence["sentence_type_code"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]

            })
        self._logger.info("run sort_sentence_by_entities_as_bridge get result num=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)

        print(result[:100])

        return result

    def get_graph_similarity_by_average_entity_graph_vector(self, entity_graph_vec_list, question,
                                                            sentence_graph_vec_list):
        query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
            question=question,
            entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        return qs_graph_sim

    def get_graph_similarity_average_entity_graph_vector_similarity(self, entity_graph_vec_list, question,
                                                                    sentence_graph_vec_list):
        # query_graph_vector = self.kg_models.get_question_graph_vector_by_average_all_entities(
        #     question=question,
        #     entity_graph_vec_list=entity_graph_vec_list)
        qs_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_graph_vec_list,
                                                                                       entity_graph_vec_list)
        return np.mean(qs_graph_sim, axis=1)

    def get_query_to_sentence_graph_sim_by_select_top_enttity(self, entity_graph_vec_list, entity_list, entity_vec_list,
                                                              sentence_graph_vec_list, sentence_vec_list):
        # kg_se_graph_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(sentence_graph_vec_list,
        #                                                                                   entity_graph_vec_list,
        #                                                                                   )
        kg_se_context_sim = MatrixCalculation.compute_cossin_for_matrix_to_matrix_normalize(
            sentence_vec_list,
            entity_vec_list)
        # TODO
        # kg_se_sim = 0.5 * kg_se_graph_sim + 0.5 * kg_se_context_sim
        kg_se_sim = kg_se_context_sim

        print("final entity list", len(entity_list), entity_list)
        select_linking_entity_num = min(5, len(entity_list))
        onehot_maxsim_se_matrix = MatrixCalculation.get_most_similar_top_n_entity_as_matrix(
            top_n=select_linking_entity_num, s_e_similarity_matrix=kg_se_sim)
        s_query_graph_vec_matrix = onehot_maxsim_se_matrix * np.matrix(
            entity_graph_vec_list) / select_linking_entity_num
        qs_graph_sim = MatrixCalculation.compute_cossin_for_one_to_one_in_two_list_normalize(sentence_graph_vec_list,
                                                                                             s_query_graph_vec_matrix.getA())
        return qs_graph_sim

    def remove_the_not_related_entity_by_only_save_one_for_each(self, entity_graph_vec_list, entity_list,
                                                                entity_vec_list, question_context_vec,
                                                                qa_info_manager):
        chunk_to_related_entity_list_map = qa_info_manager.keyword_2_entitynodemap
        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                 entity_vec_list)

        entity_info_sumary_list = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_np.getA()[0], entity_vec_list,
                                                               entity_graph_vec_list):
            print("after first removing sim=", sim, "entity=", entity)
            entity_info_sumary_list.append({"entity": entity,
                                            "sim": sim,
                                            "entity_vec": entity_vec,
                                            "entity_graph_vec": entity_graph_vec
                                            })

        entity_info_sumary_list.sort(key=lambda k: (k.get('sim', 0)), reverse=True)

        valid_word_set = set([])
        word_to_related_entity_list_map = {}

        for chunk, related_entity_list in chunk_to_related_entity_list_map.items():
            word = chunk
            if word not in valid_word_set:
                valid_word_set.add(word)
                word_to_related_entity_list_map[word] = related_entity_list
            else:
                word_to_related_entity_list_map[word].extend(related_entity_list)

        # clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_similarity(entity_info_sumary_list,
        #                                                                                word_to_related_entity_list_map)
        #
        clean_entity_info_list = self.get_clean_entity_for_each_word_by_max_n_similarity(entity_info_sumary_list,
                                                                                         word_to_related_entity_list_map)

        new_entity_list = []
        new_entity_graph_vec_list = []
        new_entity_vec_list = []
        for entity_info_sumary in clean_entity_info_list:
            new_entity_list.append(entity_info_sumary["entity"])
            new_entity_graph_vec_list.append(entity_info_sumary["entity_graph_vec"])
            new_entity_vec_list.append(entity_info_sumary["entity_vec"])
            print("final save sim=", entity_info_sumary["sim"], "entity=", entity_info_sumary["entity"])

        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def get_top_related_entity_info_list(self, question_context_vec,
                                         qa_info_manager):

        node_info_collection = qa_info_manager.get_node_info_collection()
        node_info_collection.fill_each_entity_with_similary_to_question(question_context_vec)
        node_info_collection.sort_by_qe_sim()

        # selected_entity_info_list = qa_info_manager.get_top_node_info_by_each_keywords_three_different_type()
        selected_entity_info_list = qa_info_manager.get_top_node_info_by_each_keywords()

        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []
        for node_info in selected_entity_info_list:
            new_entity_list.append(node_info.entity_node)
            new_entity_vec_list.append(node_info.entity_context_vec)
            new_entity_graph_vec_list.append(node_info.entity_graph_vec)

        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def get_clean_entity_for_each_word_by_max_n_similarity(self, entity_info_sumary_list,
                                                           word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)

            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list, 3)

            # for entity_info in entity_info_list:
            print("get candidate for word=", valid_word, entity_info_list)
            word_name_entity_mark[valid_word] = entity_info_list

            clean_entity_info_list = []
            clean_entity_kg_id_list = set([])

            for word, entity_info_list in word_name_entity_mark.items():
                for entity_info in entity_info_list:
                    kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                    if kg_id not in clean_entity_kg_id_list:
                        clean_entity_info_list.append(entity_info)
                        clean_entity_kg_id_list.add(kg_id)
                        print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def get_clean_entity_for_each_word_by_max_similarity(self, entity_info_sumary_list,
                                                         word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)

            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list)

            for entity_info in entity_info_list:
                print("get candidate for word=", valid_word, entity_info["entity"])
                word_name_entity_mark[valid_word] = entity_info

            clean_entity_info_list = []
            clean_entity_kg_id_list = set([])

            for word, entity_info in word_name_entity_mark.items():
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                if kg_id not in clean_entity_kg_id_list:
                    clean_entity_info_list.append(entity_info)
                    clean_entity_kg_id_list.add(kg_id)
                    print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def get_clean_entity_infi_sumary_list(self, entity_info_sumary_list, word_to_related_entity_list_map):
        clean_entity_kg_id_list = set([])
        print("start get_clean_entity_infi_sumary_list ")
        word_name_entity_mark = {}
        for valid_word, related_entity_list in word_to_related_entity_list_map.items():
            print("valid word=", valid_word)

            entity_info_list = self.get_first_from_entity_info_sumary_list_and_in_related_entity_list(
                entity_info_sumary_list, related_entity_list)

            for entity_info in entity_info_list:
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                print("get candidate for word=", valid_word, entity_info["entity"])

                if kg_id not in clean_entity_kg_id_list:
                    if valid_word not in word_name_entity_mark.keys():
                        word_name_entity_mark[valid_word] = entity_info
                    else:
                        old_entity_info = word_name_entity_mark[valid_word]
                        if entity_info["sim"] > old_entity_info["sim"]:
                            word_name_entity_mark[valid_word] = entity_info

                    for seperate_name in valid_word.split(" "):
                        if seperate_name not in word_name_entity_mark.keys():
                            word_name_entity_mark[seperate_name] = entity_info
                        else:
                            old_entity_info = word_name_entity_mark[seperate_name]
                            if entity_info["sim"] > old_entity_info["sim"]:
                                word_name_entity_mark[seperate_name] = entity_info

                clean_entity_kg_id_list.add(kg_id)

            clean_entity_info_list = []
            clean_entity_kg_id_list = set([])

            for word, entity_info in word_name_entity_mark.items():
                kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
                if kg_id not in clean_entity_kg_id_list:
                    clean_entity_info_list.append(entity_info)
                    clean_entity_kg_id_list.add(kg_id)
                    print("valid word=", word, entity_info["entity"])
        return clean_entity_info_list

    def remove_the_not_related_entity(self, entity_graph_vec_list, entity_list, entity_vec_list, question_context_vec):

        qe_sim_np = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                 entity_vec_list)
        print("qeustion to entity similary")

        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []
        qe_sim_clean = []
        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_np.getA()[0], entity_vec_list,
                                                               entity_graph_vec_list):
            print("sim=", sim, "entity=", entity)
            if sim > MIN_RELATED_ENTITY_SIMILARITY:
                print("adding ", entity)
                new_entity_list.append(entity)
                new_entity_vec_list.append(entity_vec)
                new_entity_graph_vec_list.append(entity_graph_vec)
                qe_sim_clean.append(sim)

        entity_list = new_entity_list
        entity_vec_list = new_entity_vec_list
        entity_graph_vec_list = new_entity_graph_vec_list

        new_entity_list = []
        new_entity_vec_list = []
        new_entity_graph_vec_list = []

        entity_info_sumary_list = []

        for (entity, sim, entity_vec, entity_graph_vec) in zip(entity_list, qe_sim_clean, entity_vec_list,
                                                               entity_graph_vec_list):
            print("after first removing sim=", sim, "entity=", entity)
            entity_info_sumary_list.append({"entity": entity,
                                            "sim": sim,
                                            "entity_vec": entity_vec,
                                            "entity_graph_vec": entity_graph_vec
                                            })

        entity_info_sumary_list.sort(key=lambda k: (k.get('sim', 0)), reverse=True)

        api_class_name_set = set([])

        new_entity_info_sumary_list = []
        for entity_info_sumary in entity_info_sumary_list:
            if entity_info_sumary["entity"].has_label("api"):
                qualified_name = entity_info_sumary["entity"]["qualified_name"]
                if qualified_name in api_class_name_set:
                    continue
                if "(" in qualified_name:
                    simple_name = qualified_name.split("(")[0]

                    class_name = ".".join(simple_name.split(".")[:-1])
                    if class_name in api_class_name_set:
                        continue
                    else:
                        api_class_name_set.add(class_name)
                        new_entity_info_sumary_list.append(entity_info_sumary)
                else:
                    api_class_name_set.add(qualified_name)
                    new_entity_info_sumary_list.append(entity_info_sumary)
            else:
                new_entity_info_sumary_list.append(entity_info_sumary)
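        # Illustration of the api dedup above with hypothetical qualified names:
        # "java.util.List.add(int,Object)" -> simple_name "java.util.List.add" -> class_name "java.util.List".
        # Only the highest-similarity method per api class is kept (the list is sorted by sim),
        # and once a class name is in api_class_name_set its later methods are skipped.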

        for entity_info_sumary in new_entity_info_sumary_list:
            new_entity_list.append(entity_info_sumary["entity"])
            new_entity_graph_vec_list.append(entity_info_sumary["entity_graph_vec"])
            new_entity_vec_list.append(entity_info_sumary["entity_vec"])
            print("final save sim=", entity_info_sumary["sim"], "entity=", entity_info_sumary["entity"])

        return new_entity_list, new_entity_vec_list, new_entity_graph_vec_list

    def sort_sentence_by_build_graph_vector_for_query_in_semantic_weight(self, question, sentence_list, entity_list,
                                                                         weight_context_sim=0.5, weight_graph_sim=0.5
                                                                         ):

        self._logger.info(
            "run sort_sentence_by_build_graph_vector_for_query_in_semantic_weight get sentence_list=%d" % len(
                sentence_list))

        kg_models = self.kg_models
        question_context_vec = kg_models.get_question_entity_vector(question)
        entity_vec_list, entity_graph_vec_list = self.kg_models.get_vectors_for_entity_list(entity_list)
        sentence_vec_list, sentence_graph_vec_list = self.kg_models.get_vectors_for_entity_list(sentence_list)

        query_graph_vector = kg_models.get_question_graph_vector_by_semantic_weight_all_entities(
            question_context_vec=question_context_vec,
            entity_context_vec_list=entity_vec_list,
            entity_graph_vec_list=entity_graph_vec_list)

        qs_context_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(question_context_vec,
                                                                                      sentence_vec_list)

        qs_graph_sim = MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize(query_graph_vector,
                                                                                    sentence_graph_vec_list)
        qs_context_sim = weight_context_sim * qs_context_sim

        qs_graph_sim = weight_graph_sim * qs_graph_sim

        qs_sim = qs_context_sim + qs_graph_sim

        qs_sim = qs_sim.tolist()[0]
        qs_context_sim = qs_context_sim.tolist()[0]
        qs_graph_sim = qs_graph_sim.tolist()[0]

        for sum_sim, sentence, context_sim, graph_sim in zip(qs_sim, sentence_list, qs_context_sim, qs_graph_sim):
            sentence["qs_sim"] = sum_sim
            sentence["qs_context_sim"] = context_sim
            sentence["qs_graph_sim"] = graph_sim

        result = []
        for sentence in sentence_list:
            result.append({
                "kg_id": self.defaultAccessor.get_id_for_node(sentence),
                "sentence_id": sentence["sentence_id"],
                "text": sentence["sentence_text"],
                "qs_sim": sentence["qs_sim"],
                "qs_context_sim": sentence["qs_context_sim"],
                "qs_graph_sim": sentence["qs_graph_sim"]
            })
        self._logger.info(
            "run sort_sentence_by_build_graph_vector_for_query_in_semantic_weight get result=%d" % len(result))
        result.sort(key=lambda k: (k.get('qs_sim', 0)), reverse=True)

        return result

    def get_all_entity(self, entity_for_qa_list):
        entity_id_string_list = [str(entity_for_qa.kg_id) for entity_for_qa in entity_for_qa_list]
        entity_id_string_list = list(set(entity_id_string_list))
        return self.semanticSearchAccessor.get_all_entity(entity_id_string_list=entity_id_string_list)

    def get_first_from_entity_info_sumary_list_and_in_related_entity_list(self, entity_info_sumary_list,
                                                                          related_entity_list, top_relate_entity_num=1):
        return_result_list = []
        for entity_info in entity_info_sumary_list:
            kg_id = self.defaultAccessor.get_id_for_node(entity_info["entity"])
            entity = self.get_entity_from_entity_list_by_kgid(kg_id, related_entity_list)
            if entity is not None:
                return_result_list.append(entity_info)
                if len(return_result_list) >= top_relate_entity_num:
                    return return_result_list
        return []

    def get_entity_from_entity_list_by_kgid(self, kg_id, related_entity_list):
        for related_entity in related_entity_list:
            if related_entity.kg_id == kg_id:
                return related_entity
        return None
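The ranking above boils down to cosine similarity between one query vector and a matrix of sentence vectors, combined with fixed weights. Below is a minimal, self-contained sketch of that combination using plain numpy; cosine_vec_to_matrix is a hypothetical stand-in for MatrixCalculation.compute_cossin_for_vec_to_matrix_normalize, and the toy vectors replace the word2vec / graph-vector model outputs.

import numpy as np


def cosine_vec_to_matrix(vec, matrix):
    # Cosine similarity of one vector against every row of a matrix
    # (hypothetical stand-in for the MatrixCalculation helper used above).
    vec = np.asarray(vec, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    vec_norm = np.linalg.norm(vec) or 1.0
    row_norms = np.linalg.norm(matrix, axis=1)
    row_norms[row_norms == 0] = 1.0
    return matrix.dot(vec) / (row_norms * vec_norm)


# Toy vectors; the real code derives these from the entity/sentence models.
weight_context_sim, weight_graph_sim = 0.5, 0.5
qs_context_sim = cosine_vec_to_matrix([1.0, 0.0], [[1.0, 0.0], [0.5, 0.5]])
qs_graph_sim = cosine_vec_to_matrix([0.0, 1.0], [[1.0, 0.0], [0.5, 0.5]])
qs_sim = weight_context_sim * qs_context_sim + weight_graph_sim * qs_graph_sim
print(qs_sim)  # the sentence with the higher combined score ranks first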
Ejemplo n.º 17
0
import logging
import sys

from flask import Flask
from flask_cors import CORS

from shared.logger_util import Logger
from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from skgraph.graph.label_util import LabelUtil
from skgraph.graph.node_cleaner import GraphJsonParser
# Other project-specific imports (EngineFactory, SQLAlchemyHandler, APIEntityLinking,
# QuestionAnswerSystem, SOPostSearcher, APISearcher) are omitted in this excerpt.

reload(sys)
sys.setdefaultencoding("utf-8")

app = Flask(__name__)
CORS(app)
db_handler = SQLAlchemyHandler()
db_handler.setLevel(logging.WARN)  # Only serious messages
app.logger.addHandler(db_handler)

logger = Logger("neo4jServer").get_log()
logger.info("create logger")

graphClient = DefaultGraphAccessor(GraphClient(server_number=1))
logger.info("create graphClient")

api_entity_linker = APIEntityLinking()
logger.info("create api_entity_linker object")

questionAnswerSystem = QuestionAnswerSystem()
logger.info("create questionAnswerSystem")

dbSOPostSearcher = SOPostSearcher(EngineFactory.create_so_session(),
                                  logger=app.logger)
logger.info("create SO POST Searcher")

api_entity_session = EngineFactory.create_session(autocommit=True)
apiSearcher = APISearcher(session=api_entity_session, logger=app.logger)
logger.info("create API Searcher")
from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient

client = DefaultGraphAccessor(GraphClient(server_number=1))
relation_type_list = client.get_all_relation_type()
for relation_name in relation_type_list:
    client.create_relation_node(relation_name=relation_name)
label_list = client.get_all_label_list()
remove_labels = ["wall", "wikidata", "wd_property", "relation", "schema"]
for label in label_list:
    if label not in remove_labels:
        node = client.find_a_node_by_label(label)
        if node is not None:
            for property_name in node.keys():
                client.create_relation_node(relation_name=property_name)
Ejemplo n.º 19
0
import codecs
import json

from py2neo import Relationship

from skgraph.graph.accessor.graph_accessor import GraphClient, DefaultGraphAccessor
from skgraph.graph.accessor.graph_client_for_awesome import AwesomeGraphAccessor
from skgraph.graph.accessor.graph_client_for_wikipedia import WikipediaGraphAccessor
from shared.logger_util import Logger

_logger = Logger("AwesomeImporter").get_log()

awesomeGraphAccessor = AwesomeGraphAccessor(GraphClient(server_number=0))
wikipediaGraphAccessor = WikipediaGraphAccessor(awesomeGraphAccessor)
defaultGraphAccessor = DefaultGraphAccessor(awesomeGraphAccessor)

baseGraphClient = awesomeGraphAccessor.graph

file_name = "awesome_item_category_related_to_wikipedia_relation_list.json"
with codecs.open(file_name, 'r', 'utf-8') as f:
    relation_list = json.load(f)
for tag_relation in relation_list:
    start_entity_name = tag_relation["start_entity_name"]
    relation = tag_relation["relation"]
    end_url = tag_relation["end_url"]
    start_node = awesomeGraphAccessor.find_awesome_cate_by_name(
        start_entity_name)
    end_node = defaultGraphAccessor.get_node_by_wikipedia_link(end_url)
    if end_node is None:
        end_node = wikipediaGraphAccessor.create_wikipedia_item_entity_by_url(
            end_url)
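The loop above stops after creating the missing wikipedia node; the excerpt does not show the relation being written back. A plausible completion of the loop body, assuming the same py2neo Relationship-then-merge pattern used in the other examples (not the confirmed original code):

    # Hypothetical completion: link the awesome category to the wikipedia entity
    # once both endpoints exist.
    if start_node is not None and end_node is not None:
        baseGraphClient.merge(Relationship(start_node, relation, end_node))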
 def setUp(self):
     self.graphClient = DefaultGraphAccessor(GraphClient())
     self.nodeCleaner = NodeCleaner()
Ejemplo n.º 21
0
import gevent

from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor


class SearchUtil:
    def __init__(self, graph_client, api_searcher):
        self.graph_accessor = DefaultGraphAccessor(graph_client)
        self.api_searcher = api_searcher

    def search(self, keywords, top_number):
        result_node_list = []
        jobs = []

        api_db_search_job = gevent.spawn(
            self.api_searcher.search_api_entity_with_order, keywords,
            top_number)
        jobs.append(api_db_search_job)
        graph_search_job = gevent.spawn(
            self.graph_accessor.search_nodes_by_name_in_list, keywords,
            top_number)
        jobs.append(graph_search_job)

        gevent.joinall(jobs, timeout=2000)
        api_entity_list = api_db_search_job.value

        api_id_list = []
        for api_entity in api_entity_list:
            api_id_list.append(api_entity.id)
        api_node_list = self.graph_accessor.get_api_entity_map_to_node(
            api_id_list)
        for api_node in api_node_list:
            if api_node not in result_node_list:
                result_node_list.append(api_node)

        graph_node_result_list = graph_search_job.value
        for graph_node in graph_node_result_list:
            if graph_node not in result_node_list:
                result_node_list.append(graph_node)
        # TODO: change the node search to a more general approach; for example, a dedicated scorer is needed
        node_score = {}
        for node in result_node_list:
            node_id = self.graph_accessor.get_id_for_node(node)
            node_score[node_id] = 0
        for node in result_node_list:
            node_id = self.graph_accessor.get_id_for_node(node)
            if node in api_node_list and node in graph_node_result_list:
                node_score[node_id] = node_score[node_id] + 10
            if node.has_label("extended knowledge"):
                node_score[node_id] = node_score[node_id] - 3
            if node.has_label("java class") or node.has_label("wikidata"):
                node_score[node_id] = node_score[node_id] + 1
            if node.has_label("java constructor"):
                node_score[node_id] = node_score[node_id] - 1

        left_nodes = []
        for node in result_node_list:
            left_nodes.append(node)

        sorted_node_list = []
        while len(left_nodes) > 0:
            max_score = -10000
            max_node = None
            for node in left_nodes:
                node_id = self.graph_accessor.get_id_for_node(node)
                if node_score[node_id] > max_score:
                    max_score = node_score[node_id]
                    max_node = node
            sorted_node_list.append(max_node)
            left_nodes.remove(max_node)

        return sorted_node_list[:top_number]
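A short usage sketch for SearchUtil. The wiring mirrors the earlier Flask example, but the server number, session settings, and query string here are illustrative assumptions only:

# Hypothetical wiring; mirrors how the Flask example above builds its searchers.
graph_client = GraphClient(server_number=1)
api_searcher = APISearcher(session=EngineFactory.create_session(), logger=logger)
search_util = SearchUtil(graph_client, api_searcher)

top_nodes = search_util.search("StringBuffer", top_number=10)
for node in top_nodes:
    print(search_util.graph_accessor.get_id_for_node(node), node["name"])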
Ejemplo n.º 22
0
# -*- coding:utf8 -*-
import sys
import nltk
from skgraph.graph.accessor.graph_accessor import GraphClient
from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor

reload(sys)
sys.setdefaultencoding('utf8')

graph_client = GraphClient(server_number=0)
graph_accessor = DefaultGraphAccessor(graph_client)
'''
Merge nodes and relations that carry both the "extended knowledge" and "entity"
labels and share the same name.
'''


class DuplicateCleaner:
    def merge_node_with_same_name(self):
        print 'begin merge node with same name'
        query = "match(a:`extended knowledge`:entity) return id(a) as id,a.name as name"
        try:
            nodeList = []
            result = graph_accessor.graph.run(query)
            for n in result:
                node = (n['id'], n['name'])
                nodeList.append(node)
        except Exception:
            return []
        nodeDict = {}
        a = 1
        for n in nodeList:
import threading
from Queue import Queue

from py2neo import Node

from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from skgraph.graph.accessor.graph_client_for_rwr import RandomWalkGraphAccessor
from skgraph.graph.operation.generateDataPpi import DataCreateUtil
from skgraph.graph.operation.random_walk_restart import Walker

log_file = open('log_single.txt', 'w')
input_file_name = 'data.ppi'
node_id = 53157
client_1 = DefaultGraphAccessor(GraphClient(server_number=1))
client_2 = DefaultGraphAccessor(GraphClient(server_number=2))
data_util = DataCreateUtil(client_1)
end_status = 0


class Producer(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.data = queue

    def run(self):
        print 'begin produce'
        log_file.write('begin %d\n' % node_id)
        log_file.flush()
        if client_1.find_node_by_id(node_id) is None:
            return 0
        if data_util.createData(node_id) == 1:
def awesome_item_rename_duplicate(awesome_graph_accessor, node_collection):
    node_list = node_collection.get_all_nodes(1000, ['awesome item'])
    print len(node_list)

    node_map = construct_key_count_map(node_list)

    i = 0
    for key in node_map.keys():
        if len(node_map[key]) > 1:
            print key, " ", len(node_map[key])
            for each in node_map[key]:
                print each
            i += 1
    print i

    node_list_after_step1 = []
    for key in node_map.keys():
        if len(node_map[key]) > 1:
            step1_node_list = node_map[key]
            for node in step1_node_list:
                if "url" in dict(node):
                    url = node["url"]
                    if "//github.com/" in url:
                        # print url, " ", type(url)
                        github_name = get_name_by_github_url(url)
                        if github_name != "" and github_name.lower() != key.lower():
                            node["name"] = github_name
                        node_list_after_step1.append(node)
                    else:
                        node_list_after_step1.append(node)
                else:
                    node_list_after_step1.append(node)

    for node in node_list_after_step1:
        print node
        awesome_graph_accessor.push_node(node)

    node_map_after_step1 = construct_key_count_map(node_list_after_step1)

    # i = 0
    # for key in node_map_after_step1.keys():
    #     if len(node_map_after_step1[key]) > 1:
    #         print key, " ", len(node_map_after_step1[key]), " ", node_map_after_step1[key]
    #         i += 1
    # print i

    # with open("node_map_after_step1.txt", 'w') as f:
    #     for key in node_map_after_step1.keys():
    #         if len(node_map_after_step1[key]) > 1:
    #             nodes_str = key + " " + str(len(node_map_after_step1[key])) + " " + str(node_map_after_step1[key])
    #             f.write(nodes_str + "\n")

    node_list_after_step2 = []
    for key in node_map_after_step1.keys():
        step2_node_list = node_map_after_step1[key]
        for i in range(0, len(step2_node_list) - 1):
            for j in range(i + 1, len(step2_node_list)):
                if "url" in step2_node_list[i]:
                    url1 = step2_node_list[i]["url"]
                else:
                    url1 = ""

                if "url" in step2_node_list[j]:
                    url2 = step2_node_list[j]["url"]
                else:
                    url2 = ""

                if "description" in step2_node_list[i]:
                    description1 = step2_node_list[i]["description"]
                else:
                    description1 = ""

                if "description" in step2_node_list[j]:
                    description2 = step2_node_list[j]["description"]
                else:
                    description2 = ""

                if description1 != "" and description2 != "":
                    desc_sim = description_similarity(description1, description2)
                    if desc_sim >= 0.7:
                        step2_node_list[i].setdefault("duplicate", 1)
                        step2_node_list[j].setdefault("duplicate", 1)
                else:
                    url_sim = 0
                    if url1 != "" and url2 != "":
                        url_sim = url_similarity(url1, url2)
                    if url_sim >= 0.8:
                        step2_node_list[i].setdefault("duplicate", 2)
                        step2_node_list[j].setdefault("duplicate", 2)
                    else:
                        node1 = awesome_graph_accessor.find_start_by_relation_type_and_end_url("collect", url1)
                        node2 = awesome_graph_accessor.find_start_by_relation_type_and_end_url("collect", url2)
                        if node1 is not None and node2 is not None:
                            node_id1 = GraphAccessor.get_id_for_node(node1)
                            node_id2 = GraphAccessor.get_id_for_node(node2)
                            if node_id1 == node_id2:
                                step2_node_list[i].setdefault("duplicate", 3)
                                step2_node_list[j].setdefault("duplicate", 3)

        for each in step2_node_list:
            node_list_after_step2.append(each)

    node_map_after_step2 = construct_key_count_map(node_list_after_step2)
    # i = 0
    # for key in node_map_after_step2.keys():
    #     if len(node_map_after_step2[key]) > 1 and property_in_dict_list("duplicate", node_map_after_step2[key]) is True:
    #         print key, " ", len(node_map_after_step2[key]), " ", node_map_after_step2[key]
    #         i += 1
    # print i

    pending_map = {}
    duplicate_id_list = []
    for key in node_map_after_step2.keys():
        temp_list = []
        temp_id_list = []
        if len(node_map_after_step2[key]) > 1 and property_in_dict_list("duplicate", node_map_after_step2[key]) is True:
            for each in node_map_after_step2[key]:
                if "duplicate" in dict(each):
                    temp_list.append(each)
                    temp_id_list.append(DefaultGraphAccessor.get_id_for_node(each))
            pending_map.setdefault(key, temp_list)
            duplicate_id_list.append(temp_id_list)

    for key in pending_map.keys():
        pending_list = rename_property(pending_map[key])
        for each in pending_list:
            print each
            awesome_graph_accessor.push_node(each)
    print len(pending_map)

    with open("duplicate_id_list.txt", 'w') as f:
        f.write(str(duplicate_id_list))
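description_similarity and url_similarity are referenced above but not defined in this excerpt. A minimal sketch of what such helpers could look like, assuming simple token-overlap (Jaccard) measures; the project's actual implementations may differ:

def _jaccard(tokens_a, tokens_b):
    # Jaccard overlap of two token sets; returns 0.0 when both are empty.
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / float(len(set_a | set_b))


def description_similarity(description1, description2):
    # Hypothetical: word-level overlap between two descriptions.
    return _jaccard(description1.lower().split(), description2.lower().split())


def url_similarity(url1, url2):
    # Hypothetical: overlap of path segments, ignoring the scheme.
    parts1 = url1.lower().split("//")[-1].strip("/").split("/")
    parts2 = url2.lower().split("//")[-1].strip("/").split("/")
    return _jaccard(parts1, parts2)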
Ejemplo n.º 25
0
 def setUp(self):
     self.graphClient = DefaultGraphAccessor(GraphClient())
     self.filter = NodeRelationFilter()
Ejemplo n.º 26
0
import threading
import time
from Queue import Queue

from py2neo import Node

from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from skgraph.graph.accessor.graph_client_for_rwr import RandomWalkGraphAccessor
from skgraph.graph.operation.generateDataPpi import DataCreateUtil
from skgraph.graph.operation.random_walk_restart import Walker

log_file = open('log.txt', 'w')
exception_file = open('exception.txt', 'a+')
input_file_name = 'data.ppi'
begin_id = 92354
client_1 = DefaultGraphAccessor(GraphClient(server_number=1))
client_2 = DefaultGraphAccessor(GraphClient(server_number=2))
data_util = DataCreateUtil(client_1)
max_id = client_1.get_max_id_for_node()
end_status = 0
print_status = 0


class Producer(threading.Thread):
    def __init__(self, queue, queue_length):
        threading.Thread.__init__(self)
        self.data = queue
        self.queue_length = queue_length

    def run(self):
        print 'begin produce'
Ejemplo n.º 27
0
from py2neo import Relationship

from skgraph.graph.accessor.graph_accessor import DefaultGraphAccessor, GraphClient
from graph_operation import GraphOperation

graphClient = DefaultGraphAccessor(GraphClient(server_number=1))


class JavaReturnValueTypeLinkerOperation(GraphOperation):
    name = 'JavaReturnValueTypeLinkerOperation'

    def operate(self, node):
        return_value_type = node["value type"]
        if return_value_type:
            type_node = graphClient.find_one_by_alias_name_property(
                "java class", return_value_type)
            if type_node is not None:
                relation = Relationship(node, "type of", type_node)
                graphClient.merge(relation)
        return node, node
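A hedged usage sketch for the operation above: it is assumed to be applied to java method nodes whose "value type" property names a class, linking each method to the corresponding java class node. The node id below is illustrative only:

# Hypothetical driver; the real pipeline presumably iterates over all method nodes.
operation = JavaReturnValueTypeLinkerOperation()
method_node = graphClient.find_node_by_id(93008)  # illustrative id
if method_node is not None:
    operation.operate(method_node)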
 def test_rename_property(self):
     self.graphClient = DefaultGraphAccessor(GraphClient(server_number=0))
     node_list = self.graphClient.find_by_name_property("awesome item", "acl9")
     result = rename_property(node_list)
     print result
Ejemplo n.º 29
0
 def __init__(self, graph_client, api_searcher, api_semantic_search):
     self.graph_accessor = DefaultGraphAccessor(graph_client)
     self.api_searcher = api_searcher
     self.api_semantic_search = api_semantic_search
class TestGraphClient(TestCase):
    graphClient = None

    def setUp(self):
        self.graphClient = DefaultGraphAccessor(GraphClient())
        self.nodeCleaner = NodeCleaner()

    def test_get_max_id_for_node(self):
        self.assertEqual(self.graphClient.get_max_id_for_node(), 697753)

    def test_get_adjacent_node_id_list(self):
        self.assertEqual(self.graphClient.get_adjacent_node_id_list(66666666),
                         [])

        correct = [64289, 52628, 62565]

        self.assertEqual(self.graphClient.get_adjacent_node_id_list(7899),
                         correct)

    def test_get_node_name_by_id(self):
        self.assertEqual(self.graphClient.get_node_name_by_id(66666666), None)
        self.assertEqual(self.graphClient.get_node_name_by_id(3444),
                         "Adobe Device Central")

    def test_expand_node_for_directly_adjacent_nodes_to_subgraph(self):
        # self.assertEqual(self.graphClient.expand_node_for_adjacent_nodes_to_subgraph(3444),
        #                  "Adobe Device Central")
        pass

    def test_find_by_alias_name_property_exactly_match_from_label_limit_one(
            self):
        self.assertEqual(
            self.graphClient.find_one_by_alias_name_property(
                "entity", "Adobe Device Central"), None)
        interface = self.graphClient.find_one_by_alias_name_property(
            "api", "Interface PrintGraphics")
        self.assertEqual(93008, self.graphClient.get_id_for_node(interface))

    def test_find_by_alias_name_property(self):
        self.assertEqual(
            self.graphClient.find_by_alias_name_property(
                "entity", "Adobe Device Central"), [])
        interfaces = self.graphClient.find_by_alias_name_property(
            "api", "Interface PrintGraphics")
        self.assertEqual(len(interfaces), 1)
        self.assertEqual(93008,
                         self.graphClient.get_id_for_node(interfaces[0]))

    def test_get_relation_by_relation_id(self):
        relation = self.graphClient.get_relation_by_relation_id(470129)
        self.assertIsNone(relation)

        relation = self.graphClient.get_relation_by_relation_id(122211)

        self.assertEqual(122211, self.graphClient.get_id_for_node(relation))
        self.assertEqual(
            91, self.graphClient.get_id_for_node(relation.start_node()))
        self.assertEqual(29390,
                         self.graphClient.get_id_for_node(relation.end_node()))

        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(
            246029, 246030)
        relations_json = []
        for r in subgraph.relationships():
            relation_json = {
                "id": self.graphClient.get_id_for_node(r),
                "name": r.type(),
                "start_id": self.graphClient.get_start_id_for_relation(r),
                "end_id": self.graphClient.get_end_id_for_relation(r)
            }
            relations_json.append(relation_json)
            print relation_json
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(
            246029, 246033)
        self.assertEqual(subgraph, None)

    def test_find_node_by_id(self):
        node = self.graphClient.find_node_by_id(5444)
        self.assertEqual(5444, self.graphClient.get_id_for_node(node))

    def test_search_nodes_by_name(self):
        nodes = self.graphClient.search_nodes_by_name("java")
        count = 0
        for n in nodes:
            count = count + 1
        self.assertEqual(10, count)

        nodes = self.graphClient.search_nodes_by_name("String buffer()")

        count = 0
        for n in nodes:
            count = count + 1
        self.assertEqual(10, count)

    def test_search_nodes_by_name_in_subgraph(self):
        subgraph = self.graphClient.search_nodes_by_name_in_subgraph("java")
        count = 0
        for n in subgraph.nodes():
            count = count + 1
        self.assertEqual(10, count)

        subgraph = self.graphClient.search_nodes_by_name_in_subgraph(
            "String buffer()")
        count = 0
        if subgraph is not None:
            for n in subgraph.nodes():
                count = count + 1
        self.assertEqual(10, count)

    def test_get_relations_between_two_nodes_in_subgraph(self):
        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(
            48, 3600)
        self.assertEqual(None, subgraph)

        subgraph = self.graphClient.get_relations_between_two_nodes_in_subgraph(
            48, 3643)

        self.assertEqual(2, len(subgraph.nodes()))
        self.assertEqual(1, len(subgraph.relationships()))

    def test_get_relations_between_two_nodes(self):
        record_list = self.graphClient.get_relations_between_two_nodes(
            48, 3600)
        count = 0
        for n in record_list:
            count = count + 1
        self.assertEqual(0, count)

        record_list = self.graphClient.get_relations_between_two_nodes(
            48, 3643)

        count = 0
        for n in record_list:
            count = count + 1

        self.assertEqual(1, count)

    def test_cleaner(self):
        node = self.graphClient.find_node_by_id(444)
        self.assertEqual(self.nodeCleaner.get_clean_node_name(node),
                         "fake news")

        node = self.graphClient.find_node_by_id(4444)
        self.assertEqual(self.nodeCleaner.get_clean_node_name(node), "")

        self.assertEqual(self.graphClient.get_id_for_node(Node("lll", a=3)),
                         -1)
        self.assertEqual(self.graphClient.get_id_for_node(node), 4444)

    def test_get_shortest_path_to_name(self):
        name = self.graphClient.get_node_name_by_id(8000)
        subgraph = self.graphClient.get_shortest_path_to_name_in_subgraph(
            444, name)
        print subgraph

    def test_get_shortest_path(self):
        record_list = self.graphClient.get_shortest_path(444,
                                                         8000,
                                                         max_degree=2)
        self.assertEqual(0, count_record_list(record_list))

        record_list = self.graphClient.get_shortest_path(444, 8000)
        self.assertNotEqual(None, record_list)
        self.assertEqual(1, count_record_list(record_list))

        subgraph = self.graphClient.get_shortest_path_in_subgraph(444,
                                                                  8000,
                                                                  max_degree=2)
        self.assertEqual(None, subgraph)

        subgraph = self.graphClient.get_shortest_path_in_subgraph(444,
                                                                  8000,
                                                                  max_degree=6)
        self.assertNotEqual(None, subgraph)
        self.assertEqual(len(subgraph.nodes()), 7)
        self.assertEqual(len(subgraph.relationships()), 6)
        print subgraph

    def test_get_newest_nodes(self):
        node_list = self.graphClient.get_newest_nodes(10)
        self.assertEqual(10, len(node_list))
        print(node_list)
        graphJsonParser = GraphJsonParser()
        returns = graphJsonParser.parse_node_list_to_json(node_list)
        print(returns)