def start_fix_duplicate_domain_entity():
    session = EngineFactory.create_session()
    domain_entity_name_list = DomainEntity.get_all_domain_entity_name_distinct(session)
    print("distinct name=%d" % len(domain_entity_name_list))
    for team_domain_entity in domain_entity_name_list:
        name = team_domain_entity.name
        print("current name=%s" % name)
        if len(name) <= 2:
            print("delete name=%s" % name)
            all_same_name_domain_entity_list = DomainEntity.get_all_domain_entity_with_same_name(session, name)
            for domain_entity in all_same_name_domain_entity_list:
                delete_domain_entity_and_relation_to_sentence(session=session, domain_entity=domain_entity)
            continue
        all_same_name_domain_entity_list = DomainEntity.get_all_domain_entity_with_same_name(session, name)
        same_list_of_list = get_same_domain_entity_list(all_same_name_domain_entity_list)
        for same_list in same_list_of_list:
            merge_domain_entity_and_relation(session=session, same_list=same_list)
        session.commit()
    session.commit()
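# Minimal usage sketch: run the duplicate-name fixer as a standalone script.
# The __main__ guard is an assumption for illustration; the original code
# only defines the function.
if __name__ == "__main__":
    start_fix_duplicate_domain_entity()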
def test_search_POST(self):
    session = EngineFactory.create_so_session()
    searcher = SOPostSearcher(session)
    result = searcher.search_post("Json", 20)
    for post in result:
        print(post)
    self.assertEqual(20, len(result))
def test_get_annotation_by_index(self):
    session = EngineFactory.create_session()
    annotation = DocumentSentenceTextAnnotation.get_annotation_count_by_index(session, 1, 1)
    expected = -1
    print(annotation)
    self.assertEqual(expected, annotation)
def test_full_text_search_in_nature_language_for_alias(self):
    session = EngineFactory.create_session()
    searcher = DBSearcher(session)
    result = searcher.full_text_search_in_nature_language("Json", APIAlias)
    for alias in result:
        print(alias)
    self.assertEqual(19, len(result))
def get_session(self):
    if not self.__session:
        # TODO: init the session from a factory instance; the factory is initialized in the constructor
        self.__session = EngineFactory.create_session(autocommit=True, echo=False)
    return self.__session
def init(self, path="word2vec_api_software_wiki.txt", binary=True):
    self.session = EngineFactory.create_session()
    self.graphClient = DefaultGraphAccessor(GraphClient(server_number=4))
    self.entity_vector_model = EntityVectorComputeModel()
    self.entity_vector_model.init_word2vec_model(path=path, binary=binary)
    print("init complete")
def test_full_text_searcher(self):
    session = EngineFactory.create_session(autocommit=True)
    searcher = GeneralConceptEntitySearcher(session=session)
    result = searcher.search("apk file")
    print(len(result))
    print(result)
def test_full_text_search_for_os_answer(self):
    session = EngineFactory.create_session()
    searcher = SentenceSearcher(session)
    result = searcher.search_sentence_answer("java", 10)
    print(result)
    for line in result:
        print(line)
    self.assertEqual(10, len(result))
def test_full_text_search_for_domain_qa(self):
    session = EngineFactory.create_session()
    searcher = QA_FullText_EntityList(session)
    result = searcher.search_related_entity("java", 10)
    print(result)
    for line in result:
        print(line)
    self.assertEqual(10, len(result))
def init(self):
    self.session = EngineFactory.create_session()
    self.wikipedia_vector_map = EntityVectorModel.load("wikipedia.binary.txt", binary=True)
    self.domain_entity_vector_map = EntityVectorModel.load("domain_entity.binary.txt", binary=True)
    self.api_entity_vector_map = EntityVectorModel.load("api.binary.txt", binary=True)
    self.general_concept_searcher = GeneralConceptEntitySearcher(session=self.session)
    print("init complete")
def init(self):
    self.session = EngineFactory.create_session()
    self.df = pd.DataFrame(columns=['sentence_id', 'doc_id', 'sentence_index', 'text', 'type'])
    self.df_same = pd.DataFrame(columns=['sentence_id', 'doc_id', 'sentence_index', 'text', 'vote_type'])
def test_API_aliases_searcher(self):
    session = EngineFactory.create_session()
    searcher = APISearcher(session)
    result = searcher.search_api_aliases("XML")
    for post in result:
        print(post)
    self.assertEqual(500, len(result))
def test_API_entity_searcher(self):
    session = EngineFactory.create_session()
    searcher = APISearcher(session)
    result = searcher.search_api_entity("XML", result_limit=20)
    for api in result:
        print(api)
    self.assertEqual(20, len(result))
def test_get_unfinished_doc_list(self):
    session = EngineFactory.create_session()
    unfinished_doc_list = DocumentAnnotationStatus.get_unfinished_doc_list(session=session)
    print(unfinished_doc_list)
    result = []
    for each in unfinished_doc_list:
        result.append(each[0])
    expected = [i for i in range(1, 94894)]
    self.assertEqual(expected, result)
def start_import(self, graphClient):
    self.logger = Logger(self.logger_file_name).get_log()
    if not self.session:
        self.session = EngineFactory.create_session()
    self.graphClient = graphClient
    all_relation_list = self.session.query(APIRelation).all()
    for api_relation in all_relation_list:
        self.import_one_relation(api_relation)
    print("import api entity relation complete")
def start_import(self, graphClient):
    self.logger = Logger(self.logger_file_name).get_log()
    if not self.session:
        self.session = EngineFactory.create_session()
    self.graphClient = graphClient
    all_apis = self.session.query(APIEntity).all()
    for api_entity in all_apis:
        self.import_one_api_entity(api_entity)
    print("import api entity complete")
def test_search(self):
    api_entity_session = EngineFactory.create_session(autocommit=True)
    api_searcher = APISearcher(session=api_entity_session)
    graph_client = GraphClient(server_number=1)
    search_util = SearchUtil(graph_client, api_searcher)
    result = search_util.search("string buffer", 10)
    print(result)
    self.assertEqual(len(result), 10)
def build_table():
    session = EngineFactory.create_session()
    # clear the table with caution:
    # EntityForQA.clear_table(session)
    sql_list = ["MATCH (n:api) RETURN id(n), n",
                "MATCH (n:wikidata) RETURN id(n), n",
                "MATCH (n:`domain entity`) RETURN id(n), n.`domain_entity:name`, n.domain_entity_id"]
    label_list = ["api", "wikidata", "domain entity"]
    for sql, label in zip(sql_list, label_list):
        Neo4j2MySQL.neo4j_to_db(session, sql, label)
def test_search_post_in_simple_format(self):
    session = EngineFactory.create_so_session()
    searcher = SOPostSearcher(session)
    result = searcher.search_post_in_simple_format("Json", 20)
    for post in result:
        print(post)
        print(post["id"])
        print(post["score"])
        print(post["title"])
    self.assertEqual(20, len(result))
def test_exist_import_record(self):
    session = EngineFactory.create_session()
    jdk_method_knowledge_table = KnowledgeTableFactory.get_jdk_method_table(session)
    api_relation_table = KnowledgeTableFactory.get_api_relation_table(session)
    api_knowledge_table = KnowledgeTableFactory.get_api_entity_table(session)
    result = KnowledgeTableColumnMapRecord.exist_import_record(
        session, jdk_method_knowledge_table, api_relation_table, 1, "class_id")
    print(result)
    self.assertEqual(result, True)
def fix_the_valid_problem_for_paragraph_and_sentence(self):
    session = EngineFactory.create_session()
    # fix the problem of duplicate documents: mark the paragraphs and
    # sentences of every invalid document as invalid too
    all_invalid_document_list = session.query(DocumentText).filter_by(valid=0).all()
    count = 0
    step = 3000
    for invalid_document in all_invalid_document_list:
        all_invalid_paragraph_list = session.query(DocumentParagraphText).filter(
            DocumentParagraphText.doc_id == invalid_document.id).all()
        all_invalid_sentence_list = session.query(DocumentSentenceText).filter(
            DocumentSentenceText.doc_id == invalid_document.id).all()
        for paragraph in all_invalid_paragraph_list:
            paragraph.valid = 0
        for sentence in all_invalid_sentence_list:
            sentence.valid = 0
        count = count + 1
        if count > step:
            # commit in batches to keep the transaction small
            session.commit()
            count = 0
    session.commit()
    # mark paragraphs that are empty or too short as invalid
    all_paragraph_list = session.query(DocumentParagraphText).filter_by(valid=1).all()
    for paragraph in all_paragraph_list:
        if paragraph.text is None or paragraph.text == "":
            paragraph.valid = 0
            continue
        text = paragraph.text.strip()
        if len(text) <= 3 or len(text.split(" ")) <= 2:
            paragraph.valid = 0
    session.commit()
    # mark sentences that are empty or too short as invalid
    all_sentence_list = session.query(DocumentSentenceText).filter_by(valid=1).all()
    for sentence in all_sentence_list:
        if sentence.text is None or sentence.text == "":
            sentence.valid = 0
            continue
        text = sentence.text.strip()
        if len(text) <= 3 or len(text.split(" ")) <= 2:
            sentence.valid = 0
    session.commit()
def search_first_sentence_by_api_id(api_id):
    session = EngineFactory.create_session()
    # .first() returns None when there is no matching row, so guard before indexing
    sentence_row = session.query(SentenceToAPIEntityRelation.sentence_id).filter(
        SentenceToAPIEntityRelation.api_id == api_id).first()
    if sentence_row is None:
        return None
    sentence_id = sentence_row[0]
    first_sentence_row = session.query(DocumentSentenceText.text).filter(
        DocumentSentenceText.id == sentence_id).first()
    if first_sentence_row is None:
        return None
    return first_sentence_row[0]
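# A hedged usage sketch for search_first_sentence_by_api_id: the api_id value
# is made up, and the function returns None when the API has no linked sentence.
if __name__ == "__main__":
    first_sentence = search_first_sentence_by_api_id(42)  # hypothetical API id
    if first_sentence is None:
        print("no sentence linked to this API")
    else:
        print("first sentence: %s" % first_sentence)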
def start_import(self, graphClient):
    self.logger = Logger(self.logger_file_name).get_log()
    if not self.session:
        self.session = EngineFactory.create_session()
    self.graphClient = graphClient
    all_apis = self.session.query(APIEntity).all()
    for api_entity in all_apis:
        api_id = api_entity.id
        api_document_website_list = APIDocumentWebsite.get_document_website_list_by_api_id(self.session, api_id)
        self.import_document_website_to_one_entity(api_id, api_document_website_list)
    print("import api doc url complete")
def test_API_entity_searcher_in_tuple(self):
    session = EngineFactory.create_session()
    searcher = APISearcher(session)
    all_query_tuple = [
        ("XML", APIEntity.API_TYPE_ALL_API_ENTITY, 427),
        ("json", APIEntity.API_TYPE_ALL_API_ENTITY, 10),
        ("http", APIEntity.API_TYPE_METHOD, 31),
    ]
    for query, api_type, size in all_query_tuple:
        result = searcher.search_api_entity(query, api_type=api_type)
        for api_entity in result:
            print(api_entity)
        self.assertEqual(size, len(result))
def init(self, vector_dir_path="./model/"):
    self.kg_models = KnowledgeGraphFeafureModels()
    self.kg_models.init(vector_dir_path=vector_dir_path)
    self._session = EngineFactory.create_session(echo=False)
    self._entity_extractor = EntityExtractor()
    # self._tf_idf_model = TFIDFModel()
    # self._tf_idf_model.load(dict_type=2)
    self.qa_searcher = QAEntitySearcher()
    client = GraphClient(server_number=4)
    self.semanticSearchAccessor = SemanticSearchAccessor(client)
    self.defaultAccessor = DefaultGraphAccessor(client)
    self._logger = Logger("QAResultSearch").get_log()
def start_import(self):
    self.session = EngineFactory.create_session()
    html_type = APIHTMLText.HTML_TYPE_API_DETAIL_DESCRIPTION
    api_entity_list = APIEntity.get_all_value_instance_api(session=self.session)
    for api_entity in api_entity_list:
        description = api_entity.short_description
        if description is None or description == "":
            continue
        api_html_entity = APIHTMLText(api_id=api_entity.id, html=description, html_type=html_type)
        api_html_entity.find_or_create(session=self.session, autocommit=False)
    self.session.commit()
def test_API_entity_searcher_in_tuple_by_limit(self):
    session = EngineFactory.create_session()
    searcher = APISearcher(session)
    all_query_tuple = [
        ("XML", APIEntity.API_TYPE_ALL_API_ENTITY, 10, 10),
        ("json", APIEntity.API_TYPE_ALL_API_ENTITY, 10, 6),
        ("http", APIEntity.API_TYPE_METHOD, 20, 20),
        ("java", APIEntity.API_TYPE_CLASS, 15, 0),
    ]
    for query, api_type, limit, size in all_query_tuple:
        result = searcher.search_api_entity(query, api_type=api_type, result_limit=limit)
        for api_entity in result:
            print(api_entity)
        self.assertEqual(size, len(result))
def save_sentence_annotation():
    session = EngineFactory.create_session()
    if not request.json:
        return "fail"
    j = request.json
    for each in j:
        # reject the request if any required field is missing
        if 'doc_id' not in each or "sentence_index" not in each or "type" not in each or "username" not in each:
            return "fail"
        doc_id = each["doc_id"]
        sentence_index = each["sentence_index"]
        type = each["type"]
        username = each["username"]
        sentence_text_annotation = DocumentSentenceTextAnnotation(doc_id, sentence_index, type, username)
        sentence_text_annotation.find_or_create(session, autocommit=False)
        if sentence_text_annotation.type != type:
            sentence_text_annotation.type = type
    session.commit()
    return "save successful"
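# A hedged example of the JSON body this endpoint expects: a list of
# annotation objects, one per sentence. The concrete field values below
# (ids, the "functionality" type, the username) are invented.
#
# POST payload:
# [
#     {"doc_id": 1, "sentence_index": 3, "type": "functionality", "username": "alice"},
#     {"doc_id": 1, "sentence_index": 4, "type": "functionality", "username": "alice"}
# ]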
def get_sentence_list():
    if not request.json:
        return "fail"
    j = request.json
    if "os_question" not in j:
        return "fail"
    os_question = j["os_question"]
    session = EngineFactory.create_session()
    searcher = SentenceSearcher(session)
    sentence_data = searcher.search_sentence_answer(os_question, 10)
    sentence_list = []
    if len(sentence_data) > 0:
        for each in sentence_data:
            text = each[1]
            doc_id = each[0]
            if text is not None and text != "":
                sentence_list.append({"doc_id": doc_id, "text": text})
    result = {"sentence_list": sentence_list}
    return jsonify(result)
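# A hedged request/response sketch for get_sentence_list; the question text
# and the returned doc_id/text pair are invented for illustration.
#
# request JSON:  {"os_question": "how to read a file in java"}
# response JSON: {"sentence_list": [{"doc_id": 12, "text": "Use BufferedReader to read text files."}]}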
def get_test_node(self, label_name):
    temp_id_list = self.get_node_id_list(label_name)
    node_num = 0
    id_list = []
    while node_num < 500:
        # sample a batch of candidate ids and keep the usable ones
        example = random.sample(temp_id_list, 700)
        for id_num in example:
            name, des = self.get_node_by_id(id_num, label_name)
            if label_name == "domain entity":
                sentence = self.get_sentence_related_to_node(id_num, label_name)
                if len(sentence) == 0:
                    print(id_num)
                    continue
            elif name.endswith("(E)") or des == "#Null":
                continue
            id_list.append(id_num)
        # count the ids that passed the filters, so sampling repeats
        # until at least 500 usable nodes are collected
        node_num = len(id_list)
    session = EngineFactory.create_session()
    general_concept_searcher = GeneralConceptEntitySearcher(session)
    node_list = []
    for item in id_list:
        dic = {}
        dic["name"], full_des = self.get_node_by_id(item, label_name)
        dic["id"] = item
        dic["node_type"] = label_name
        # if not self.is_valid(dic['name'], general_concept_searcher):
        #     continue
        if not self.api_is_valid(dic["name"], general_concept_searcher, session):
            continue
        if label_name == "domain entity":
            dic["sentence"] = self.get_sentence_related_to_node(item, label_name)
            if len(dic["sentence"]) == 0:
                continue
        else:
            dic["sentence"] = full_des.replace("#", "").replace("\r", "").replace("\n", "").encode("raw_unicode_escape")
        dic["related_wiki"] = self.get_wikidata_related_to_node(item, label_name)
        node_list.append(dic)
    return node_list
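# Hedged usage sketch: "domain entity" is a label this method checks for
# explicitly; passing it yields nodes whose "sentence" field holds related
# sentences rather than a cleaned description. The receiver name "builder"
# is hypothetical.
#
# node_list = builder.get_test_node("domain entity")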