def train_model(pro_name, version, first_model_config, second_model_config):
    """Train a compound search model that combines two sub-models.

    Each *_model_config is a tuple whose first element is the model type
    (used to resolve its directory) and whose next two elements are passed
    through into the sub-model configuration.

    Returns the directory path the compound model was trained into.
    """
    doc_path = PathUtil.doc(pro_name, version)
    raw_collection = MultiFieldDocumentCollection.load(str(doc_path))
    pre_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_collection)

    base_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                         model_type=first_model_config[0])
    extra_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                          model_type=second_model_config[0])
    # (model_path, cfg1, cfg2, flag): the second sub-model carries True.
    sub_search_model_config = [
        (base_model_path, first_model_config[1], first_model_config[2], False),
        (extra_model_path, second_model_config[1], second_model_config[2], True),
    ]

    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model=first_model_config[0], extra_model=second_model_config[0])
    print("try to model compound model for %r" % compound_model_name)

    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type=compound_model_name)
    CompoundSearchModel.train(model_dir_path=model_dir_path,
                              doc_collection=pre_collection,
                              sub_search_model_config=sub_search_model_config)
    return model_dir_path
def train_name_searcher(pro_name, version):
    """Train a KG name searcher from the project's graph data and save it."""
    print("train graph name searcher for %s at version %s" % (pro_name, version))
    searcher_save_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    name_searcher = KGNameSearcher.train_from_graph_data_file(
        graph_data_path=graph_path,
        node_info_factory=ProjectKGNodeInfoFactory())
    name_searcher.save(searcher_save_path)
    print("finish... save to %s" % searcher_save_path)
def build_doc(pro_name, version):
    """Preprocess the raw document collection with CodeDocPreprocessor
    and save it under the "code-pre" pre_way path."""
    raw_doc_path = PathUtil.doc(pro_name=pro_name, version=version)
    pre_doc_path = PathUtil.pre_doc(pro_name=pro_name, version=version,
                                    pre_way="code-pre")
    raw_collection: MultiFieldDocumentCollection = \
        MultiFieldDocumentCollection.load(raw_doc_path)
    pre_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(), doc_collection=raw_collection)
    pre_collection.save(pre_doc_path)
def train_model(pro_name, version):
    """Train a BM25 similarity model over the project's documents.

    Returns the model directory path.
    """
    doc_path = PathUtil.doc(pro_name, version)
    raw_docs = MultiFieldDocumentCollection.load(str(doc_path))
    # NOTE: uses the generic Preprocessor (not CodeDocPreprocessor).
    preprocessed = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        Preprocessor(), raw_docs)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="bm25")
    BM25Model.train(model_dir_path, doc_collection=preprocessed)
    return model_dir_path
def build_v2_1_graph_for_pro(pro_name):
    """Build the v2_1 graph from the v2 graph using the v2 domain concepts."""
    builder = CodeGraphBuilder()
    input_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    print(input_path)
    output_path = PathUtil.graph_data(pro_name=pro_name, version="v2_1")
    concept_dir = PathUtil.domain_concept_dir(pro_name=pro_name, version="v2")
    builder.build_v2_graph(pro_name=pro_name,
                           input_graph_data_path=input_path,
                           output_graph_data_path=output_path,
                           domain_concept_output_dir=concept_dir)
def train_avg_w2v_model(pro_name, version):
    """Preprocess docs, persist them, then train an avg_w2v model.

    Returns the word2vec model directory path.
    """
    raw_docs = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    pre_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)
    # Persist the preprocessed collection for reuse by other trainers.
    pre_docs.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))
    w2v_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="avg_w2v")
    AVGW2VFLModel.train(model_dir_path=w2v_model_path, doc_collection=pre_docs)
    return w2v_model_path
def create_search_model(pro_name, version, model_dir):
    """Repoint the compound model's sub-model paths and load it.

    Reads ``submodel.config`` (a pickled list of
    (model_path, cfg1, cfg2, flag) tuples) from *model_dir*, replaces the
    two stored model paths with the current avg_w2v and svm model
    directories for *pro_name*/*version* (keeping the other tuple fields),
    writes the config back, and returns the loaded CompoundSearchModel.
    """
    config_path = model_dir / "submodel.config"
    # Idiom fix: pickle.load reads straight from the file object instead of
    # the original read()-then-loads round trip (same bytes, same result).
    with open(config_path, 'rb') as cfg_file:
        old_config = pickle.load(cfg_file)

    w2v_model_path = PathUtil.sim_model(pro_name, version, "avg_w2v")
    svm_model_path = PathUtil.sim_model(pro_name, version, "svm")
    new_config = [
        (w2v_model_path, old_config[0][1], old_config[0][2], old_config[0][3]),
        (svm_model_path, old_config[1][1], old_config[1][2], old_config[1][3]),
    ]

    # Idiom fix: pickle.dump writes straight to the file object instead of
    # dumps()-then-write.
    with open(config_path, 'wb') as cfg_file:
        pickle.dump(new_config, cfg_file)
    return CompoundSearchModel.load(model_dir)
def build_pre_doc(pro_name, version, preprocessor):
    """Build a preprocessed doc collection, tagging the output path by
    the preprocessor's type ("sim-pre", "spacy-pre", "code-pre", "pure-pre",
    or "unknown-pre" as a fallback)."""
    # Deliberately a chain of independent `if`s (not elif): a later match
    # overrides an earlier one, preserving the original precedence order.
    pre_way = "unknown-pre"
    if isinstance(preprocessor, SimplePreprocessor):
        pre_way = "sim-pre"
    if isinstance(preprocessor, SpacyTextPreprocessor):
        pre_way = "spacy-pre"
    if isinstance(preprocessor, CodeDocPreprocessor):
        pre_way = "code-pre"
    if isinstance(preprocessor, PureCodePreprocessor):
        pre_way = "pure-pre"

    doc_in_path = PathUtil.doc(pro_name=pro_name, version=version)
    pre_doc_out_path = PathUtil.pre_doc(pro_name=pro_name, version=version,
                                        pre_way=pre_way)
    graph_builder = CodeGraphBuilder()
    graph_builder.build_pre_doc(doc_in_path, pre_doc_out_path, preprocessor)
def build_extra_model_and_doc(pro_name, version_list):
    """For each version: build docs, preprocessed docs, a name searcher,
    and train an avg_w2v model on the preprocessed collection."""
    pre_way = "code-pre"
    for version in version_list:
        build_doc(pro_name=pro_name, version=version)
        for prep in [CodeDocPreprocessor()]:
            build_pre_doc(pro_name=pro_name, version=version, preprocessor=prep)
        train_name_searcher(pro_name=pro_name, version=version)

        pre_doc_path = PathUtil.pre_doc(pro_name=pro_name, version=version,
                                        pre_way=pre_way)
        pre_docs: PreprocessMultiFieldDocumentCollection = \
            PreprocessMultiFieldDocumentCollection.load(pre_doc_path)
        w2v_model_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                            model_type="avg_w2v")
        AVGW2VFLModel.train(model_dir_path=w2v_model_path,
                            doc_collection=pre_docs)
def main(self):
    """Register a user and log in, retrying captcha recognition.

    Fills the registration form, then loops (at most 11 attempts): grabs
    the captcha image, OCRs it, submits, and verifies login by comparing
    the home-page user info against the registered username. On a wrong
    captcha the loop resets and retries; on mismatch a failure screenshot
    is saved. Runtime strings (prompts/log text) are left as-is.
    """
    username = "".join(get_user_name())
    email = username + "@163.com"
    password = '******'
    answer_result = 'zhangsanfeng'
    self.send_keys("username", username)
    self.send_keys("password", password)
    self.send_keys("password2", password)
    self.send_keys("email", email)
    self.send_keys("aq_answer", answer_result)
    # Captcha screenshot destination
    file_name = PathUtil.get_file_path("img/code.png")
    code_text = None
    count = 0
    # Retry until OCR yields a non-None code or attempts are exhausted.
    while code_text is None and count <= 10:
        count += 1
        print("验证码流程开始")
        self.click("vcode_img")
        time.sleep(3)  # wait for the captcha image to refresh
        self.get_code_img(
            self._get_element("RegisterElement", "vcode_img"), file_name)
        code_text = get_code_text(file_name)
        print(f"第{count}次识别验证码,识别的结果为:{code_text}")
        print("验证码流程结束")
        if code_text is None:
            # OCR failed — loop condition retries with a fresh captcha.
            continue
        self.send_keys("vcode", code_text.strip())
        self.click("login_btn")
        time.sleep(5)  # wait for the post-login page
        self.alert_accept()
        if self.is_home_page():
            user_info = self._get_element(
                "HomeElement", "user_info").get_attribute("text")
            if user_info == username:
                print("登录成功")
            else:
                print("登录失败")
                file_path = PathUtil.get_file_path(
                    "img/fail_screenshot.png")
                self.save_screenshot(file_path)
            # Reached the home page: stop retrying either way.
            break
        else:
            # Submission rejected (likely wrong captcha): reset to retry.
            code_text = None
def build_v3_graph_for_pro(pro_name):
    """Fuse wikidata items into the v3 graph (input and output are the
    same v3 graph path, so the graph is updated in place) and print stats."""
    builder = CodeGraphBuilder()
    input_graph_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    w2v_model_path = PathUtil.sim_model(pro_name=pro_name, version="v3",
                                        model_type="avg_w2v")
    output_graph_path = PathUtil.graph_data(pro_name=pro_name, version="v3")
    wikidata_item_cache = PathUtil.generic_wikidata_item_cache()
    fusion_tmp_dir = PathUtil.wikidata_fusion_temp_result_dir(pro_name)
    graph_data = builder.build_v3_graph_from_cache_simple(
        pro_name=pro_name,
        input_graph_data_path=input_graph_path,
        word2vec_model_path=w2v_model_path,
        output_graph_data_path=output_graph_path,
        generic_title_search_cache_path=None,
        generic_wikidata_item_cache_path=wikidata_item_cache,
        fusion_temp_result_dir=fusion_tmp_dir,
    )
    graph_data.print_graph_info()
def __init__(self, pro_name, version):
    """Wire up the paths, collections, and model objects needed for the
    "svm" (FilterSemanticTFIDFNode2Vector) similarity model."""
    # Directory holding the "svm" similarity model artifacts.
    self.model_dir_path = PathUtil.sim_model(pro_name=pro_name,
                                             version=version,
                                             model_type="svm")
    self.model = FilterSemanticTFIDFNode2VectorModel(
        name="svm", model_dir_path=self.model_dir_path)
    self.document_collection_path = PathUtil.doc(pro_name, version)
    self.collection = MultiFieldDocumentCollection.load(
        str(self.document_collection_path))
    # NOTE(review): this uses the generic Preprocessor while sibling
    # training code uses CodeDocPreprocessor — confirm the difference
    # is intentional.
    self.processor = Preprocessor()
    self.doc_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        self.processor, self.collection)
    # Pre-trained node2vec embeddings (unweighted-graph variant).
    self.pretrain_node2vec_path = PathUtil.node2vec(pro_name=pro_name,
                                                    version=version,
                                                    weight="unweight")
    self.kg_name_searcher_path = PathUtil.name_searcher(pro_name, version)
    # Companion document-similarity model path (avg_w2v).
    self.doc_sim_model_path = PathUtil.sim_model(pro_name=pro_name,
                                                 version=version,
                                                 model_type="avg_w2v")
def main(self):
    """Run the register test cases and write an HTML report.

    Builds a suite of two RegisterCase tests and streams the
    HTMLTestRunner output to report/RegisterReport.html.
    """
    suite = unittest.TestSuite()
    suite.addTest(RegisterCase('test_register_code_error'))
    suite.addTest(RegisterCase('test_login_success'))
    report_path = PathUtil.get_file_path("report/RegisterReport.html")
    # Fix: the report file was previously opened and never closed; `with`
    # guarantees the handle is flushed and released even if the run raises.
    with open(report_path, 'wb') as report_file:
        html_test = HTMLTestRunner(stream=report_file,
                                   title='RegisterReport',
                                   description='这是一个注册页面的报告')
        html_test.run(suite)
def build_v1_jdk():
    """Build the v1 JDK-8 knowledge graph from the api_backup MySQL DB.

    Imports primary types and API nodes, pickles the api-id -> node-id
    map, then reloads the saved graph to import relations, and finally
    runs post-processing (inferred relations, aliases, source label).
    The statement order is significant: saves/reloads separate the
    node-import and relation-import phases.
    """
    jdk_kg_builder = JDKKGBuilder()
    pro_name = "jdk8"
    output_graph_data_path = PathUtil.jdk_graph_data()
    # MySQL session against the api_backup schema on "89Server".
    session = MYSQL_FACTORY.create_mysql_session_by_server_name(
        server_name="89Server", database="api_backup", echo=False)
    jdk_kg_builder.import_primary_type()
    # Node import returns the mapping needed later for relation import.
    api_id_to_node_id_map = jdk_kg_builder.import_jdk_from_api_table(session)
    id_map_file_path = PathUtil.jdk_api_node_map()
    with open(id_map_file_path, 'wb') as id_map_file:
        pickle.dump(api_id_to_node_id_map, id_map_file)
    # First save: nodes only; re-init from disk before adding relations.
    jdk_kg_builder.save(output_graph_data_path)
    jdk_kg_builder.init_graph_data(output_graph_data_path)
    jdk_kg_builder.import_relation_from_jdk_table(session,
                                                  api_id_to_node_id_map)
    jdk_kg_builder.save(output_graph_data_path)
    # Post-processing passes before the final save.
    jdk_kg_builder.infer_extra_relation()
    jdk_kg_builder.build_aliases()
    jdk_kg_builder.add_source_label(pro_name)
    jdk_kg_builder.save(output_graph_data_path)
def build_v2_graph_for_pro(pro_name):
    """Build the v2 graph from v1 by classifying short-description
    sentences and attaching the accepted ones as sentence relations.

    Relies on a module-level `classifier` (with a `preprocessor` and a
    `predict` method); sentences predicted as label "0" are logged to
    filter_sentence.txt and skipped, all others are added with their
    integer label.
    """
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v1")
    graph_data: GraphData = GraphData.load(graph_data_path)
    new_graph_data_path = PathUtil.graph_data(pro_name=pro_name, version="v2")
    res = ExtractResultImport(graph_data, new_graph_data_path, 2)
    # NOTE(review): output dir is hard-coded to "jdk8" rather than
    # pro_name — confirm this is intentional.
    data_dir = Path(OUTPUT_DIR) / "graph" / "jdk8" / "filter_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    filter_sentence_path = str(data_dir / "filter_sentence.txt")
    # Strips HTML tags from descriptions.
    pat = re.compile('<[^>]+>', re.S)
    print("start to add sentences...")
    for id in graph_data.get_node_ids():
        node_info = graph_data.get_node_info_dict(id)
        short_description = node_info["properties"].get(
            "short_description", "")
        if not short_description:
            continue
        short_description = pat.sub('', short_description)
        short_descs = sent_tokenize(short_description)
        for short_desc in short_descs:
            # Collapse all runs of whitespace to single spaces.
            short_desc = " ".join(short_desc.split())
            str_rm_sign = classifier.preprocessor.remove_sign(short_desc)
            text = classifier.preprocessor.remove_stop_words(str_rm_sign)
            label = list(classifier.predict(text))[0]
            if label == "0":
                # Rejected sentence: log it for later inspection.
                print(short_desc)
                with open(filter_sentence_path, "a", encoding='utf-8') as f:
                    f.write(short_desc)
                    f.write("\n")
                continue
            else:
                res.add_sentence_relation(short_desc, id, int(label))
    res.save_new_graph_data()
def train_model(pro_name, version, weight):
    """Train the avg_n2v (average node2vec) similarity model.

    *weight* selects which pretrained node2vec variant to use.
    Returns the model directory path.
    """
    raw_docs = MultiFieldDocumentCollection.load(
        str(PathUtil.doc(pro_name, version)))
    pre_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(), raw_docs)

    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    node2vec_path = PathUtil.node2vec(pro_name=pro_name, version=version,
                                      weight=weight)
    searcher_path = PathUtil.name_searcher(pro_name=pro_name, version=version)
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type="avg_n2v")
    AVGNode2VectorModel.train(
        model_dir_path=model_dir_path,
        doc_collection=pre_docs,
        embedding_size=100,
        pretrain_node2vec_path=node2vec_path,
        graph_data_path=graph_path,
        kg_name_searcher_path=searcher_path,
    )
    return model_dir_path
def __init__(self):
    """Attach a date-stamped file handler (INFO and above) to the root
    logger; the logger itself is set to DEBUG."""
    self.logger = logging.getLogger()
    self.logger.setLevel(logging.DEBUG)
    # One log file per day, e.g. logs/2024-01-31.log
    log_name = datetime.datetime.now().strftime("%Y-%m-%d") + ".log"
    log_path = PathUtil.get_file_path(f"logs/{log_name}")
    # File output
    self.file_handle = logging.FileHandler(log_path, encoding='utf-8')
    formatter = logging.Formatter(
        '%(asctime)s %(filename)s %(funcName)s %(levelno)s %(levelname)s %(message)s'
    )
    self.file_handle.setFormatter(formatter)
    self.file_handle.setLevel(logging.INFO)
    # NOTE(review): every instantiation adds another FileHandler to the
    # root logger, so constructing this class repeatedly duplicates each
    # log record — consider guarding with a handler-existence check.
    self.logger.addHandler(self.file_handle)
def run_main(self):
    """Execute keyword-driven test rows from conf/keyword.xls.

    Columns: 1=run flag ('Y'), 3=element node, 4=action method,
    5=value to send, 6=action element, 7=expected-result method,
    8=expected value; column 9 receives 'pass'/'fail'.
    """
    sheet = ExcelUtil(PathUtil.get_file_path("conf/keyword.xls"))
    actions = ActionMethod()
    for row in range(1, sheet.get_lines()):
        # Skip rows not flagged for execution.
        if sheet.get_cell_value(row, 1) != 'Y':
            continue
        node_element = sheet.get_cell_value(row, 3)
        exec_method = sheet.get_cell_value(row, 4)
        send_value = sheet.get_cell_value(row, 5)
        action_element = sheet.get_cell_value(row, 6)
        expect_method = sheet.get_cell_value(row, 7)
        expect_value = sheet.get_cell_value(row, 8)
        self.run_method(actions, exec_method, node_element,
                        action_element, send_value)
        # Only verify when both an expectation method and value exist.
        if expect_method != '' and expect_value != '':
            result = self.exec_result_method(actions, expect_method)
            if expect_value in result:
                sheet.write_cell_value(row, 9, 'pass')
            else:
                sheet.write_cell_value(row, 9, 'fail')
def __init__(self, pro_name, version, model_dir):
    """Load the project's graph data and build the search model."""
    path = PathUtil.graph_data(pro_name=pro_name, version=version)
    self.graph_data: GraphData = GraphData.load(path)
    self.model = self.create_search_model(pro_name, version, model_dir)
    print("It's ok for init!")
def __init__(self, driver):
    """Page-object setup: keep the webdriver, a RegisterHandle bound to
    it, and the path where the captcha image is saved."""
    self.driver = driver
    self.register_handle = RegisterHandle(driver)
    # Captcha screenshot destination.
    self.file_name = PathUtil.get_file_path("img/code.png")
from script.summary.generate_summary import Summary
from util.path_util import PathUtil

# Interactive driver: query class summaries from the compound
# (avg_w2v + svm) model for jdk8 v3_1. Loops forever; stop with Ctrl-C.
if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3_1"
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model="avg_w2v", extra_model="svm")
    model_dir = PathUtil.sim_model(pro_name=pro_name, version=version,
                                   model_type=compound_model_name)
    summary = Summary(pro_name, version, model_dir)
    while True:
        query = input("please input query:")
        class_name = input("please input qualified class name")
        all_class_2_summary = summary.get_summary(query, class_name)
        # Keys are indices, values the summary items for that class.
        for index, item in all_class_2_summary.items():
            print(index, item)
from sekg.graph.exporter.graph_data import GraphData
from sekg.ir.models.compound import CompoundSearchModel

from util.path_util import PathUtil

# Interactive search against the compound (avg_w2v + svm) model for jdk8 v3.
if __name__ == '__main__':
    pro_name = "jdk8"
    version = "v3"
    compound_model_name = "compound_{base_model}+{extra_model}".format(
        base_model="avg_w2v", extra_model="svm")
    model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version,
                                        model_type=compound_model_name)
    model = CompoundSearchModel.load(model_dir_path)
    graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    graph_data: GraphData = GraphData.load(graph_data_path)
    # Candidate doc-id sets per node label; "class type" nodes are
    # excluded from the class set.
    valid_class_ids = graph_data.get_node_ids_by_label("class")
    valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label(
        "class type")
    valid_method_ids = graph_data.get_node_ids_by_label("method")
    valid_method_ids.update(
        graph_data.get_node_ids_by_label("base override method"))
    valid_sentence_ids = graph_data.get_node_ids_by_label("sentence")
    while True:
        query = input("please input query: ")
        select = int(input("1、class; 2、methos; 3、sentence"))
        top_num = int(input("please input top num"))
        result = []
        if select == 1:
            result = model.search(query=query, top_num=top_num,
                                  valid_doc_id_set=valid_class_ids)
        # NOTE(review): options 2 and 3 are prompted for but not handled
        # here — this chunk may be truncated; confirm against the full file.
EntityReader.write_line_data( str(Path(domain_dir) / "mention_num.txt"), [str(v) + ":" + str(k) for k, v in self.mention_time.items()]) EntityReader.write_line_data( str(Path(domain_dir) / "sum_mention_time.txt"), [str(v) + ":" + str(k) for k, v in self.sum_mention_time.items()]) EntityReader.write_line_data( str(Path(domain_dir) / "end_related_relation_num.txt"), [ str(v) + ":" + str(k) for k, v in self.end_related_relation_num.items() ]) if __name__ == "__main__": domain_dir = PathUtil.domain_concept_dir("JabRef-2.6", version="v1") domain_dir = Path(domain_dir) term_save_path = str(domain_dir / "terms.txt") operation_save_path = str(domain_dir / "operations.txt") term_relation_save_path = str(domain_dir / "relations.json") linkage_save_path = str(domain_dir / "linkages.json") aliase_save_path = str(domain_dir / "aliases.json") pre_doc_collection_out_path = PathUtil.pre_doc(pro_name="JabRef-2.6", version="v2", pre_way="code-pre") reduce = ReduceDomainTerm(term_save_path, operation_save_path, term_relation_save_path, linkage_save_path, aliase_save_path, pre_doc_collection_out_path) delete_based_on_name = reduce.delete_based_on_name()
# coding=utf-8 from selenium import webdriver from business.register_ddt_business import RegisterDDTBusiness import unittest from util.html_test_runner import HTMLTestRunner from ddt import * from util.excel_util import ExcelUtil from util.path_util import PathUtil excel_util = ExcelUtil(PathUtil.get_file_path("conf/case.xls")) datas = excel_util.get_data() @ddt class RegisterDDTCase(unittest.TestCase): def setUp(self): self.driver = webdriver.Chrome() self.driver.get("http://www.yundama.com/index/reg") self.driver.maximize_window() self.register_business = RegisterDDTBusiness(self.driver) def tearDown(self): self.driver.close() @data(*datas) def test_register_success(self, datas): username, password, password2, email, answer, code, assert_text = datas self.assertTrue( self.register_business.register_main(username, password, password2, email, answer, code, assert_text), assert_text)
from sekg.graph.exporter.graph_data import GraphData from sekg.ir.models.n2v.svm.avg_n2v import AVGNode2VectorModel from util.path_util import PathUtil if __name__ == '__main__': pro_name = "jdk8" version = "v3" model_dir_path = PathUtil.sim_model(pro_name=pro_name, version=version, model_type="avg_n2v") model = AVGNode2VectorModel.load(model_dir_path) graph_data_path = PathUtil.graph_data(pro_name=pro_name, version=version) graph_data: GraphData = GraphData.load(graph_data_path) valid_class_ids = graph_data.get_node_ids_by_label("class") valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label( "class type") valid_method_ids = graph_data.get_node_ids_by_label("method") valid_method_ids.update( graph_data.get_node_ids_by_label("base override method")) valid_sentence_ids = graph_data.get_node_ids_by_label("sentence") while True: query = input("please input query: ") select = int(input("1、class; 2、methos; 3、sentence")) top_num = int(input("please input top num")) result = [] if select == 1: result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids) elif select == 2:
from sekg.graph.exporter.graph_data import GraphData from sekg.ir.models.n2v.svm.filter_semantic_tfidf_n2v import FilterSemanticTFIDFNode2VectorModel from util.path_util import PathUtil if __name__ == '__main__': model_dir_path = PathUtil.sim_model(pro_name="jdk8", version="v3", model_type="svm") model = FilterSemanticTFIDFNode2VectorModel.load(model_dir_path) graph_data_path = PathUtil.graph_data(pro_name="jdk8", version="v3") graph_data: GraphData = GraphData.load(graph_data_path) valid_class_ids = graph_data.get_node_ids_by_label("class") valid_class_ids = valid_class_ids - graph_data.get_node_ids_by_label( "class type") valid_method_ids = graph_data.get_node_ids_by_label("method") valid_method_ids.update( graph_data.get_node_ids_by_label("base override method")) valid_sentence_ids = graph_data.get_node_ids_by_label("sentence") while True: query = input("please input query: ") select = int(input("1、class; 2、methos; 3、sentence")) top_num = int(input("please input top num")) result = [] if select == 1: result = model.search(query=query, top_num=top_num, valid_doc_id_set=valid_class_ids) elif select == 2: result = model.search(query=query, top_num=top_num,
for i in range(self.get_lines()): data.append(self.book_sheet.row_values(i)) return data def get_lines(self): return self.book_sheet.nrows # 获取单元格的数据 def get_cell_value(self, row, col): if self.get_lines() > row: return self.book_sheet.cell_value(row, col) else: return None # 写入数据 def write_cell_value(self, row, col, value): data = xlrd.open_workbook(self.file_path) write_data = copy(data) sheet = write_data.get_sheet(self.index) sheet.write(row, col, value) write_data.save(self.file_path) if __name__ == '__main__': file_path = PathUtil.get_file_path("conf/keyword.xls") execl = ExcelUtil(file_path) data = execl.get_cell_value(1, 4) print(data) # execl.write_cell_value(1, 6, "hello world")
from sekg.graph.accessor import GraphAccessor
from sekg.graph.exporter.graph_data import Neo4jImporter, GraphData

from definitions import GRAPH_FACTORY
from util.path_util import PathUtil

# Push each project's v3 graph into its configured Neo4j server.
if __name__ == "__main__":
    import_projects = [("jdk8", "87Neo4jApiSummaryJdk")]
    for project_name, neo4j_server in import_projects:
        client = GRAPH_FACTORY.create_py2neo_graph_by_server_name(
            server_name=neo4j_server)
        importer = Neo4jImporter(GraphAccessor(client))
        graph_path = PathUtil.graph_data(pro_name=project_name, version="v3")
        graph: GraphData = GraphData.load(graph_path)
        print("start import data of {} into neo4j".format(project_name))
        importer.import_all_graph_data(graph)
def __init__(self, file_name=None):
    """Load an element-locator ini file; any falsy *file_name* falls
    back to conf/element.ini (matching the original truthiness check)."""
    config_path = file_name or PathUtil.get_file_path("conf/element.ini")
    self.cf = self._load_ini(config_path)
def build_doc(pro_name, version):
    """Generate the document collection for a project version from its
    graph data."""
    graph_path = PathUtil.graph_data(pro_name=pro_name, version=version)
    doc_out_path = PathUtil.doc(pro_name=pro_name, version=version)
    graph_builder = CodeGraphBuilder()
    graph_builder.build_doc(graph_data_path=graph_path,
                            output_doc_collection_path=doc_out_path)