def build_doc(pro_name, version):
    """Preprocess a project's document collection and save the result.

    Loads the document collection located at ``PathUtil.doc`` for the given
    project and version, runs it through a ``CodeDocPreprocessor``, and saves
    the preprocessed collection to the ``"code-pre"`` path returned by
    ``PathUtil.pre_doc``.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.
    """
    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(
        pro_name=pro_name,
        version=version,
        pre_way="code-pre",
    )

    raw_collection: MultiFieldDocumentCollection = MultiFieldDocumentCollection.load(source_path)
    preprocessed_collection = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        preprocessor=CodeDocPreprocessor(),
        doc_collection=raw_collection,
    )
    preprocessed_collection.save(target_path)
def train_avg_w2v_model(pro_name, version):
    """Preprocess the project's documents and train an ``avg_w2v`` model on them.

    The document collection is loaded, preprocessed with a
    ``CodeDocPreprocessor``, and saved to the ``"code-pre"`` pre-doc path.
    ``AVGW2VFLModel.train`` is then invoked on the preprocessed collection.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.

    Returns:
        The model directory path obtained from ``PathUtil.sim_model`` with
        ``model_type="avg_w2v"``.
    """
    raw_docs = MultiFieldDocumentCollection.load(str(PathUtil.doc(pro_name, version)))
    prepared_docs = PreprocessMultiFieldDocumentCollection.create_from_doc_collection(
        CodeDocPreprocessor(),
        raw_docs,
    )

    # Persist the preprocessed collection before training.
    prepared_docs.save(PathUtil.pre_doc(pro_name, version, pre_way="code-pre"))

    model_dir = PathUtil.sim_model(
        pro_name=pro_name,
        version=version,
        model_type="avg_w2v",
    )
    AVGW2VFLModel.train(model_dir_path=model_dir, doc_collection=prepared_docs)
    return model_dir
def build_pre_doc(pro_name, version, preprocessor):
    """Build the preprocessed document collection with the given preprocessor.

    The output location depends on the preprocessor's class: each known class
    maps to a ``pre_way`` tag, and ``"unknown-pre"`` is used when none match.
    The actual work is delegated to ``CodeGraphBuilder.build_pre_doc``.

    Args:
        pro_name: Project name forwarded to ``PathUtil``.
        version: Version identifier forwarded to ``PathUtil``.
        preprocessor: Preprocessor instance handed to the builder; its class
            selects the ``pre_way`` tag.
    """
    tag_by_class = (
        (SimplePreprocessor, "sim-pre"),
        (SpacyTextPreprocessor, "spacy-pre"),
        (CodeDocPreprocessor, "code-pre"),
        (PureCodePreprocessor, "pure-pre"),
    )

    # Every entry is checked, with no early exit: if the instance matches
    # several classes, the last matching entry decides the tag.
    pre_way = "unknown-pre"
    for preprocessor_cls, tag in tag_by_class:
        if isinstance(preprocessor, preprocessor_cls):
            pre_way = tag

    source_path = PathUtil.doc(pro_name=pro_name, version=version)
    target_path = PathUtil.pre_doc(pro_name=pro_name, version=version, pre_way=pre_way)
    CodeGraphBuilder().build_pre_doc(source_path, target_path, preprocessor)
def build_extra_model_and_doc(pro_name, version_list):
    """Build preprocessed docs and train the extra models for each version.

    For every version in ``version_list`` this calls ``build_doc``, then
    ``build_pre_doc`` with a ``CodeDocPreprocessor``, then
    ``train_name_searcher``, and finally loads the ``"code-pre"`` preprocessed
    collection and trains an ``avg_w2v`` model on it.

    Args:
        pro_name: Project name forwarded to every step.
        version_list: Iterable of version identifiers to process in order.
    """
    for version in version_list:
        # A fresh preprocessor list is created for each version.
        stage_preprocessors = [CodeDocPreprocessor()]
        pre_way = "code-pre"

        build_doc(pro_name=pro_name, version=version)
        for stage_preprocessor in stage_preprocessors:
            build_pre_doc(
                pro_name=pro_name,
                version=version,
                preprocessor=stage_preprocessor,
            )
        train_name_searcher(pro_name=pro_name, version=version)

        pre_doc_path = PathUtil.pre_doc(
            pro_name=pro_name,
            version=version,
            pre_way=pre_way,
        )
        prepared_docs: PreprocessMultiFieldDocumentCollection = (
            PreprocessMultiFieldDocumentCollection.load(pre_doc_path)
        )

        model_dir = PathUtil.sim_model(
            pro_name=pro_name,
            version=version,
            model_type="avg_w2v",
        )
        AVGW2VFLModel.train(model_dir_path=model_dir, doc_collection=prepared_docs)
str(v) + ":" + str(k) for k, v in self.end_related_relation_num.items() ])


if __name__ == "__main__":
    # Script entry point: build a ReduceDomainTerm for the "JabRef-2.6"
    # project and print the outcome of each of its three delete_* methods.

    # All domain-concept files live under one directory.
    domain_dir = PathUtil.domain_concept_dir("JabRef-2.6", version="v1")
    domain_dir = Path(domain_dir)
    term_save_path = str(domain_dir / "terms.txt")
    operation_save_path = str(domain_dir / "operations.txt")
    term_relation_save_path = str(domain_dir / "relations.json")
    linkage_save_path = str(domain_dir / "linkages.json")
    aliase_save_path = str(domain_dir / "aliases.json")

    # NOTE(review): the domain directory above uses version "v1" while this
    # pre-doc path uses "v2" -- confirm the mismatch is intentional.
    pre_doc_collection_out_path = PathUtil.pre_doc(pro_name="JabRef-2.6", version="v2", pre_way="code-pre")

    reduce = ReduceDomainTerm(term_save_path, operation_save_path, term_relation_save_path, linkage_save_path,
                              aliase_save_path, pre_doc_collection_out_path)

    # Each delete_* call returns a sized value; print it followed by its length.
    delete_based_on_name = reduce.delete_based_on_name()
    print(delete_based_on_name)
    print(len(delete_based_on_name))

    delete_based_on_aliase_tf = reduce.delete_based_on_aliase_tf()
    print(delete_based_on_aliase_tf)
    print(len(delete_based_on_aliase_tf))

    delete_based_on_name_length = reduce.delete_based_on_name_length()
    print(delete_based_on_name_length)
    print(len(delete_based_on_name_length))