def getsents(self, type="java"):
    """Process every file from ``self.getfile(type)`` and dump the result as JSON.

    For each ``(name, content)`` pair yielded by ``self.getfile(type)`` the
    content is split into lines, passed through ``self.getpredeal`` and then
    ``self.getexcate``. A ``[sha, name]`` row is appended to the mapping CSV
    (``self.sha_name_file``) and the result is written to
    ``<root>/data5/<sha>.json`` unless that file already exists.

    :param type: forwarded unchanged to ``self.getfile``; defaults to "java".
    :return: None
    """
    import json

    self.logger.info("getsents")

    root_dir = GetFilePathRoot.get_root_dir()

    # Directory that stores the extraction results for the processed corpus.
    output_dir = os.path.join(root_dir, "data5")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # CSV that records the mapping between an original file name and the
    # sha used to name its extraction result.
    self.sha_name_file = os.path.join(root_dir, "data2", "extract_sha_name_csv")

    self.logger.info(0)
    for i, (name, content) in enumerate(self.getfile(type), 1):
        self.logger.info((i, name))

        # Normalise all line-ending styles before splitting into lines.
        sents = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")

        # NOTE(review): the file is fully processed even when its JSON
        # already exists and is skipped below. The order is kept because
        # getpredeal/getexcate are not visible here and may have side effects.
        dealsents = self.getpredeal(sents=sents)
        result = self.getexcate(dealsents)

        # Compute the sha once; it names both the CSV row and the JSON file.
        sha = self.getsha(name)

        # NOTE(review): the Python 2 csv docs ask for binary mode ("ab") on
        # platforms where it matters; text mode is kept so existing output
        # is unchanged -- confirm on Windows.
        with open(self.sha_name_file, "a") as mapping_file:
            csv.writer(mapping_file).writerow([sha, name])

        # A single path is used for both the existence check and the write.
        output_path = os.path.join(output_dir, sha + ".json")
        if os.path.exists(output_path):
            continue
        with open(output_path, "w") as dump_f:
            json.dump(result, dump_f)

    # What self.log does with the task marker is not visible here;
    # presumably it records that the extract step finished.
    self.log(ExtractModeTask.EXTRACT)
def loaddiff(self):
    """Create the lazy loader for the ``diff1`` corpus and store it on ``self.diff1``.

    The corpus root is ``<root>/data``; only paths matching
    ``.*/diff1/.*\\.txt`` are handed to the reader.
    """
    data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data")
    # Decode the path as GB2312; per the original note this works around
    # garbled Chinese directory names.
    root = unicode(str(data_dir), "GB2312")
    self.logger.info(root)

    diff1_pattern = r".*/diff1/.*\.txt"

    self.logger.info("加载语料库 lazyload")
    self.diff1 = LazyCorpusLoader(root, PlaintextCorpusReader, diff1_pattern)
    self.logger.info("加载语料库 完毕")
def getsents(self, type="java"):
    """Pre-process every file from ``self.getfile(type)`` and store the result.

    For each ``(name, content)`` pair yielded by ``self.getfile(type)`` the
    content is split into lines and passed through ``self.getpredeal``. A
    ``[sha, name]`` row is appended to the mapping CSV
    (``self.sha_name_file``) and the result is written to
    ``<root>/data3/<sha>`` unless that file already exists.

    :param type: forwarded unchanged to ``self.getfile``; defaults to "java".
    :return: None
    """
    self.logger.info("getsents")

    root_dir = GetFilePathRoot.get_root_dir()

    # Directory that receives one pre-processed file per input file.
    output_dir = os.path.join(root_dir, "data3")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # CSV mapping each sha-named output back to its original file name.
    self.sha_name_file = os.path.join(root_dir, "data2", "DealCorpus_sha_name_csv")

    self.logger.info(0)
    for i, (name, content) in enumerate(self.getfile(type), 1):
        self.logger.info((i, name))

        # Normalise all line-ending styles before splitting into lines.
        sents = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")

        # NOTE(review): pre-processing runs even when the output already
        # exists and is skipped below. The order is kept because getpredeal
        # is not visible here and may have side effects.
        result = self.getpredeal(sents=sents)

        # Compute the sha once; it names both the CSV row and the output file.
        sha = self.getsha(name)

        # NOTE(review): the Python 2 csv docs ask for binary mode ("ab") on
        # platforms where it matters; text mode is kept so existing output
        # is unchanged -- confirm on Windows.
        with open(self.sha_name_file, "a") as mapping_file:
            csv.writer(mapping_file).writerow([sha, name])

        output_path = os.path.join(output_dir, sha)
        if os.path.exists(output_path):
            continue
        with open(output_path, "w") as out:
            out.write(result)

    # What self.log does with the task marker is not visible here;
    # presumably it records that the sanitise step finished.
    self.log(DealCorpusTask.JAVA_SANIT)
def initlog(self):
    """Set up ``self.outputfile`` and the "corpus" logger.

    Ensures ``<root>/data2`` exists, points ``self.outputfile`` at
    ``data2/CorpusOutput.txt`` and configures ``self.logger`` at DEBUG level
    with one file handler for that path and one console handler.

    ``logging.getLogger`` returns the same object for the same name, so
    handlers attached by an earlier call are still present. They are reused
    rather than added again, which previously duplicated every log line.
    """
    import logging

    data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data2")
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    self.outputfile = os.path.join(data_dir, "CorpusOutput.txt")

    # Create (or fetch) the logger.
    self.logger = logging.getLogger("corpus")
    self.logger.setLevel(logging.DEBUG)

    # Output format shared by both handlers.
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Find out which of our two handlers are already attached.
    # FileHandler subclasses StreamHandler, so it must be tested first.
    target = os.path.abspath(self.outputfile)
    has_file_handler = False
    has_console_handler = False
    for handler in self.logger.handlers:
        if isinstance(handler, logging.FileHandler):
            if handler.baseFilename == target:
                has_file_handler = True
        elif isinstance(handler, logging.StreamHandler):
            has_console_handler = True

    if not has_file_handler:
        # Handler that writes to the log file.
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

    if not has_console_handler:
        # Handler that writes to the console.
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
def mergecorpus(self, type="java"):
    """Concatenate every file under ``<root>/data3`` into the merged corpus file.

    Only ``type == "java"`` is handled; any other value is a no-op. Each
    source file is written to ``self.merge_corpus_java_file`` followed by a
    newline, and progress is logged as ``(fraction_done, path)``.

    :param type: corpus kind; only "java" triggers a merge.
    :return: None
    """
    if type != "java":
        return

    source_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data3")

    # List the directory once so the progress denominator always matches
    # the entries actually iterated.
    entries = os.listdir(source_dir)
    total = len(entries)

    # The with-block closes the target even if reading a source file fails.
    with open(self.merge_corpus_java_file, 'w') as merged:
        for i, entry in enumerate(entries, 1):
            path = os.path.join(source_dir, entry)
            self.logger.info((float(i) / total, path))
            with open(path, "r") as f:
                merged.write(f.read() + "\n")

    # What self.log does with the task marker is not visible here;
    # presumably it records that the merge step finished.
    self.log(DealCorpusTask.JAVA_MERGE)
def loadcorpus(self):
    """Create lazy loaders for the diff1/diff2/diff3 corpora.

    The corpus root is ``<root>/data``. One ``LazyCorpusLoader`` per
    sub-corpus is stored on ``self.diff1``, ``self.diff2`` and ``self.diff3``,
    each restricted to ``.txt`` files under the matching directory.
    """
    data_dir = os.path.join(GetFilePathRoot.get_root_dir(), "data")
    # Decode the path as GB2312; per the original note this works around
    # garbled Chinese directory names.
    root = unicode(str(data_dir), "GB2312")
    self.logger.info(root)

    # (attribute name, file pattern) for each sub-corpus, in load order.
    corpora = (
        ("diff1", r".*/diff1/.*\.txt"),
        ("diff2", r".*/diff2/.*\.txt"),
        ("diff3", r".*/diff3/.*\.txt"),
    )

    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus import PlaintextCorpusReader

    self.logger.info("加载语料库")
    for attr_name, file_pattern in corpora:
        loader = LazyCorpusLoader(root, PlaintextCorpusReader, file_pattern)
        setattr(self, attr_name, loader)
    self.logger.info("加载完毕")
def initlog(self):
    """Initialise state flags, corpus file paths and the "DealCorpus" logger.

    In order: sets ``self.logfile``, resets the java/xml step flags to
    False, calls ``self.log(None)``, sets ``self.outputfile``, ensures
    ``<root>/data4`` exists, sets the data4 file paths, and configures
    ``self.logger`` at DEBUG level with a file handler and a console handler.

    ``logging.getLogger`` returns the same object for the same name, so
    handlers attached by an earlier call are still present. They are reused
    rather than added again, which previously duplicated every log line.
    """
    import logging

    root_dir = GetFilePathRoot.get_root_dir()

    self.logfile = os.path.join(root_dir, "data2", "DealCorpus.txt")
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(self.logfile)

    # Per-step flags for the java and xml pipelines, all initially False.
    self.java_dict = False
    self.java_model = False
    self.java_pic = False
    self.java_predeal = False
    self.java_sanit = False
    self.xml_dict = False
    self.xml_model = False
    self.xml_pic = False
    self.xml_predeal = False
    self.xml_sanit = False

    # self.log is defined elsewhere; it is called with None once the flags
    # are reset and before the logger exists, so this order is preserved.
    self.log(None)

    self.outputfile = os.path.join(root_dir, "data2", "DealCorpusOutput.txt")

    data4_dir = os.path.join(root_dir, "data4")
    if not os.path.exists(data4_dir):
        os.mkdir(data4_dir)
    self.merge_corpus_java_file = os.path.join(data4_dir, "merge_corpus_java.txt")
    self.dict_corpus_java_file = os.path.join(data4_dir, "java_dict_corpus.txt")
    self.model_corpus_java_file = os.path.join(data4_dir, "model_corpus_java")

    # Create (or fetch) the logger.
    self.logger = logging.getLogger("DealCorpus")
    self.logger.setLevel(logging.DEBUG)

    # Output format shared by both handlers.
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Find out which of our two handlers are already attached.
    # FileHandler subclasses StreamHandler, so it must be tested first.
    target = os.path.abspath(self.outputfile)
    has_file_handler = False
    has_console_handler = False
    for handler in self.logger.handlers:
        if isinstance(handler, logging.FileHandler):
            if handler.baseFilename == target:
                has_file_handler = True
        elif isinstance(handler, logging.StreamHandler):
            has_console_handler = True

    if not has_file_handler:
        # Handler that writes to the log file.
        fh = logging.FileHandler(self.outputfile)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

    if not has_console_handler:
        # Handler that writes to the console.
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
def initlog(self): self.logfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "ExtractModel.txt") print self.logfile self.merge = False self.extract = False self.start_prefix = "##### start" self.change_prefix = "###### change :" self.name_prefix = "###### name :" self.end_prefix = "##### end" self.model_corpus_java_file = os.path.join( GetFilePathRoot.get_root_dir(), "data4", "model_corpus_java") modelfile = self.model_corpus_java_file self.model = word2vec.Word2Vec.load(modelfile) self.log(None) self.outputfile = os.path.join(GetFilePathRoot.get_root_dir(), "data2", "ExtractModelOutput.txt") if not os.path.exists( os.path.join(GetFilePathRoot.get_root_dir(), "data4")): os.mkdir(os.path.join(GetFilePathRoot.get_root_dir(), "data4")) self.merge_extract_java_file = os.path.join( GetFilePathRoot.get_root_dir(), "data4", "merge_extract_java.txt") import logging # 创建一个logger self.logger = logging.getLogger("DealCorpus") self.logger.setLevel(logging.DEBUG) # 创建一个handler,用于写入日志文件 fh = logging.FileHandler(self.outputfile) fh.setLevel(logging.DEBUG) # 再创建一个handler,用于输出到控制台 ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) # 定义handler的输出格式 formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) ch.setFormatter(formatter) # 给logger添加handler self.logger.addHandler(fh) self.logger.addHandler(ch)