import csv
import os
import time

def markPos():
    # CSV header: file name, source word count, tagged-file word count,
    # word-count difference, difference ratio
    csvHeader = ["文件名", "源文件词数", "附码文件词数", "词数差", "差词占比"]
    for subDir in os.listdir(orgDir):  # subdirectories of the source corpus
        data = []
        pathname = os.path.join(orgDir, subDir)
        if os.path.isdir(pathname):
            new_pathname = os.path.join(tagDir, subDir)
            mkdir(new_pathname)
            logFile = os.path.join(new_pathname, "log.csv")
            with open(logFile, "w", newline='') as f:
                writer = csv.writer(f)
                now = time.strftime("%Y-%m-%d %H:%M:%S")
                writer.writerow([now])
                writer.writerow(csvHeader)
            for filename in os.listdir(pathname):  # iterate over the files
                if os.path.splitext(filename)[1] == '.txt':
                    orgFile = os.path.join(pathname, filename)
                    targetFile = os.path.join(new_pathname, filename)
                    print(orgFile)
                    orgCount, newCount = posTagging(orgFile, targetFile)
                    diff = abs(orgCount - newCount)
                    if newCount == 2:  # appears to be a sentinel posTagging returns on failure
                        percent = 'ERROR'
                    else:
                        percent = str(round(diff / newCount * 100)) + '%'
                    # one row per file, matching the five header columns
                    data.append([filename, orgCount, newCount, diff, percent])
            with open(logFile, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(data)
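# mkdir() is a project helper, not shown in this section. A minimal
# sketch, assuming it only needs to create a directory tree when it
# does not already exist:
def mkdir(path):
    os.makedirs(path, exist_ok=True)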
def get_sentences():
    orgDir = "附码语料库"  # tagged corpus
    toDir = "分句语料库"   # sentence-split corpus
    all_dirs, all_files, all_names = getdir(orgDir)
    for i in all_dirs:  # create the matching subdirectories
        mkdir(os.path.join(toDir, i))
    for i in all_files:
        print(i)
        file_route = i.split('\\')  # Windows-style path from getdir()
        file_route[0] = toDir       # re-root the path under toDir
        file = "\\".join(file_route)
        data = splitSentence(i)  # sentence list for this article
        save_file(data, file)
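# getdir() is also defined elsewhere. From its use above it is assumed
# to return three lists: subdirectory paths relative to the root, full
# file paths whose first component is the root directory, and bare file
# names. A hypothetical sketch consistent with that usage (Windows-style
# separators, since the callers split on '\\'):
def getdir(root):
    all_dirs, all_files, all_names = [], [], []
    for dirpath, dirnames, filenames in os.walk(root):
        for d in dirnames:
            all_dirs.append(os.path.relpath(os.path.join(dirpath, d), root))
        for name in filenames:
            all_files.append(os.path.join(dirpath, name))
            all_names.append(name)
    return all_dirs, all_files, all_names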
def extract_trunk():
    orgDir = "分句语料库"   # sentence-split corpus
    toDir = "主干语料库"    # trunk corpus
    vbDir = "句首动词文件"  # sentence-initial verb files
    all_dirs, all_files, all_names = getdir(orgDir)
    for i in all_dirs:  # create the matching subdirectories
        mkdir(os.path.join(toDir, i))
        mkdir(os.path.join(vbDir, i))
    for i in all_files:
        print(i)
        file_route = i.split('\\')
        file_route[0] = toDir
        file = "\\".join(file_route)
        file_route[0] = vbDir
        vbFile = "\\".join(file_route)
        # len(data) is the article's sentence count; data[i] is the
        # space-split token list of sentence i
        data = readtxt(i)
        pre_data, not_trunks = tran_sentense(data)
        print(i + " - number of lines without a trunk:", not_trunks)
        save_to_txt(pre_data, file, vbFile)
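# readtxt() is assumed to load a sentence-per-line file and split each
# line on whitespace, matching the comment above (data[i] is the token
# list of sentence i). A hypothetical sketch under that assumption:
def readtxt(path):
    with open(path, encoding='utf-8') as f:
        return [line.split() for line in f if line.strip()]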
def subsection():
    # tagDir, sectionDir, csvHeader and cutEssay are assumed to be
    # defined at module level (csvHeader is referenced here but is only
    # defined locally inside markPos above)
    mkdir(sectionDir)
    logFile = os.path.join(sectionDir, "log.csv")
    with open(logFile, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csvHeader)
    for subDir in os.listdir(tagDir):
        pathname = os.path.join(tagDir, subDir)
        if os.path.isdir(pathname):
            new_pathname = os.path.join(sectionDir, subDir)
            mkdir(new_pathname)
            for filename in os.listdir(pathname):
                if os.path.splitext(filename)[1] == '.txt':
                    orgFile = os.path.join(pathname, filename)
                    targetDir = os.path.join(new_pathname, os.path.splitext(filename)[0])
                    mkdir(targetDir)
                    cutEssay(orgFile, targetDir, logFile)
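# The four steps above appear to form one corpus-processing pipeline:
# POS tagging, sentence splitting, trunk extraction, then sectioning.
# A hypothetical driver, assuming orgDir/tagDir/sectionDir, csvHeader
# and the helpers (posTagging, splitSentence, save_file, tran_sentense,
# save_to_txt, cutEssay) are defined at module level:
if __name__ == '__main__':
    markPos()        # tag the raw corpus and log word-count differences
    get_sentences()  # split each tagged article into sentences
    extract_trunk()  # extract sentence trunks and sentence-initial verbs
    subsection()     # cut each tagged article into sections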