Example #1
# These snippets are excerpted from a larger module: `mkdir`, `posTagging` and the
# other lowercase helpers, as well as the path variables `orgDir`, `tagDir` and
# `sectionDir`, are defined elsewhere in that module.
import csv
import os
import time

def markPos():
    # CSV columns: filename, source word count, tagged-file word count,
    # word-count difference, difference ratio
    csvHeader = ["文件名", "源文件词数", "附码文件词数", "词数差", "差词占比"]
    for subDir in os.listdir(orgDir):  # subdirectories of the source corpus
        data = []
        pathname = os.path.join(orgDir, subDir)
        if os.path.isdir(pathname):
            new_pathname = os.path.join(tagDir, subDir)
            mkdir(new_pathname)
            logFile = os.path.join(new_pathname, "log.csv")
            with open(logFile, "w", newline='') as f:  # no explicit close needed inside `with`
                writer = csv.writer(f)
                now = time.strftime("%Y-%m-%d %H:%M:%S")
                writer.writerows([[now]])
                writer.writerows([csvHeader])
            for filename in os.listdir(pathname):  # iterate over the files
                if os.path.splitext(filename)[1][1:] == 'txt':
                    orgFile = os.path.join(pathname, filename)
                    targetFile = os.path.join(new_pathname, filename)
                    print(orgFile)
                    orgCount, newCount = posTagging(orgFile, targetFile)
                    if newCount == 2:  # apparently an error sentinel from posTagging
                        diff, percent = '', 'ERROR'
                    else:
                        diff = abs(orgCount - newCount)
                        percent = str(round(diff / newCount * 100)) + '%'
                    # one row per file, matching the five columns of csvHeader
                    data.append([filename, orgCount, newCount, diff, percent])
            with open(logFile, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(data)
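Every example on this page calls a mkdir helper that is not shown here. A minimal sketch, assuming it simply creates the directory (and any missing parents) when it does not already exist, could look like this; the implementation is an assumption, not the original code:

import os

def mkdir(path):
    # Hypothetical helper assumed by the examples above: create the directory
    # tree if it is missing, and do nothing if it already exists.
    if not os.path.exists(path):
        os.makedirs(path)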
Example #2
def get_sentences():
    orgDir = "附码语料库"   # POS-tagged corpus
    toDir  = "分句语料库"   # sentence-split corpus
    all_dirs, all_files, all_names = getdir(orgDir)
    for i in all_dirs:  # recreate the subdirectory tree under toDir
        mkdir(os.path.join(toDir, i))
    for i in all_files:
        print(i)
        # Map the input path onto the output root by swapping the first path
        # component; splitting on '\\' makes this Windows-specific.
        file_route = i.split('\\')
        file_route[0] = toDir
        file = "\\".join(file_route)
        data = splitSentence(i)  # list of sentences for this article
        save_file(data, file)
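The path mapping above, like the one in the next example, only works with Windows-style separators. A portable sketch of the same mapping using os.path (an alternative shown for illustration, not the original code) would be:

import os

def map_to_dir(src_path, src_root, dst_root):
    # Replace the leading src_root component of src_path with dst_root,
    # letting os.path handle the platform's separator.
    rel = os.path.relpath(src_path, src_root)
    return os.path.join(dst_root, rel)

# e.g. map_to_dir(r"附码语料库\sub\essay.txt", "附码语料库", "分句语料库")
# yields the corresponding path under 分句语料库 (file name is hypothetical).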
Example #3
def extract_trunk():
    orgDir = "分句语料库"   # sentence-split corpus
    toDir  = "主干语料库"   # sentence-trunk corpus
    vbDir  = "句首动词文件"  # sentence-initial verb files
    all_dirs, all_files, all_names = getdir(orgDir)
    for i in all_dirs:  # recreate the subdirectory tree under both output roots
        mkdir(os.path.join(toDir, i))
        mkdir(os.path.join(vbDir, i))
    for i in all_files:
        print(i)
        # Map the input path onto the two output roots (Windows-style separators).
        file_route = i.split('\\')
        file_route[0] = toDir
        file = "\\".join(file_route)
        file_route[0] = vbDir
        vbFile = "\\".join(file_route)
        data = readtxt(i)  # len(data) = number of sentences; data[i] is one sentence split into a word list
        pre_data, not_trunks = tran_sentense(data)
        print(i + " lines with no extractable trunk:", not_trunks)
        save_to_txt(pre_data, file, vbFile)
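Examples #2 and #3 both rely on a getdir helper that returns subdirectories, file paths and file names for a corpus root. Judging only from how its return values are used above, a plausible reconstruction based on os.walk (an assumption, not the original helper) is:

import os

def getdir(root):
    # Collect subdirectory paths relative to root, file paths that start
    # with root, and bare file names, matching how the examples consume them.
    all_dirs, all_files, all_names = [], [], []
    for dirpath, dirnames, filenames in os.walk(root):
        for d in dirnames:
            all_dirs.append(os.path.relpath(os.path.join(dirpath, d), root))
        for name in filenames:
            all_files.append(os.path.join(dirpath, name))
            all_names.append(name)
    return all_dirs, all_files, all_names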
Example #4
def subsection():
    mkdir(sectionDir)
    logFile = os.path.join(sectionDir, "log.csv")
    with open(logFile, "w", newline='') as f:
        writer = csv.writer(f)
        now = time.strftime("%Y-%m-%d %H:%M:%S")
        # writer.writerows([[now]])  # timestamp row, currently disabled
        writer.writerows([csvHeader])

    for subDir in os.listdir(tagDir):
        pathname = os.path.join(tagDir, subDir)
        if os.path.isdir(pathname):
            new_pathname = os.path.join(sectionDir, subDir)
            mkdir(new_pathname)
            for filename in os.listdir(pathname):
                if os.path.splitext(filename)[1] == '.txt':
                    orgFile = os.path.join(pathname, filename)
                    # one output directory per essay, named after the file stem
                    targetDir = os.path.join(new_pathname,
                                             filename.split('.')[0])
                    mkdir(targetDir)
                    cutEssay(orgFile, targetDir, logFile)
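Taken together, the four functions read like stages of a corpus-processing pipeline. A minimal driver, assuming they live in the same module and that this is the intended order (POS tagging, sentence splitting, trunk extraction, sectioning), might be:

if __name__ == "__main__":
    # Assumed pipeline order; adjust if the original project runs the
    # stages separately.
    markPos()
    get_sentences()
    extract_trunk()
    subsection()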