def pos_category():
    """Segment every ``*.txt`` file of an input directory chunk by chunk.

    The input directory is read from ``sys.argv[1]`` and the output
    directory from ``sys.argv[2]``.  Each input file is rewritten under the
    same base name in the output directory.  Every input line is split on
    single spaces; each non-blank chunk is passed to ``segment()`` and each
    returned ``(name, tp)`` pair is written as ``name:tp`` followed by a tab.
    Exactly one output line is written per input line (blank input lines
    produce an empty output line).

    Chunks for which ``segment()`` returns the string ``'Fail'`` are
    reported on stdout and skipped.
    """
    for src_path in glob.glob(sys.argv[1] + '/*.txt'):
        print('process %s...' % src_path)
        dst_path = os.path.join(sys.argv[2], os.path.basename(src_path))
        # Both handles are context-managed so the output file is flushed and
        # closed even if segment() or a write raises part-way through.
        with open(dst_path, 'w') as g:
            with open(src_path, 'r') as f:
                for line in f:
                    for chunk in line.strip().split(' '):
                        chunk = chunk.strip()
                        if not chunk:
                            continue
                        ans = segment(chunk)
                        # segment() returned nothing usable / errored.
                        if ans == 'Fail':
                            print('error')
                            print(chunk)
                            continue
                        for name, tp in ans:
                            # Written piecewise on purpose: the encoded name
                            # is a byte string and must not be merged into a
                            # format string together with ``tp``.
                            g.write(name.encode('utf-8'))
                            g.write(':')
                            g.write('%s\t' % tp)
                    g.write('\n')
def parseFile(fileData):
    """Tokenise each line of ``fileData`` with ``segment()``.

    Returns a list with one entry per input line.  Each entry is a list that
    starts with an empty-string placeholder, followed by the UTF-8 encoded
    first element of every pair that ``segment()`` yields for that line.
    """
    parsed_rows = []
    for raw_line in fileData:
        encoded_tokens = [
            token.encode("UTF-8") for token, _tag in segment(raw_line)
        ]
        parsed_rows.append([''] + encoded_tokens)
    return parsed_rows
def get_sentence_pos():
    """Segment every ``*.txt`` file of an input directory sentence by sentence.

    The input directory is read from ``sys.argv[1]`` and the output
    directory from ``sys.argv[2]``.  Each input file is rewritten under the
    same base name in the output directory.  Every non-blank input line is
    stripped and passed whole to ``segment()``; each returned ``(name, tp)``
    pair is written as ``name:tp`` followed by a tab, and one output line is
    written per non-blank input line.  Blank input lines are skipped.
    """
    for src_path in glob.glob(sys.argv[1] + '/*.txt'):
        print('process %s...' % src_path)
        dst_path = os.path.join(sys.argv[2], os.path.basename(src_path))
        # Both handles are context-managed so the output file is flushed and
        # closed even if segment() or a write raises part-way through.
        with open(dst_path, 'w') as g:
            with open(src_path, 'r') as f:
                for line in f:
                    sentence = line.strip()
                    if not sentence:
                        continue
                    print(line)
                    ans = segment(sentence)
                    if ans == 'Fail':
                        # pos_category() treats the string 'Fail' as the
                        # failure result of segment(); unpacking it as pairs
                        # would raise, so report it and emit an empty output
                        # line to keep input and output lines aligned.
                        print('error')
                        print(sentence)
                    else:
                        for name, tp in ans:
                            # Written piecewise on purpose: the encoded name
                            # is a byte string and must not be merged into a
                            # format string together with ``tp``.
                            g.write(name.encode('utf-8'))
                            g.write(':')
                            g.write('%s\t' % tp)
                    g.write('\n')
# Per-category token counts: vocab_list[k] maps a token returned by segment()
# to the number of times it was seen on lines tagged with category id k.
# Ten slots are allocated; only ids 1..7 are reported below.
vocab_list = [{} for _ in range(10)]
label = 'test'

for path in glob.glob(label + '/*.txt'):
    with open(path, 'r') as f:
        print(path)
        for line in f:
            line_arr = line.strip().split(" ")
            # A counted line begins with a comma-separated id list: the first
            # field must compare (as a string) between '1' and '7', and the
            # second character of the raw line must be ',' or ' '.  Slicing
            # (line[1:2]) keeps a one-character final line from raising.
            if not ('1' <= line_arr[0] <= '7' and line[1:2] in (',', ' ')):
                continue
            id_list = line_arr[0].split(',')
            for chunk in line_arr[1:]:
                ans = segment(chunk.strip())
                for name, tp in ans:
                    for j in id_list:
                        counts = vocab_list[int(j)]
                        counts[name] = counts.get(name, 0) + 1

for i in range(1, 8):
    print("Dict in " + str(i))
    # The per-category file used to be opened but never written or closed;
    # write the same "token count" rows to it that are echoed on stdout.
    with open(label + '_' + str(i) + '.txt', 'w') as output:
        for key, val in vocab_list[i].items():
            row = '%s %s' % (key.encode("UTF-8"), val)
            print(row)
            output.write(row + '\n')
# coding=UTF-8
from ai import segment
import sys
import glob


def main():
    """Count segmented tokens per category id and print the counts.

    Reads every ``*.txt`` file in the directory given as ``sys.argv[1]``.
    A line is counted when its first space-separated field compares (as a
    string) between '1' and '7' and the second character of the raw line is
    ',' or ' '.  That first field is split on ',' into category ids, and
    every remaining field is passed to ``segment()``.  Each returned
    ``(name, tp)`` pair increments the count of ``name`` for every listed id.
    The counts for ids 1..7 are then printed to stdout.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s INPUT_DIR' % sys.argv[0])

    # Ten slots are allocated; only ids 1..7 are reported below.
    vocab_list = [{} for _ in range(10)]

    for path in glob.glob(sys.argv[1] + '/*.txt'):
        with open(path, 'r') as f:
            print(path)
            for line in f:
                # Strip first so the trailing newline is never handed to
                # segment() as part of the last token.
                line_arr = line.strip().split(" ")
                # Slicing (line[1:2]) keeps a one-character final line from
                # raising IndexError.
                if not ('1' <= line_arr[0] <= '7'
                        and line[1:2] in (',', ' ')):
                    continue
                id_list = line_arr[0].split(',')
                for chunk in line_arr[1:]:
                    ans = segment(chunk.strip())
                    for name, tp in ans:
                        for j in id_list:
                            counts = vocab_list[int(j)]
                            counts[name] = counts.get(name, 0) + 1

    for i in range(1, 8):
        print("Dict in " + str(i))
        for key, val in vocab_list[i].items():
            print('%s %s' % (key.encode("UTF-8"), val))
        print("\n")


if __name__ == '__main__':
    main()