Beispiel #1
0
def pos_category():
	"""Segment every ``*.txt`` file under ``sys.argv[1]`` into ``sys.argv[2]``.

	For each input file an output file with the same base name is created in
	the directory given by ``sys.argv[2]``. Every input line is split on single
	spaces; each non-empty token is passed to ``segment`` and the resulting
	``(name, tp)`` pairs are written as ``name:tp<TAB>`` entries, followed by a
	newline per token.

	Tokens for which ``segment`` returns the string ``'Fail'`` are reported on
	stdout and skipped (no output line is written for them).
	"""
	for src_path in glob.glob(sys.argv[1] + '/*.txt'):
		print('process %s...' % src_path)
		# Only the base name matters; it is reused for the output file. The
		# file name is no longer parsed for a numeric index: that value was
		# never used and made any name without "_<int>" crash the whole run.
		dst_path = os.path.join(sys.argv[2], os.path.basename(src_path))
		# Both handles live in ``with`` blocks so the output file is closed
		# even when segmentation or a write raises part-way through.
		with open(dst_path, 'w') as dst:
			with open(src_path, 'r') as src:
				for line in src:
					for token in line.strip().split(' '):
						token = token.strip()
						if not token:
							# Consecutive spaces produce empty tokens.
							continue
						ans = segment(token)
						# 'Fail' is the sentinel checked for a failed/empty result.
						if ans == 'Fail':
							print('error')
							print(token)
							continue
						for name, tp in ans:
							# Written piecewise so the encoded name is never
							# mixed with ``tp`` in one format operation.
							dst.write(name.encode('utf-8'))
							dst.write(':')
							dst.write('%s\t' % tp)
						dst.write('\n')
Beispiel #2
0
def parseFile(fileData):
    """Tokenize each line of *fileData* with ``segment``.

    Returns a list with one entry per input line. Each entry is a list that
    starts with an empty string, followed by the UTF-8 encoded name of every
    ``(name, tp)`` pair that ``segment`` yields for that line.
    """
    parsed_lines = []
    for current_line in fileData:
        encoded_names = [
            word.encode("UTF-8") for word, _tag in segment(current_line)
        ]
        # The leading '' placeholder is part of the returned shape.
        parsed_lines.append([''] + encoded_names)
    return parsed_lines
Beispiel #3
0
def get_sentence_pos():
	"""Segment every ``*.txt`` file under ``sys.argv[1]`` line by line.

	For each input file an output file with the same base name is created in
	the directory given by ``sys.argv[2]``. Every non-blank input line is
	echoed to stdout, stripped, and passed to ``segment``; the resulting
	``(name, tp)`` pairs are written as ``name:tp<TAB>`` entries followed by a
	newline.

	Lines for which ``segment`` returns the string ``'Fail'`` are reported on
	stdout and skipped instead of breaking the pair-unpacking loop.
	"""
	for src_path in glob.glob(sys.argv[1] + '/*.txt'):
		print('process %s...' % src_path)
		dst_path = os.path.join(sys.argv[2], os.path.basename(src_path))
		# ``with`` guarantees the output file is closed even when segment()
		# or a write raises in the middle of a file.
		with open(dst_path, 'w') as dst:
			with open(src_path, 'r') as src:
				for line in src:
					sentence = line.strip()
					if not sentence:
						continue
					print(line)
					ans = segment(sentence)
					# segment() is treated as returning the string 'Fail' on
					# error; iterating that string as pairs would raise.
					if ans == 'Fail':
						print('error')
						print(sentence)
						continue
					for name, tp in ans:
						dst.write(name.encode('utf-8'))
						dst.write(':')
						dst.write('%s\t' % tp)
					dst.write('\n')
	# Nothing to return; results are written to the output files.
	return
    # NOTE(review): these statements are indented like a body, but no enclosing
    # `def`/`if` header is visible at this indentation level — confirm where
    # this code belongs before relying on it.
    # Ten word -> count dicts, indexed by the integer ids parsed from each line.
    vocab_list = []

    label = 'test'
    for i in range(10):
        vocab_list.append({})
    # Scan every .txt file in the directory named by `label`.
    for file in glob.glob(label+'/*.txt'):
    #for file in glob.glob('*.txt'):
        with open(file, 'r') as f:
            print file
            for line in f:
                line_arr = line.strip().split(" ")
                # Only handle lines whose first field compares (as a string)
                # between '1' and '7' and whose second raw character is ',' or ' '.
                if line_arr[0] >= '1' and line_arr[0] <= '7' and (line[1] == ',' or line[1] == ' '):
                    # The first field is a comma-separated list of ids.
                    id_list = line_arr[0].split(',')
                    # Every remaining field is segmented on its own.
                    for i in range(1, len(line_arr)):
                        #print(line_arr[i])
                        ans = segment(line_arr[i].strip())
                        #print(ans)
                        # Increment the count of each segmented name under every listed id.
                        for name, tp in ans:
                            for j in id_list:
                                if name in vocab_list[int(j)]:
                                    vocab_list[int(j)][name] += 1
                                else:
                                    vocab_list[int(j)][name] = 1


    # Print the collected counts for ids 1 through 7.
    for i in range(1, 8):
        print "Dict in " + str(i)
        # NOTE(review): this file is opened for writing, but nothing in the
        # visible code writes to it or closes it; the counts only go to stdout.
        output = open(label+'_'+str(i)+'.txt', 'w')

        for key, val in vocab_list[i].items():
            print key.encode("UTF-8"), val
Beispiel #5
0
# coding=UTF-8
from ai import segment
import sys
import glob

if __name__ == '__main__':
    # Script entry point: count, per category id, how often each segmented
    # word occurs in the *.txt files under sys.argv[1], then print the counts
    # for ids 1..7.
    #
    # A line is counted when its first space-separated field compares (as a
    # string) between '1' and '7' and its second character is ',' or ' '.
    # That first field is a comma-separated list of ids; every other field is
    # passed to segment().
    # NOTE(review): because the whole first field is compared as a string, a
    # field such as "7,1" sorts after '7' and is skipped — confirm this is
    # intended.

    # One word -> count dict per id; sized 10 so ids 0..9 are all indexable.
    vocab_list = [{} for _ in range(10)]
    for path in glob.glob(sys.argv[1] + '/*.txt'):
        with open(path, 'r') as f:
            print(path)
            for line in f:
                line_arr = line.split(" ")
                head = line_arr[0]
                # line[1:2] is '' for a one-character line instead of raising
                # IndexError like line[1] would.
                if not ('1' <= head <= '7' and line[1:2] in (',', ' ')):
                    continue
                # Convert the ids once per line rather than once per word;
                # empty entries (e.g. from a trailing comma) are ignored.
                ids = [int(j) for j in head.split(',') if j.strip()]
                for token in line_arr[1:]:
                    # Strip so the last token does not carry the newline, and
                    # skip the empty tokens produced by consecutive spaces.
                    token = token.strip()
                    if not token:
                        continue
                    ans = segment(token)
                    # segment() is assumed to return the string 'Fail' when it
                    # cannot process a token — TODO confirm against ai.segment.
                    if ans == 'Fail':
                        continue
                    for name, tp in ans:
                        for j in ids:
                            counts = vocab_list[j]
                            counts[name] = counts.get(name, 0) + 1

    for i in range(1, 8):
        print("Dict in " + str(i))
        for key, val in vocab_list[i].items():
            print("%s %s" % (key.encode("UTF-8"), val))
        print("\n")