    print 'num of rcds:', num_rcds
    records = []
    with open(env.SPLITED_SOURCE_TITLES_PH) as sf:
        with open(env.SPLITED_TARGET_TITLES_PH) as tf:
            for i in range(num_rcds):
                source = sf.readline().decode('gbk', 'ignore').encode('utf8')
                target = tf.readline().decode('gbk', 'ignore').encode('utf8')
                source = source.strip().split()
                target = target.strip().split()
                s_rcds = [dic.get(w) for w in source]
                t_rcds = [dic.get(w) for w in target]
                #print s_rcds, t_rcds
                records.append(' '.join(map(str, s_rcds)) + '\t' + ' '.join(map(str, t_rcds)))
    with open(env.WORD_ID_SOURCE_TARGET_PH, 'w') as f:
        f.write('\n'.join(records))


if __name__ == "__main__":
    args = args_check(1, "cmd [action]")
    #record2db()
    actions = {
        'record2db': record2db,
        'gen_target_title_file': gen_target_title_file,
        'gen_source_title_file': gen_source_title_file,
        'gen_word_dic': gen_word_dic,
        'gen_word_id_titles': gen_word_id_titles,
    }
    action = args[0]
    actions[action]()
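# Hedged sketch of how the file written above to env.WORD_ID_SOURCE_TARGET_PH
# can be read back: each line holds the source title's word ids, a tab, then
# the target title's word ids, all space-separated. The helper name is
# hypothetical and it assumes every id was written as a plain integer.
def load_word_id_titles(path):
    pairs = []
    with open(path) as f:
        for line in f:
            source_part, target_part = line.rstrip('\n').split('\t')
            s_ids = [int(x) for x in source_part.split()]
            t_ids = [int(x) for x in target_part.split()]
            pairs.append((s_ids, t_ids))
    return pairs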
'''
Created on Aug 9, 2013

@author: Chunwei Yan @ pkusz
@mail: [email protected]
'''
from __future__ import division
import os
import sys
sys.path.append('..')

from utils import args_check
import env


def split_word(fpath, tpath):
    # run the TCWordSeg segmenter over fpath and write the segmented words to tpath
    data_dir = os.path.join("/home/chunwei/trunk",
                            "tools/libTCWordSeg3.4.0/data")
    program = os.path.join(env.PROJECT_PATH, 'TCWordSeg')
    cmd = ' '.join([
        program,
        data_dir,
        fpath,
        tpath,
    ])
    # print the segmenter's output rather than the file object itself
    print os.popen(cmd).read()


if __name__ == "__main__":
    ppath, wpath = args_check(2, "cmd [paragraph_path] [words_path]")
    split_word(ppath, wpath)
    #remove_stop_words(wpath)
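# Hedged sketch: args_check comes from utils and is not shown in this section.
# Judging from the call sites in these scripts (args_check(n, usage) returning
# exactly n positional arguments, e.g. ppath, wpath = args_check(2, ...)), a
# minimal compatible stand-in could look like this; the real utils.args_check
# may differ.
import sys

def args_check(num, usage):
    # return sys.argv[1:] when the expected number of arguments is present,
    # otherwise print the usage string and abort
    args = sys.argv[1:]
    if len(args) != num:
        print usage
        sys.exit(1)
    return args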
def gen_dic(fpath, tpath):
    # build a word dictionary (unigram vocabulary) from fpath and dump it to tpath
    with open(fpath) as f:
        c = f.read()
    ws = c.split()
    ws = map(strip, ws)
    dic = Dic()
    dic.from_list(ws)
    dic.tofile(tpath)


def gen_2gram_dic(fpath, tpath):
    # build a dictionary of adjacent word pairs ("w1-w2" bigrams) and dump it to tpath
    print '.. dic: fpath>tpath', fpath, tpath
    with open(fpath) as f:
        c = f.read()
    ws = c.split()
    ws = map(strip, ws)
    two_grams = []
    for i in range(len(ws) - 1):
        wp = "%s-%s" % (ws[i], ws[i + 1])
        #print wp
        two_grams.append(wp)
    dic = Dic()
    dic.from_list(two_grams)
    dic.tofile(tpath)


if __name__ == '__main__':
    ngram, fpath, tpath = args_check(3, "cmd [ngram] [fpath] [tpath]")
    if int(ngram) == 1:
        gen_dic(fpath, tpath)
    else:
        gen_2gram_dic(fpath, tpath)
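# Hedged sketch: the Dic class used above is defined elsewhere in the repo.
# From its usage (from_list over a word list, get(word) returning an integer
# id in the word-id script, tofile(path)), a minimal compatible stand-in could
# look like this; the id assignment and on-disk format are assumptions.
class Dic(object):
    def __init__(self):
        self.word2id = {}

    def from_list(self, words):
        # give each previously unseen word the next free integer id
        for w in words:
            if w not in self.word2id:
                self.word2id[w] = len(self.word2id)

    def get(self, word):
        # unknown words map to -1 here; the real Dic may behave differently
        return self.word2id.get(word, -1)

    def tofile(self, path):
        # persist as "word<TAB>id" lines
        with open(path, 'w') as f:
            lines = ['%s\t%d' % (w, i) for w, i in self.word2id.items()]
            f.write('\n'.join(lines))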
    def __call__(self):
        self.trans()
        self.tofile()

    def trans(self):
        num_lines = get_num_lines(self.test_ph)
        self.lines = []
        with open(self.fph) as resf:
            with open(self.test_ph) as testf:
                for i in range(num_lines):
                    #print i
                    res = resf.readline()
                    tes = testf.readline()
                    label = res.strip()
                    # map non-numeric result labels through self.formats,
                    # falling back to the label parsed as an integer
                    label = self.formats.get(label, int(label))
                    if label is None:
                        break
                    #print 'label:', label
                    line = "%d\t%s" % (label, tes.strip())
                    self.lines.append(line)

    def tofile(self):
        with open(self.tph, 'w') as f:
            f.write('\n'.join(self.lines))


if __name__ == "__main__":
    fph, test_ph, tph = args_check(3, "")
    g = Gen(fph, test_ph, tph)
    g()
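# Hedged sketch: get_num_lines is imported from a shared module and not shown
# here. Its usage in Gen.trans above (and in WordLinear.test_scan below)
# suggests it simply counts the lines of a file, for example:
def get_num_lines(path):
    with open(path) as f:
        return sum(1 for _ in f)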
def gen2gram(ws):
    # turn a list of words into a list of adjacent "w1-w2" pairs;
    # a single-word list is returned as-is
    ws = map(strip, ws)
    two_grams = []
    if len(ws) == 1:
        two_grams = ws
    else:
        for i in range(len(ws) - 1):
            word_pair = "%s-%s" % (ws[i], ws[i + 1])
            two_grams.append(word_pair)
    return two_grams


def parse(fpath, tpath):
    # split each tab-separated line into its two sides, replace the words of
    # both sides with their 2-gram pairs, and write the result to tpath
    lines = []
    with open(fpath) as f:
        for line in f.readlines():
            s, t = line.split('\t')
            ss = gen2gram(s.split())
            ts = gen2gram(t.split())
            line = ' '.join(ss) + '\t' + ' '.join(ts)
            lines.append(line)
    with open(tpath, 'w') as f:
        f.write('\n'.join(lines))


if __name__ == '__main__':
    fpath, tpath = args_check(2, "")
    parse(fpath, tpath)
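# Quick illustration of gen2gram as defined above (assuming the truncated file
# header supplies strip, e.g. from string import strip): a single token passes
# through unchanged, longer token lists become adjacent "w1-w2" pairs.
#
#   gen2gram(['deep'])                      -> ['deep']
#   gen2gram(['deep', 'learning', 'model']) -> ['deep-learning', 'learning-model']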
        self.add_lst(datas)

    def test_scan(self):
        num_lines = get_num_lines(self.fpath)
        #self.lst = [ [] for i in range(num_lines)]
        # at test time every record gets a dummy label of 1
        labels = [1 for i in range(num_lines)]
        self.set_labels(labels)
        with open(self.fpath) as f:
            datas = []
            for line in f.readlines():
                ws = line.split()
                ws = map(strip, ws)
                ws = map(int, ws)
                datas.append(ws)
            self.add_lst(datas)


if __name__ == '__main__':
    args = sys.argv[1:]
    _type = args[0]
    if _type == "word":
        _type, label_path, fpath, tpath = args_check(
            4, "cmd [label_path] [fpath] [tpath]")
        w = WordLinear(label_path, fpath, tpath)
        w()
    elif _type == "test":
        _type, fpath, tpath = args_check(3, "cmd [fpath] [tpath]")
        w = WordLinear(None, fpath, tpath)
        w(False)
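# Hedged sketch of the input WordLinear.test_scan expects: one record per line,
# whitespace-separated integer feature/word ids (each line is fed through
# map(int, ...) above). The file name below is only for illustration.
sample_lines = [
    '12 7 93 5',
    '4 88 2',
]
with open('sample_test_input.txt', 'w') as f:
    f.write('\n'.join(sample_lines))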