Ejemplo n.º 1
0
        ws = c.split()
        ws = map(strip, ws)
        dic = Dic()
        dic.from_list(ws)
        dic.tofile(tpath)


def gen_2gram_dic(fpath, tpath):
    print '.. dic: fpath>tpath', fpath, tpath
    with open(fpath) as f:
        c = f.read()
        ws = c.split()
        ws = map(strip, ws)
        two_grams = []
        for i in range(len(ws) - 1):
            wp = "%s-%s" % (ws[i], ws[i + 1])
            #print wp
            two_grams.append(wp)

        dic = Dic()
        dic.from_list(two_grams)
        dic.tofile(tpath)


if __name__ == '__main__':
    # An ngram argument of 1 selects the single-word dictionary; every other
    # integer value selects the 2-gram dictionary.
    ngram, fpath, tpath = args_check(3, "cmd [ngram] [fpath], [tpath]")
    build = gen_dic if int(ngram) == 1 else gen_2gram_dic
    build(fpath, tpath)
Ejemplo n.º 2
0
            self.add_lst(datas)

    def test_scan(self):
        """Load ``self.fpath`` as rows of integers, labelling every row 1.

        One label of 1 per line (as counted by ``get_num_lines``) is passed
        to ``set_labels``. Each line is then split on whitespace, every
        token is run through ``strip`` and ``int``, and the resulting list
        of rows is handed to ``add_lst``.

        Raises ValueError if a token is not an integer literal.
        """
        line_count = get_num_lines(self.fpath)
        self.set_labels([1] * line_count)
        with open(self.fpath) as src:
            rows = [
                [int(strip(token)) for token in raw.split()]
                for raw in src
            ]
            self.add_lst(rows)



if __name__ == '__main__':
    # The first command-line argument selects the mode. Reject a missing or
    # unknown mode up front instead of failing with an IndexError or doing
    # nothing at all.
    args = sys.argv[1:]
    if not args or args[0] not in ("word", "test"):
        sys.exit("usage: cmd word [label_path] [fpath] [tpath]\n"
                 "       cmd test [fpath] [tpath]")
    _type = args[0]

    # The usage strings name exactly the arguments each mode unpacks.
    # NOTE(review): args_check presumably shows this text when the argument
    # count is wrong -- confirm against its definition.
    if _type == "word":
        _type, label_path, fpath, tpath = args_check(
            4, "cmd word [label_path] [fpath] [tpath]")
        w = WordLinear(label_path, fpath, tpath)
        w()
    elif _type == "test":
        # "test" mode has no label file: WordLinear gets None for it and the
        # instance is called with False.
        _type, fpath, tpath = args_check(3, "cmd test [fpath] [tpath]")
        w = WordLinear(None, fpath, tpath)
        w(False)
Ejemplo n.º 3
0
    with open(env.SPLITED_SOURCE_TITLES_PH) as sf:
        with open(env.SPLITED_TARGET_TITLES_PH) as tf:
            for i in range(num_rcds):
                source = sf.readline().decode('gbk','ignore').encode('utf8')
                target = tf.readline().decode('gbk','ignore').encode('utf8')
                source = source.strip().split()
                target = target.strip().split()
                s_rcds = [dic.get(w) for w in source]
                t_rcds = [dic.get(w) for w in target]
                #print s_rcds, t_rcds
                records.append(
                    ' '.join(map(str, s_rcds)) + '\t' +
                    ' '.join(map(str, t_rcds))
                )
                with open(env.WORD_ID_SOURCE_TARGET_PH, 'w') as f:
                    f.write('\n'.join(records))


if __name__ == "__main__":
    args = args_check(1, "cmd [action]")
    # Dispatch table: command-line action name -> zero-argument function.
    actions = {
        'record2db': record2db,
        'gen_target_title_file': gen_target_title_file,
        'gen_source_title_file': gen_source_title_file,
        'gen_word_dic': gen_word_dic,
        'gen_word_id_titles': gen_word_id_titles,
    }
    action = args[0]
    if action not in actions:
        # Name the valid choices instead of surfacing a bare KeyError.
        raise SystemExit("unknown action %r; expected one of: %s"
                         % (action, ', '.join(sorted(actions))))
    actions[action]()
Ejemplo n.º 4
0
        self.trans()
        self.tofile()

    def trans(self):
        """Pair each result label with its test line and collect the rows.

        Reads ``self.fph`` (one label per line) and ``self.test_ph`` in
        lockstep for as many lines as ``get_num_lines(self.test_ph)``
        reports. A label found in ``self.formats`` is replaced by its mapped
        value; any other label is converted with ``int``. Each pair is
        stored in ``self.lines`` as ``"<label>\\t<test line>"``. Processing
        stops early if a label maps to None.

        Raises ValueError if a label is neither in ``self.formats`` nor an
        integer literal.
        """
        num_lines = get_num_lines(self.test_ph)
        self.lines = []
        with open(self.fph) as resf:
            with open(self.test_ph) as testf:
                for _ in range(num_lines):
                    res = resf.readline()
                    tes = testf.readline()
                    label = res.strip()
                    # Look the label up before trying int(): passing
                    # int(label) as the dict.get default evaluates it
                    # eagerly, which raises for non-numeric labels even
                    # when they have a mapping.
                    if label in self.formats:
                        label = self.formats[label]
                    else:
                        label = int(label)
                    if label is None:
                        break
                    line = "%d\t%s" % (label, tes.strip())
                    self.lines.append(line)

    def tofile(self):
        """Write ``self.lines`` to ``self.tph``.

        Rows are separated by newlines; no trailing newline is added.
        """
        with open(self.tph, 'w') as out:
            content = '\n'.join(self.lines)
            out.write(content)



if __name__ == "__main__":
    # Expects three command-line arguments, forwarded to Gen in order.
    # NOTE(review): the second args_check argument is presumably the usage
    # text shown on a wrong argument count -- confirm against its definition.
    fph, test_ph, tph = args_check(3, "cmd [fph] [test_ph] [tph]")
    g = Gen(fph, test_ph, tph)
    g()
Ejemplo n.º 5
0
    print 'num of rcds:', num_rcds
    records = []
    with open(env.SPLITED_SOURCE_TITLES_PH) as sf:
        with open(env.SPLITED_TARGET_TITLES_PH) as tf:
            for i in range(num_rcds):
                source = sf.readline().decode('gbk', 'ignore').encode('utf8')
                target = tf.readline().decode('gbk', 'ignore').encode('utf8')
                source = source.strip().split()
                target = target.strip().split()
                s_rcds = [dic.get(w) for w in source]
                t_rcds = [dic.get(w) for w in target]
                #print s_rcds, t_rcds
                records.append(' '.join(map(str, s_rcds)) + '\t' +
                               ' '.join(map(str, t_rcds)))
                with open(env.WORD_ID_SOURCE_TARGET_PH, 'w') as f:
                    f.write('\n'.join(records))


if __name__ == "__main__":
    args = args_check(1, "cmd [action]")
    # Dispatch table: command-line action name -> zero-argument function.
    actions = {
        'record2db': record2db,
        'gen_target_title_file': gen_target_title_file,
        'gen_source_title_file': gen_source_title_file,
        'gen_word_dic': gen_word_dic,
        'gen_word_id_titles': gen_word_id_titles,
    }
    action = args[0]
    if action not in actions:
        # Name the valid choices instead of surfacing a bare KeyError.
        raise SystemExit("unknown action %r; expected one of: %s"
                         % (action, ', '.join(sorted(actions))))
    actions[action]()
Ejemplo n.º 6
0
Created on Aug 9, 2013

@author: Chunwei Yan @ pkusz
@mail:  [email protected]
'''
from __future__ import division
import os
import sys
sys.path.append('..')

from utils import args_check
import env

def split_word(fpath, tpath):
    """Run the external ``TCWordSeg`` program on ``fpath`` and ``tpath``.

    The program is looked up under ``env.PROJECT_PATH`` and invoked with the
    segmenter data directory, ``fpath`` and ``tpath`` as its arguments, in
    that order. The call blocks until the program exits.

    Returns the program's exit status.
    Raises OSError if the program cannot be started.
    """
    # Imported locally so the block does not depend on the file-level imports.
    import subprocess

    data_dir = os.path.join("/home/chunwei/trunk", "tools/libTCWordSeg3.4.0/data")
    program = os.path.join(env.PROJECT_PATH, 'TCWordSeg')
    # An argument list (no shell) passes paths containing spaces or shell
    # metacharacters through intact, and the child's output goes to this
    # process's stdout instead of an unread pipe.
    return subprocess.call([program, data_dir, fpath, tpath])



if __name__ == "__main__":
    # Expects two command-line arguments: the input (paragraph) path and the
    # output (words) path, as named in the usage string.
    ppath, wpath = args_check(2, "cmd [paragraph_path] [words_path]")

    split_word(ppath, wpath)
    # Stop-word removal on the output is currently disabled:
    #remove_stop_words(wpath)
Ejemplo n.º 7
0
@mail:  [email protected]
'''
from __future__ import division
import os
import sys
sys.path.append('..')

from utils import args_check
import env


def split_word(fpath, tpath):
    """Run the external ``TCWordSeg`` program on ``fpath`` and ``tpath``.

    The program is looked up under ``env.PROJECT_PATH`` and invoked with the
    segmenter data directory, ``fpath`` and ``tpath`` as its arguments, in
    that order. The call blocks until the program exits.

    Returns the program's exit status.
    Raises OSError if the program cannot be started.
    """
    # Imported locally so the block does not depend on the file-level imports.
    import subprocess

    data_dir = os.path.join("/home/chunwei/trunk",
                            "tools/libTCWordSeg3.4.0/data")
    program = os.path.join(env.PROJECT_PATH, 'TCWordSeg')
    # An argument list (no shell) passes paths containing spaces or shell
    # metacharacters through intact, and the child's output goes to this
    # process's stdout instead of an unread pipe.
    return subprocess.call([program, data_dir, fpath, tpath])


if __name__ == "__main__":
    # Expects two command-line arguments: the input (paragraph) path and the
    # output (words) path, as named in the usage string.
    ppath, wpath = args_check(2, "cmd [paragraph_path] [words_path]")

    split_word(ppath, wpath)
    # Stop-word removal on the output is currently disabled:
    #remove_stop_words(wpath)
Ejemplo n.º 8
0
    with open(fpath) as f:
        c = f.read()
        ws = c.split()
        ws = map(strip, ws)
        dic = Dic()
        dic.from_list(ws)
        dic.tofile(tpath)

def gen_2gram_dic(fpath, tpath):
    print '.. dic: fpath>tpath', fpath, tpath
    with open(fpath) as f:
        c = f.read()
        ws = c.split()
        ws = map(strip, ws)
        two_grams = []
        for i in range(len(ws)-1):
            wp = "%s-%s" % (ws[i], ws[i+1])
            #print wp
            two_grams.append(wp)

        dic = Dic()
        dic.from_list(two_grams)
        dic.tofile(tpath)

if __name__ == '__main__':
    # An ngram argument of 1 selects the single-word dictionary; every other
    # integer value selects the 2-gram dictionary.
    ngram, fpath, tpath = args_check(3, "cmd [ngram] [fpath], [tpath]")
    build = gen_dic if int(ngram) == 1 else gen_2gram_dic
    build(fpath, tpath)
Ejemplo n.º 9
0
    def __call__(self):
        """Run the conversion: build the rows with trans(), then write them with tofile()."""
        self.trans()
        self.tofile()

    def trans(self):
        """Pair each result label with its test line and collect the rows.

        Reads ``self.fph`` (one label per line) and ``self.test_ph`` in
        lockstep for as many lines as ``get_num_lines(self.test_ph)``
        reports. A label found in ``self.formats`` is replaced by its mapped
        value; any other label is converted with ``int``. Each pair is
        stored in ``self.lines`` as ``"<label>\\t<test line>"``. Processing
        stops early if a label maps to None.

        Raises ValueError if a label is neither in ``self.formats`` nor an
        integer literal.
        """
        num_lines = get_num_lines(self.test_ph)
        self.lines = []
        with open(self.fph) as resf:
            with open(self.test_ph) as testf:
                for _ in range(num_lines):
                    res = resf.readline()
                    tes = testf.readline()
                    label = res.strip()
                    # Look the label up before trying int(): passing
                    # int(label) as the dict.get default evaluates it
                    # eagerly, which raises for non-numeric labels even
                    # when they have a mapping.
                    if label in self.formats:
                        label = self.formats[label]
                    else:
                        label = int(label)
                    if label is None:
                        break
                    line = "%d\t%s" % (label, tes.strip())
                    self.lines.append(line)

    def tofile(self):
        """Write ``self.lines`` to ``self.tph``.

        Rows are separated by newlines; no trailing newline is added.
        """
        with open(self.tph, 'w') as out:
            content = '\n'.join(self.lines)
            out.write(content)


if __name__ == "__main__":
    # Expects three command-line arguments, forwarded to Gen in order.
    # NOTE(review): the second args_check argument is presumably the usage
    # text shown on a wrong argument count -- confirm against its definition.
    fph, test_ph, tph = args_check(3, "cmd [fph] [test_ph] [tph]")
    g = Gen(fph, test_ph, tph)
    g()
Ejemplo n.º 10
0
    ws = map(strip, ws)
    two_grams = []
    if len(ws) == 1:
        two_grams = ws
    else:
        for i in range(len(ws) - 1):
            word_pair = "%s-%s" % (ws[i], ws[i + 1])
            two_grams.append(word_pair)
    return two_grams


def parse(fpath, tpath):
    """Convert a tab-separated source/target file to its 2-gram form.

    Each line of ``fpath`` must contain exactly one tab. Both halves are
    split on whitespace and passed through ``gen2gram``; the results are
    re-joined with spaces inside each half and a tab between the halves.
    The converted lines are written to ``tpath`` separated by newlines,
    with no trailing newline.

    Raises ValueError if a line does not split into exactly two
    tab-separated fields.
    """
    converted = []
    with open(fpath) as src:
        for raw in src:
            source_part, target_part = raw.split('\t')
            source_grams = gen2gram(source_part.split())
            target_grams = gen2gram(target_part.split())
            converted.append(
                '\t'.join([' '.join(source_grams), ' '.join(target_grams)]))

    with open(tpath, 'w') as out:
        out.write('\n'.join(converted))


if __name__ == '__main__':
    # Expects two command-line arguments: the input path and the output path.
    # NOTE(review): the second args_check argument is presumably the usage
    # text shown on a wrong argument count -- confirm against its definition.
    fpath, tpath = args_check(2, "cmd [fpath] [tpath]")
    parse(fpath, tpath)
Ejemplo n.º 11
0
            self.add_lst(datas)

    def test_scan(self):
        """Load ``self.fpath`` as rows of integers, labelling every row 1.

        One label of 1 per line (as counted by ``get_num_lines``) is passed
        to ``set_labels``. Each line is then split on whitespace, every
        token is run through ``strip`` and ``int``, and the resulting list
        of rows is handed to ``add_lst``.

        Raises ValueError if a token is not an integer literal.
        """
        line_count = get_num_lines(self.fpath)
        self.set_labels([1] * line_count)
        with open(self.fpath) as src:
            rows = [
                [int(strip(token)) for token in raw.split()]
                for raw in src
            ]
            self.add_lst(rows)


if __name__ == '__main__':
    # The first command-line argument selects the mode. Reject a missing or
    # unknown mode up front instead of failing with an IndexError or doing
    # nothing at all.
    args = sys.argv[1:]
    if not args or args[0] not in ("word", "test"):
        sys.exit("usage: cmd word [label_path] [fpath] [tpath]\n"
                 "       cmd test [fpath] [tpath]")
    _type = args[0]

    # The usage strings name exactly the arguments each mode unpacks.
    # NOTE(review): args_check presumably shows this text when the argument
    # count is wrong -- confirm against its definition.
    if _type == "word":
        _type, label_path, fpath, tpath = args_check(
            4, "cmd word [label_path] [fpath] [tpath]")
        w = WordLinear(label_path, fpath, tpath)
        w()
    elif _type == "test":
        # "test" mode has no label file: WordLinear gets None for it and the
        # instance is called with False.
        _type, fpath, tpath = args_check(3, "cmd test [fpath] [tpath]")
        w = WordLinear(None, fpath, tpath)
        w(False)