def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    """Build the input-side and output-side token lists for paired sequences.

    When ``dict_file`` is given and already exists on disk, its entries are
    loaded with ``ljqpy.LoadList``: everything before the ``'<@@@>'`` entry
    becomes the input list and everything after it the output list.

    Otherwise the rows returned by ``ljqpy.LoadCSV(fn)`` are scanned. The
    first field of each row feeds the input counts and the second field the
    output counts (any further fields are ignored). Tokens whose count is at
    least ``min_freq`` are kept, in the order ``ljqpy.FreqDict2List`` yields
    them. If ``dict_file`` is given, both lists are then saved to it, joined
    by a ``'<@@@>'`` entry.

    Args:
        fn: Path handed to ``ljqpy.LoadCSV``; only used when no existing
            ``dict_file`` is loaded.
        min_freq: Minimum occurrence count for a token to be kept.
        delimiter: Separator passed to ``str.split`` to tokenize each field.
        dict_file: Optional path of the cached token list to load or create.

    Returns:
        A ``(itokens, otokens)`` pair of ``TokenList`` objects.
    """
    separator = '<@@@>'
    caching = dict_file is not None

    # Fast path: a previously saved vocabulary is reused as-is.
    if caching and os.path.exists(dict_file):
        print('loading', dict_file)
        cached = ljqpy.LoadList(dict_file)
        split_at = cached.index(separator)
        return TokenList(cached[:split_at]), TokenList(cached[split_at + 1:])

    # Slow path: count token occurrences separately for the two columns.
    src_counts, tgt_counts = {}, {}
    for row in ljqpy.LoadCSV(fn):
        # zip stops after two fields because there are only two count dicts.
        for field, counts in zip(row, (src_counts, tgt_counts)):
            for token in field.split(delimiter):
                counts[token] = counts.get(token, 0) + 1

    # Drop tokens seen fewer than min_freq times.
    src_vocab, tgt_vocab = [
        [token for token, freq in ljqpy.FreqDict2List(counts) if freq >= min_freq]
        for counts in (src_counts, tgt_counts)
    ]
    print('seq 1 words:', len(src_vocab))
    print('seq 2 words:', len(tgt_vocab))

    itokens = TokenList(src_vocab)
    otokens = TokenList(tgt_vocab)
    if caching:
        ljqpy.SaveList(src_vocab + [separator] + tgt_vocab, dict_file)
    return itokens, otokens
Esempio n. 2
0
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
	'''
	Build the word or char lists for the input and the output sequence.

	:param fn: path passed to ljqpy.LoadCSV; each row's first field is
		counted as input-sequence text and its second field as
		output-sequence text
	:param min_freq: tokens occurring fewer times than this are dropped
	:param delimiter: separator given to str.split to cut a field into tokens
	:param dict_file: optional path of a saved token list; loaded when it
		exists, otherwise written after the lists have been built
	:return: tuple (itokens, otokens) of TokenList objects
	'''
	marker = '<@@@>'
	has_dict_path = dict_file is not None

	# An existing word/char list is reused instead of being rebuilt.
	if has_dict_path and os.path.exists(dict_file):
		print('loading', dict_file)
		entries = ljqpy.LoadList(dict_file)
		cut = entries.index(marker)
		# Entries before the marker are input tokens, those after are output tokens.
		itokens = TokenList(entries[:cut])
		otokens = TokenList(entries[cut+1:])
		return itokens, otokens

	# No saved list: rebuild it from the data file.
	rows = ljqpy.LoadCSV(fn)
	freq_tables = [{}, {}]
	for row in rows:
		# Pair field 0 with table 0 and field 1 with table 1; extra fields are ignored.
		for text, table in zip(row, freq_tables):
			for tok in text.split(delimiter):
				if tok in table:
					table[tok] += 1
				else:
					table[tok] = 1

	kept = []
	for table in freq_tables:
		frequent = []
		for tok, count in ljqpy.FreqDict2List(table):
			if count >= min_freq:
				frequent.append(tok)
		kept.append(frequent)
	in_words, out_words = kept

	print('seq 1 words:', len(in_words))
	print('seq 2 words:', len(out_words))
	itokens = TokenList(in_words)
	otokens = TokenList(out_words)
	if has_dict_path:
		# Both lists go into one file, separated by the marker entry.
		ljqpy.SaveList(in_words+[marker]+out_words, dict_file)
	return itokens, otokens
Esempio n. 3
0
def MakeMerged():
    """Collect the 'text' field of every record into one merged text file.

    Each entry loaded from 'training/all_data.txt' is parsed as JSON, and
    its 'text' value is kept. The resulting list is saved to
    'training/merged_text.txt' with ``ljqpy.SaveList``.
    """
    source_path = 'training/all_data.txt'
    merged_path = 'training/merged_text.txt'
    texts = [json.loads(entry)['text'] for entry in ljqpy.LoadList(source_path)]
    ljqpy.SaveList(texts, merged_path)
Esempio n. 4
0
 def save_phrases(self):
     """Save ``self.newtags`` to 'training/phrases.txt' via ``ljqpy.SaveList``."""
     destination = 'training/phrases.txt'
     ljqpy.SaveList(self.newtags, destination)