import os
import json

import ljqpy


def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    '''
    Build the word/char token lists for the input and output sequences.
    :param fn: corpus file of (source, target) sequence pairs, loaded via ljqpy.LoadCSV
    :param min_freq: minimum frequency for a token to be kept in the vocabulary
    :param delimiter: delimiter used to split each sequence into tokens
    :param dict_file: optional cache file where the two token lists are saved/loaded
    :return: (itokens, otokens) as TokenList objects
    '''
    # If a cached word/char list already exists, load it instead of rebuilding.
    if dict_file is not None and os.path.exists(dict_file):
        print('loading', dict_file)
        lst = ljqpy.LoadList(dict_file)
        midpos = lst.index('<@@@>')  # separator between input and output vocabularies
        itokens = TokenList(lst[:midpos])
        otokens = TokenList(lst[midpos + 1:])
        return itokens, otokens
    # Otherwise, count token frequencies for the two sequence columns.
    data = ljqpy.LoadCSV(fn)
    wdicts = [{}, {}]
    for ss in data:
        for seq, wd in zip(ss, wdicts):
            for w in seq.split(delimiter):
                wd[w] = wd.get(w, 0) + 1
    # Keep only tokens whose frequency reaches min_freq.
    wlists = []
    for wd in wdicts:
        wd = ljqpy.FreqDict2List(wd)
        wlist = [x for x, y in wd if y >= min_freq]
        wlists.append(wlist)
    print('seq 1 words:', len(wlists[0]))
    print('seq 2 words:', len(wlists[1]))
    itokens = TokenList(wlists[0])
    otokens = TokenList(wlists[1])
    # Cache both vocabularies, separated by the '<@@@>' marker.
    if dict_file is not None:
        ljqpy.SaveList(wlists[0] + ['<@@@>'] + wlists[1], dict_file)
    return itokens, otokens
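# Example usage (a minimal sketch; the paths below are hypothetical placeholders,
# and the corpus is assumed to contain one (source, target) pair per row):
#   itokens, otokens = MakeS2SDict('data/en2de.txt', min_freq=5,
#                                  dict_file='data/en2de_word.txt')
# On a second call, the cached 'data/en2de_word.txt' is loaded instead of
# re-scanning the corpus.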
def MakeMerged():
    # Collect the 'text' field of every JSON line in all_data.txt
    # and save the result as one plain-text file.
    txts = []
    for xx in ljqpy.LoadList('training/all_data.txt'):
        xx = json.loads(xx)
        txts.append(xx['text'])
    ljqpy.SaveList(txts, 'training/merged_text.txt')
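# Note on the assumed input format (illustrative only): each line of
# training/all_data.txt is expected to be a JSON object with at least a 'text'
# field; any other fields, if present, are ignored. A hypothetical line would be
#   {"text": "An example sentence."}
# so json.loads(line)['text'] recovers the raw text that gets merged.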
def save_phrases(self):
    # Persist the newly found phrases to disk.
    ljqpy.SaveList(self.newtags, 'training/phrases.txt')