def makeType(train, valid):
    """Build answer-type label ids from the train/valid CSV files.

    Populates module globals typelen, type2num, num2type, validl and returns
    (train_label_ids, valid_label_ids) as numpy arrays.
    """
    global typelen
    global type2num, num2type
    global validl
    trainl = ljqpy.LoadCSV(train)
    validl = ljqpy.LoadCSV(valid)
    # Union of answer types (column 2) seen in either split.
    answertypelist = list({row[2] for row in trainl} | {row[2] for row in validl})
    typelen = len(answertypelist)
    type2num = {label: idx for idx, label in enumerate(answertypelist)}
    num2type = {idx: label for label, idx in type2num.items()}
    train_ids = [type2num[row[2]] for row in trainl]
    valid_ids = [type2num[row[2]] for row in validl]
    return np.array(train_ids), np.array(valid_ids)
def MakeVocab():
    """Load cached word/char frequency lists and build id<->token tables.

    Reads the module-level paths `vocabFile` / `charFile`; if they are
    missing, warns and falls back to an empty vocabulary (only <PAD>/<UNK>).
    Populates module globals id2w, w2id, id2c, c2id. Ids 0 and 1 are
    reserved for padding and unknown tokens; lists are truncated to the
    configured `vocab_size` / `char_size`.
    """
    global id2w, w2id, id2c, c2id
    freqw = []
    freqc = []
    if os.path.exists(vocabFile):
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        # Fix: original message read "is not find"; behavior (warn and
        # continue with an empty vocabulary) is unchanged.
        print('wordlist or charlist not found')
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {w: i for i, w in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {c: i for i, c in enumerate(id2c)}
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None):
    """Build (or load from cache) the input/output TokenLists for a parallel corpus.

    If `dict_file` exists it is loaded directly; the marker '<@@@>' separates
    the input vocabulary from the output vocabulary inside it. Otherwise token
    frequencies are counted from `fn` (source text in column 0, target in
    column 1), tokens below `min_freq` are dropped, and the result is cached.
    """
    if dict_file is not None and os.path.exists(dict_file):
        print('loading', dict_file)
        cached = ljqpy.LoadList(dict_file)
        sep = cached.index('<@@@>')
        return TokenList(cached[:sep]), TokenList(cached[sep + 1:])
    # Count token frequencies per column (source vs. target).
    counters = [{}, {}]
    for row in ljqpy.LoadCSV(fn):
        for text, counter in zip(row, counters):
            for token in text.split(delimiter):
                counter[token] = counter.get(token, 0) + 1
    wlists = []
    for counter in counters:
        ranked = ljqpy.FreqDict2List(counter)
        wlists.append([tok for tok, freq in ranked if freq >= min_freq])
    print('seq 1 words:', len(wlists[0]))
    print('seq 2 words:', len(wlists[1]))
    itokens = TokenList(wlists[0])
    otokens = TokenList(wlists[1])
    if dict_file is not None:
        ljqpy.SaveList(wlists[0] + ['<@@@>'] + wlists[1], dict_file)
    return itokens, otokens
def Load(self):
    """Load preprocessed tensors from the HDF5 file plus aligned text sidecars.

    Fills the instance with the model input/output arrays and, from
    `<h5name>.txt`, the question ids, raw questions and reference answers.
    `<h5name>.c.txt` (raw contexts) is loaded only if present.
    """
    # Open read-only explicitly: older h5py versions default to mode 'a'
    # (read/write, create-if-missing), which could create or modify the
    # file on what should be a pure read.
    with h5py.File(self.h5name, 'r') as dfile:
        for key in ('xQuestion', 'xContext', 'xQuestionC', 'xQuestionA',
                    'xContextC', 'xContextA', 'y_start', 'y_end', 'startEnd'):
            # [:] copies the dataset into memory so it outlives the file handle.
            setattr(self, key, dfile[key][:])
    data = ljqpy.LoadCSV(self.h5name + '.txt')
    self.questionId = [x[0] for x in data]
    self.questionRaw = [x[1] for x in data]
    self.realAnswer = [x[2] for x in data]
    if os.path.exists(self.h5name + '.c.txt'):
        self.contextRaw = ljqpy.LoadCSV(self.h5name + '.c.txt')
def MakeOwnDatas(train):
    """Tokenize the questions in `train` (CSV, question in column 0) and
    return (word_id_matrix, char_id_matrix) as numpy arrays padded to maxQLen."""
    word_rows = []
    char_rows = []
    for record in ljqpy.LoadCSV(train):
        tokens = CutSentence(record[0])
        word_rows.append(Tokens2Intlist(tokens, maxQLen))
        char_rows.append(Chars2Intlist(tokens, maxQLen))
    return np.array(word_rows), np.array(char_rows)
def MakeVocab():
    """Build or load the word/char frequency vocabularies and populate the
    module-level lookup tables id2w, w2id, id2c, c2id.

    On first run the whole training file is scanned, frequency lists are
    saved to data/wordlist.txt and data/charlist.txt; later runs load those
    cached files instead.
    """
    global id2w, w2id, id2c, c2id
    vocabFile = 'data/wordlist.txt'
    charFile = 'data/charlist.txt'
    if os.path.exists(vocabFile):
        # Cached frequency lists exist: load them instead of rescanning.
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        # First run: count word and char frequencies over the training corpus.
        freqw = {}
        freqc = {}
        for line in ljqpy.LoadCSVg(trainFile):
            # Each record is one JSON object; lower-cased before parsing.
            line = ''.join(line)
            thisJson = json.loads(line.strip().lower())
            question = thisJson["query"]
            # Collapse runs of whitespace to single spaces.
            question = re.sub(r'\s+', ' ', question.strip())
            questionTokens = CutSentence(question)
            for t in questionTokens:
                for c in t:
                    # Question characters are weighted 10x relative to passage
                    # characters — presumably to boost query vocabulary;
                    # TODO(review): confirm this weighting is intentional.
                    freqc[c] = freqc.get(c, 0) + 10
                t = ChangeToken(t)
                # Question words are weighted by the number of passages
                # attached to the question.
                freqw[t] = freqw.get(t, 0) + len(thisJson["passages"])
            for passage in thisJson["passages"]:
                context = passage["passage_text"]
                # Normalize full-width characters, then collapse whitespace.
                context = FullToHalf(context)
                context = re.sub(r'\s+', ' ', context.strip())
                contextTokens = CutSentence(context)
                for t in contextTokens:
                    for c in t:
                        freqc[c] = freqc.get(c, 0) + 1
                    t = ChangeToken(t)
                    freqw[t] = freqw.get(t, 0) + 1
        # Persist the sorted (token, count) lists for subsequent runs.
        freqw = ljqpy.FreqDict2List(freqw)
        ljqpy.SaveCSV(freqw, vocabFile)
        freqc = ljqpy.FreqDict2List(freqc)
        ljqpy.SaveCSV(freqc, charFile)
    # Ids 0/1 are reserved for padding and unknown tokens; lists are
    # truncated to the configured vocab_size / char_size.
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {y: x for x, y in enumerate(id2c)}
def ReadQuestionAnswers():
    """Populate the global qidAnswers map: question id -> set of accepted answers."""
    global qidAnswers
    qidAnswers = {}
    sources = [
        './train_data/qid_answer_expand',
        './train_data/qid_answer_expand.valid'
    ]
    for path in sources:
        for row in ljqpy.LoadCSV(path):
            # A valid row has exactly (qid, question, answers); skip the rest.
            if len(row) != 3:
                continue
            # Multiple acceptable answers are '|'-separated in column 2.
            qidAnswers[row[0]] = set(row[2].split('|'))
def MakeS2SDict(fn=None, min_freq=5, delimiter=' ', dict_file=None): ''' 构建input和output sequence的 word或char list :param fn: :param min_freq: :param delimiter: :param dict_file: :return: ''' # 如果有word/char list则不需要重新构建 if dict_file is not None and os.path.exists(dict_file): print('loading', dict_file) lst = ljqpy.LoadList(dict_file) midpos = lst.index('<@@@>') itokens = TokenList(lst[:midpos]) otokens = TokenList(lst[midpos+1:]) return itokens, otokens # 如果没有则重新构建 data = ljqpy.LoadCSV(fn) wdicts = [{}, {}] for ss in data: for seq, wd in zip(ss, wdicts): for w in seq.split(delimiter): wd[w] = wd.get(w, 0) + 1 # nice code wlists = [] for wd in wdicts: wd = ljqpy.FreqDict2List(wd) wlist = [x for x,y in wd if y >= min_freq] wlists.append(wlist) print('seq 1 words:', len(wlists[0])) print('seq 2 words:', len(wlists[1])) itokens = TokenList(wlists[0]) otokens = TokenList(wlists[1]) if dict_file is not None: ljqpy.SaveList(wlists[0]+['<@@@>']+wlists[1], dict_file) return itokens, otokens
print(x, y) print( s2s.decode_sequence_readout('A black dog eats food .'.split(), delimiter=' ')) print( s2s.decode_sequence_fast('A black dog eats food .'.split(), delimiter=' ')) while True: quest = input('> ') print(s2s.decode_sequence_fast(quest.split(), delimiter=' ')) rets = s2s.beam_search(quest.split(), delimiter=' ') for x, y in rets: print(x, y) elif 'test' in sys.argv: import ljqpy valids = ljqpy.LoadCSV('data/en2de.s2s.valid.txt') en = [x[0].split() for x in valids[:100]] rets = s2s.decode_sequence_readout(en, delimiter=' ') for x in rets[:5]: print(x) rets = s2s.beam_search(en, delimiter=' ', verbose=1) for i, x in enumerate(rets[:5]): print('-' * 20) print(valids[i][1]) for y in x: print(y) rets = s2s.decode_sequence_fast(en, delimiter=' ', verbose=1) for x in rets[:5]: print(x)