# Shared dependencies for the snippets below. `ljqpy` is the author's utility
# module (CSV loading, frequency-dict helpers); `jieba` is the Chinese tokenizer
# used in gen_new_tags. The `Trie` class used at the end of gen_new_tags comes
# from the surrounding project and is not imported here.
import os
import re
import json
import math
from collections import defaultdict

import h5py
import jieba

import ljqpy


def S2SDataGenerator(fn, itokens, otokens, batch_size=64, delimiter=' ', max_len=999):
    # Infinite generator over a two-column parallel corpus: accumulates source
    # and target token sequences until a full batch is ready, then yields it padded.
    Xs = [[], []]
    while True:
        for ss in ljqpy.LoadCSVg(fn):
            for seq, xs in zip(ss, Xs):
                xs.append(list(seq.split(delimiter)))
            if len(Xs[0]) >= batch_size:
                X = pad_to_longest(Xs[0], itokens, max_len)
                Y = pad_to_longest(Xs[1], otokens, max_len)
                # Targets are None: the model computes its loss internally.
                yield [X, Y], None
                Xs = [[], []]
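
# `pad_to_longest` is referenced above but not defined in this excerpt. A minimal
# sketch of what it plausibly does, assuming each token list exposes an `id(token)`
# lookup (unknowns mapped to <UNK>) and that 0 is the <PAD> id (both assumptions):
import numpy as np

def pad_to_longest_sketch(seqs, tokens, max_len=999):
    longest = min(max(len(s) for s in seqs), max_len)
    X = np.zeros((len(seqs), longest), dtype='int32')  # 0 == <PAD>
    for i, s in enumerate(seqs):
        for j, w in enumerate(s[:longest]):
            X[i, j] = tokens.id(w)
    return X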

def LoadFile(fn):
    # Load (id, start-flags, end-flags) rows; rows sharing an id are merged by
    # taking the element-wise maximum of their start and end vectors.
    items = {}
    for x in ljqpy.LoadCSVg(fn):
        st, ed = map(lambda z: [int(y) for y in z.split(',')], x[1:])
        if x[0] in items:
            st = [max(u, v) for u, v in zip(items[x[0]][0], st)]
            ed = [max(u, v) for u, v in zip(items[x[0]][1], ed)]
        items[x[0]] = (st, ed)
    return list(items.items())
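
# Worked example of the merge above (toy rows, not from the dataset): two rows
# sharing id 'q1' keep, per position, the maximum of their start flags and of
# their end flags, so both candidate spans survive in one record.
def _merge_demo():
    rows = [['q1', '0,1,0', '0,0,1'], ['q1', '1,0,0', '0,1,0']]
    items = {}
    for x in rows:
        st, ed = map(lambda z: [int(y) for y in z.split(',')], x[1:])
        if x[0] in items:
            st = [max(u, v) for u, v in zip(items[x[0]][0], st)]
            ed = [max(u, v) for u, v in zip(items[x[0]][1], ed)]
        items[x[0]] = (st, ed)
    assert items['q1'] == ([1, 1, 0], [0, 1, 1])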

def __init__(self, dataFile, needtest=False, onejson=None):
    # Feature containers: token/char/extra features for question and context,
    # answer-span targets, and raw text kept for evaluation.
    self.xQuestion = []
    self.xContext = []
    self.xQuestionC = []
    self.xQuestionA = []
    self.xContextC = []
    self.xContextA = []
    self.y_start = []
    self.y_end = []
    self.startEnd = []
    self.contextRaw = []
    self.realAnswer = []
    self.questionRaw = []
    self.questionId = []
    if dataFile is None:
        # Single-sample mode: parse one already-loaded JSON object.
        self.ParseJson(onejson)
        self.ConvertNPArr()
    else:
        if not os.path.isdir('gen_data'):
            os.mkdir('gen_data')
        self.h5name = 'gen_data/' + os.path.split(dataFile)[-1] + '.h5'
        if os.path.exists(self.h5name):
            # Cached features exist: skip parsing entirely.
            self.Load()
        else:
            print('MAKE H5')
            bad = 0
            ii = 0
            for line in ljqpy.LoadCSVg(dataFile):
                line = ''.join(line)
                ii += 1
                if ii % 500 == 0:
                    print(ii)
                thisJson = json.loads(line.strip().lower())
                # ParseJson is expected to return the number of discarded samples.
                bad += self.ParseJson(thisJson)
            print('bad training samples:', bad)
            self.ConvertNPArr()
            self.Save()
    self.numSamples = self.xQuestion.shape[0]
    self.numQuestions = len(self.questionId)
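
# `Save`/`Load` are not shown in this excerpt. A minimal sketch of the caching
# pair, assuming the cached fields are plain numpy arrays (the field names below
# are a hypothetical subset, mirroring self.h5name from __init__ above):
def Save_sketch(self):
    with h5py.File(self.h5name, 'w') as f:
        f.create_dataset('xQuestion', data=self.xQuestion)
        f.create_dataset('y_start', data=self.y_start)

def Load_sketch(self):
    with h5py.File(self.h5name, 'r') as f:
        self.xQuestion = f['xQuestion'][:]
        self.y_start = f['y_start'][:]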

def MakeVocab():
    # Build word and character vocabularies from the training file, caching the
    # frequency lists as CSVs. Question characters are weighted 10x and question
    # words are weighted by their passage count, biasing the vocabulary toward
    # query terms. vocab_size and char_size are module-level settings.
    global id2w, w2id, id2c, c2id
    vocabFile = 'data/wordlist.txt'
    charFile = 'data/charlist.txt'
    if os.path.exists(vocabFile):
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        freqw = {}
        freqc = {}
        for line in ljqpy.LoadCSVg(trainFile):
            line = ''.join(line)
            thisJson = json.loads(line.strip().lower())
            question = thisJson["query"]
            question = re.sub(r'\s+', ' ', question.strip())
            questionTokens = CutSentence(question)
            for t in questionTokens:
                for c in t:
                    freqc[c] = freqc.get(c, 0) + 10
                t = ChangeToken(t)
                freqw[t] = freqw.get(t, 0) + len(thisJson["passages"])
            for passage in thisJson["passages"]:
                context = passage["passage_text"]
                context = FullToHalf(context)
                context = re.sub(r'\s+', ' ', context.strip())
                contextTokens = CutSentence(context)
                for t in contextTokens:
                    for c in t:
                        freqc[c] = freqc.get(c, 0) + 1
                    t = ChangeToken(t)
                    freqw[t] = freqw.get(t, 0) + 1
        freqw = ljqpy.FreqDict2List(freqw)
        ljqpy.SaveCSV(freqw, vocabFile)
        freqc = ljqpy.FreqDict2List(freqc)
        ljqpy.SaveCSV(freqc, charFile)
    # Reserve ids 0/1 for padding and unknown tokens.
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {y: x for x, y in enumerate(id2c)}
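
# Self-contained demo of the id-map construction above, with a stand-in for
# ljqpy.FreqDict2List (which, judging by its use here, sorts a frequency dict
# into (item, count) pairs by descending count):
def _vocab_demo():
    freq = {'the': 5, 'cat': 2, 'sat': 1}
    freq_list = sorted(freq.items(), key=lambda kv: -kv[1])
    id2w_demo = ['<PAD>', '<UNK>'] + [w for w, _ in freq_list[:2]]
    w2id_demo = {w: i for i, w in enumerate(id2w_demo)}
    assert w2id_demo == {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'cat': 3}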

def MakeS2SData(fn=None, itokens=None, otokens=None, delimiter=' ', h5_file=None, max_len=200):
    '''Build the training data, using an .h5 cache when available.'''
    # If a cached .h5 dataset exists, load it directly for training.
    if h5_file is not None and os.path.exists(h5_file):
        print('loading', h5_file)
        with h5py.File(h5_file, 'r') as dfile:
            X, Y = dfile['X'][:], dfile['Y'][:]
        return X, Y
    # Otherwise build it from the raw parallel-text file.
    data = ljqpy.LoadCSVg(fn)
    Xs = [[], []]
    for ss in data:
        for seq, xs in zip(ss, Xs):
            xs.append(list(seq.split(delimiter)))
    X = pad_to_longest(Xs[0], itokens, max_len)
    Y = pad_to_longest(Xs[1], otokens, max_len)
    if h5_file is not None:
        with h5py.File(h5_file, 'w') as dfile:
            dfile.create_dataset('X', data=X)
            dfile.create_dataset('Y', data=Y)
    return X, Y
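
# Round-trip demo of the h5 cache used above (toy arrays and a temp file, so it
# is safe to run anywhere):
def _h5_cache_demo():
    import tempfile
    import numpy as np
    path = os.path.join(tempfile.mkdtemp(), 'toy.h5')
    X = np.arange(6, dtype='int32').reshape(2, 3)
    with h5py.File(path, 'w') as f:
        f.create_dataset('X', data=X)
    with h5py.File(path, 'r') as f:
        assert (f['X'][:] == X).all()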

def gen_new_tags(self, corpusfn, numlim=1000):
    # New-word discovery: score adjacent word pairs by pointwise mutual
    # information plus boundary (left/right neighbor) entropy, and keep the
    # top `numlim` pairs as new tags.
    global ng1, ng2, ng3, pg1, pg2, pg3, pdict, ndict, scores

    def _HH(p):
        # One term of an entropy sum: -p * log(p), with 0 at p == 0.
        return -p * math.log(p) if p > 0 else 0

    def _HY(g3, g2):
        # Contribution of trigram g3 to the conditional entropy given bigram g2.
        return _HH(ng3[g3] / ng2[g2])

    # Unigram/bigram/trigram counts, and the neighbor sets of each bigram.
    ng1 = defaultdict(int)
    ng2 = defaultdict(int)
    ng3 = defaultdict(int)
    pdict, ndict = {}, {}
    cnum = 0
    for ii, lines in enumerate(ljqpy.LoadCSVg(corpusfn)):
        line = lines[0]
        if ii % 100000 == 0:
            print('counting', ii)
        if line == '':
            continue
        if len(line) < 10:
            continue
        # Skip lines without at least two consecutive letters/CJK characters.
        if re.search('[a-zA-Z\u4e00-\u9fa5]{2,}', line) is None:
            continue
        lln = ['^'] + jieba.lcut(line) + ['$']  # sentence boundary markers
        for i, wd in enumerate(lln):
            ng1[wd] += 1
            if i > 0:
                ng2[tuple(lln[i - 1:i + 1])] += 1
            if i > 1:
                ng3[tuple(lln[i - 2:i + 1])] += 1
                # Left neighbors of each bigram; right neighbors of each bigram.
                pdict.setdefault(tuple(lln[i - 1:i + 1]), set()).add(lln[i - 2])
                ndict.setdefault(tuple(lln[i - 2:i]), set()).add(lln[i])
        cnum += len(lln)
    # Convert counts to log-probabilities.
    log_all_ng1 = math.log(sum(ng1.values()))
    log_all_ng2 = math.log(sum(ng2.values()))
    log_all_ng3 = math.log(sum(ng3.values()))
    pg1 = {k: math.log(v) - log_all_ng1 for k, v in ng1.items()}
    pg2 = {k: math.log(v) - log_all_ng2 for k, v in ng2.items()}
    pg3 = {k: math.log(v) - log_all_ng3 for k, v in ng3.items()}
    print('COUNT ok')
    # base_wp = {x: float(y) for x, y in ljqpy.LoadCSV('resources/base_wcounts.txt')}
    # pg1 = {k: (log_sum_exp([base_wp[k], v]) - math.log(2) if k in base_wp else v) for k, v in pg1.items()}
    scores = {}
    ii = 0
    for k, v in ljqpy.FreqDict2List(pg2):
        ii += 1
        if ii % 10000 == 0:
            print('%d/%d' % (ii, len(pg2)))
        # Skip pairs whose parts are both rare.
        if max(ng1[k[0]], ng1[k[1]]) <= 3:
            continue
        # Pointwise mutual information in log space.
        pmi = v - pg1[k[0]] - pg1[k[1]]
        if pmi < 2:
            continue
        # Hl/Hr: entropy of the pair's outside neighbors (high is word-like);
        # Hlr/Hrl: how unpredictable the pair is from a one-word context
        # (high means the pair is not cohesive).
        Hl, Hr = 0, 0
        Hlr, Hrl = 0, 0
        for ll in pdict.get(k, []):
            Hl += _HY((ll, k[0], k[1]), k)
            Hlr += _HY((ll, k[0], k[1]), (ll, k[0]))
        for rr in ndict.get(k, []):
            Hr += _HY((k[0], k[1], rr), k)
            Hrl += _HY((k[0], k[1], rr), (k[1], rr))
        score = pmi - min(Hlr, Hrl) + min(Hl, Hr)
        # Keep only pure-Chinese pairs, weighted by raw bigram frequency.
        if not ljqpy.IsChsStr(k[0] + k[1]):
            continue
        scores[k] = score * ng2[k]
    phrases = []
    for k, v in ljqpy.FreqDict2List(scores)[:numlim]:
        print(k, v)
        phrases.append(''.join(k))
    self.newtags = phrases
    self.newtagtrie = Trie({x: 1 for x in self.newtags})
    return phrases
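
# Toy illustration of the log-space PMI filter above: with unigram and bigram
# probabilities, pmi = log p(w1 w2) - log p(w1) - log p(w2), and a pair passes
# only when pmi >= 2, i.e. the bigram is at least e^2 (about 7.4) times more
# frequent than independence would predict. The numbers below are made up.
def _pmi_demo():
    p_w1, p_w2, p_bigram = 0.01, 0.02, 0.005
    pmi = math.log(p_bigram) - math.log(p_w1) - math.log(p_w2)
    assert pmi > 2  # log(0.005 / 0.0002) = log(25), about 3.22, so the pair survives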