Ejemplo n.º 1
0
def S2SDataGenerator(fn, itokens, otokens, batch_size=64, delimiter=' ', max_len=999):
    """Endlessly yield padded (source, target) batches read from ``fn``.

    Rows come from ``ljqpy.LoadCSVg(fn)``. The first column of each row is
    split on ``delimiter`` and collected as a source sequence, the second
    column as a target sequence. As soon as ``batch_size`` source sequences
    have been collected, both lists are passed through ``pad_to_longest``
    (with ``itokens`` / ``otokens`` and ``max_len``) and the generator yields
    ``([X, Y], None)``.

    The file is re-read forever. Rows left over at the end of one pass are
    not dropped; they stay in the buffers and become part of the first batch
    of the next pass.
    """
    sources, targets = [], []
    while True:
        for row in ljqpy.LoadCSVg(fn):
            # zip() stops at the shorter side, so columns beyond the second
            # are ignored and a one-column row only feeds the source buffer.
            for field, bucket in zip(row, (sources, targets)):
                bucket.append(field.split(delimiter))
            if len(sources) < batch_size:
                continue
            X = pad_to_longest(sources, itokens, max_len)
            Y = pad_to_longest(targets, otokens, max_len)
            yield [X, Y], None
            # Start fresh buffers for the next batch.
            sources, targets = [], []
Ejemplo n.º 2
0
def LoadFile(fn):
    """Load per-key (start, end) integer vectors from ``fn``.

    Every row yielded by ``ljqpy.LoadCSVg(fn)`` must consist of a key followed
    by exactly two fields, each a comma-separated list of integers. When a key
    occurs more than once, the stored vectors are combined with the new ones
    by element-wise maximum (``zip`` truncates to the shorter vector).

    Returns a list of ``(key, (start_vector, end_vector))`` pairs in order of
    first appearance of each key.
    """
    merged = {}
    for row in ljqpy.LoadCSVg(fn):
        # Parse the two value columns first; unpacking enforces that there
        # are exactly two of them.
        st, ed = ([int(num) for num in field.split(',')] for field in row[1:])
        key = row[0]
        if key in merged:
            prev_st, prev_ed = merged[key]
            st = [max(pair) for pair in zip(prev_st, st)]
            ed = [max(pair) for pair in zip(prev_ed, ed)]
        merged[key] = (st, ed)
    return list(merged.items())
    def __init__(self, dataFile, needtest=False, onejson=None):
        """Build the dataset from a file, from a cached .h5, or from one JSON object.

        * ``dataFile is None``: parse the single object ``onejson`` with
          ``ParseJson`` and convert the buffers with ``ConvertNPArr``.
        * otherwise: use ``gen_data/<basename of dataFile>.h5`` as a cache.
          If it exists it is loaded via ``Load``; if not, every row of
          ``dataFile`` is joined, stripped, lower-cased, decoded as JSON and
          fed to ``ParseJson``, then the result is converted and saved.

        ``needtest`` is accepted but not used in this method.
        """
        # Every buffer starts as its own empty list, filled later by
        # ParseJson / Load (both defined elsewhere in the class).
        for buffer_name in (
            'xQuestion', 'xContext',
            'xQuestionC', 'xQuestionA',
            'xContextC', 'xContextA',
            'y_start', 'y_end',
            'startEnd', 'contextRaw',
            'realAnswer', 'questionRaw', 'questionId',
        ):
            setattr(self, buffer_name, [])

        if dataFile is not None:
            if not os.path.isdir('gen_data'):
                os.mkdir('gen_data')
            self.h5name = 'gen_data/' + os.path.split(dataFile)[-1] + '.h5'
            if os.path.exists(self.h5name):
                self.Load()
            else:
                print('MAKE H5')
                bad = 0
                for ii, fields in enumerate(ljqpy.LoadCSVg(dataFile), 1):
                    # Re-join the columns into the original one-line JSON text.
                    line = ''.join(fields)
                    if ii % 500 == 0:
                        print(ii)
                    thisJson = json.loads(line.strip().lower())
                    # ParseJson's return value is accumulated as the number
                    # of bad samples.
                    bad += self.ParseJson(thisJson)
                print('bad training samples:', bad)
                self.ConvertNPArr()
                self.Save()
        else:
            self.ParseJson(onejson)
            self.ConvertNPArr()

        # xQuestion is expected to expose ``.shape`` by now (after
        # ConvertNPArr or Load).
        self.numSamples = self.xQuestion.shape[0]
        self.numQuestions = len(self.questionId)
def _AddTokenCounts(tokens, freqw, freqc, word_weight, char_weight):
    """Add weighted character and word counts for ``tokens`` to the dicts in place.

    Characters are counted on the raw token; the word is counted after it has
    been normalised with ``ChangeToken``.
    """
    for t in tokens:
        for c in t:
            freqc[c] = freqc.get(c, 0) + char_weight
        t = ChangeToken(t)
        freqw[t] = freqw.get(t, 0) + word_weight


def MakeVocab():
    """Populate the global word and character vocabularies.

    Sets the module globals ``id2w`` / ``w2id`` (words) and ``id2c`` / ``c2id``
    (characters). Both id lists start with ``'<PAD>'`` and ``'<UNK>'`` and are
    followed by the first ``vocab_size`` / ``char_size`` entries of the
    corresponding frequency list.

    The frequency lists are read from ``data/wordlist.txt`` and
    ``data/charlist.txt`` when BOTH files exist. Otherwise they are rebuilt
    from ``trainFile`` and written back to those two paths. (Checking only
    the word list, as before, made a missing character list a load error
    instead of a rebuild.)

    Counting weights when rebuilding:
      * question tokens: each character +10, each word + number of passages;
      * passage tokens: each character +1, each word +1.
    """
    global id2w, w2id, id2c, c2id
    vocabFile = 'data/wordlist.txt'
    charFile = 'data/charlist.txt'
    if os.path.exists(vocabFile) and os.path.exists(charFile):
        freqw = ljqpy.LoadCSV(vocabFile)
        freqc = ljqpy.LoadCSV(charFile)
    else:
        freqw = {}
        freqc = {}
        for line in ljqpy.LoadCSVg(trainFile):
            # Re-join the columns into the original one-line JSON text.
            line = ''.join(line)
            thisJson = json.loads(line.strip().lower())
            passages = thisJson["passages"]

            question = thisJson["query"]
            question = re.sub(r'\s+', ' ', question.strip())
            _AddTokenCounts(CutSentence(question), freqw, freqc,
                            word_weight=len(passages), char_weight=10)

            for passage in passages:
                context = passage["passage_text"]
                context = FullToHalf(context)
                context = re.sub(r'\s+', ' ', context.strip())
                _AddTokenCounts(CutSentence(context), freqw, freqc,
                                word_weight=1, char_weight=1)
        freqw = ljqpy.FreqDict2List(freqw)
        ljqpy.SaveCSV(freqw, vocabFile)
        freqc = ljqpy.FreqDict2List(freqc)
        ljqpy.SaveCSV(freqc, charFile)
    # Index 0 is padding and index 1 is the unknown token in both tables.
    id2w = ['<PAD>', '<UNK>'] + [x[0] for x in freqw[:vocab_size]]
    w2id = {y: x for x, y in enumerate(id2w)}
    id2c = ['<PAD>', '<UNK>'] + [x[0] for x in freqc[:char_size]]
    c2id = {y: x for x, y in enumerate(id2c)}
Ejemplo n.º 5
0
def MakeS2SData(fn=None, itokens=None, otokens=None, delimiter=' ', h5_file=None, max_len=200):
    """Return the padded training arrays ``(X, Y)``.

    If ``h5_file`` is given and exists, datasets ``'X'`` and ``'Y'`` are read
    from it and returned directly. Otherwise the rows of
    ``ljqpy.LoadCSVg(fn)`` are read: the first column of each row is split on
    ``delimiter`` into a source sequence and the second column into a target
    sequence, both lists are padded with ``pad_to_longest`` (using
    ``itokens`` / ``otokens`` and ``max_len``), and, when ``h5_file`` is
    given, the result is written to it for later runs.
    """
    # Reuse the cached dataset when it is already on disk.
    if h5_file is not None and os.path.exists(h5_file):
        print('loading', h5_file)
        # Open read-only explicitly: the default mode differs between h5py
        # versions, and this branch must never modify the cache.
        with h5py.File(h5_file, 'r') as dfile:
            X, Y = dfile['X'][:], dfile['Y'][:]
        return X, Y
    # No cache available: build the arrays from the raw file.
    data = ljqpy.LoadCSVg(fn)
    Xs = [[], []]
    for ss in data:
        # zip() pairs column 0 with the source list and column 1 with the
        # target list; any further columns are ignored.
        for seq, xs in zip(ss, Xs):
            xs.append(list(seq.split(delimiter)))
    X, Y = pad_to_longest(Xs[0], itokens, max_len), pad_to_longest(Xs[1], otokens, max_len)
    if h5_file is not None:
        with h5py.File(h5_file, 'w') as dfile:
            dfile.create_dataset('X', data=X)
            dfile.create_dataset('Y', data=Y)
    return X, Y
Ejemplo n.º 6
0
    def gen_new_tags(self, corpusfn, numlim=1000):
        """Mine two-word phrase candidates from a corpus and store them as new tags.

        The first column of every row from ``ljqpy.LoadCSVg(corpusfn)`` is
        treated as one line of text. Lines shorter than 10 characters, or
        without a run of at least two Latin/CJK characters, are skipped. Each
        remaining line is tokenised with ``jieba.lcut`` and wrapped in ``'^'``
        / ``'$'`` boundary markers; unigram, bigram and trigram counts are
        collected together with the sets of left and right neighbours of
        every bigram.

        Each bigram ``k`` is then scored as
        ``(pmi - min(Hlr, Hrl) + min(Hl, Hr)) * ng2[k]``, where ``pmi`` is
        ``log p(k) - log p(k[0]) - log p(k[1])`` and the ``H*`` values are
        sums of ``-p * log(p)`` terms over the bigram's neighbours. Bigrams
        are dropped when both words occur at most 3 times, when ``pmi < 2``,
        or when ``ljqpy.IsChsStr`` rejects the concatenated words.

        The first ``numlim`` entries of ``ljqpy.FreqDict2List(scores)`` are
        joined into strings, stored in ``self.newtags`` (and indexed in
        ``self.newtagtrie``) and returned. An empty or fully filtered corpus
        yields an empty list instead of a math domain error.

        Side effects: the count and score tables are published as module
        globals (``ng1``, ``ng2``, ``ng3``, ``pg1``, ``pg2``, ``pg3``,
        ``pdict``, ``ndict``, ``scores``), and progress is printed to stdout.
        """
        global ng1, ng2, ng3, pg1, pg2, pg3, pdict, ndict, scores

        def _HH(p):
            # One entropy term; defined as 0 when p is not positive.
            return -p * math.log(p) if p > 0 else 0

        def _HY(g3, g2):
            # Entropy term of a trigram's count relative to a bigram's count.
            return _HH(ng3[g3] / ng2[g2])

        def _log_total(counts):
            # log of the total count; 0.0 for an empty table so that an empty
            # corpus does not raise (the table has no entries to normalise).
            total = sum(counts.values())
            return math.log(total) if total > 0 else 0.0

        # Compiled once instead of being looked up for every corpus line.
        word_run = re.compile('[a-zA-Z\u4e00-\u9fa5]{2,}')

        ng1 = defaultdict(int)
        ng2 = defaultdict(int)
        ng3 = defaultdict(int)
        pdict, ndict = {}, {}
        for ii, lines in enumerate(ljqpy.LoadCSVg(corpusfn)):
            line = lines[0]
            if ii % 100000 == 0: print('counting', ii)
            # Also covers the empty line.
            if len(line) < 10: continue
            if word_run.search(line) is None: continue
            lln = ['^'] + jieba.lcut(line) + ['$']
            for i, wd in enumerate(lln):
                ng1[wd] += 1
                if i > 0: ng2[tuple(lln[i - 1:i + 1])] += 1
                if i > 1:
                    ng3[tuple(lln[i - 2:i + 1])] += 1
                    # pdict: bigram -> words seen immediately before it.
                    pdict.setdefault(tuple(lln[i - 1:i + 1]),
                                     set()).add(lln[i - 2])
                    # ndict: bigram -> words seen immediately after it.
                    ndict.setdefault(tuple(lln[i - 2:i]), set()).add(lln[i])
        log_all_ng1 = _log_total(ng1)
        log_all_ng2 = _log_total(ng2)
        log_all_ng3 = _log_total(ng3)
        # Log relative frequencies of each n-gram.
        pg1 = {k: math.log(v) - log_all_ng1 for k, v in ng1.items()}
        pg2 = {k: math.log(v) - log_all_ng2 for k, v in ng2.items()}
        pg3 = {k: math.log(v) - log_all_ng3 for k, v in ng3.items()}
        print('COUNT ok')

        # base_wp = {x:float(y) for x,y in ljqpy.LoadCSV('resources/base_wcounts.txt')}
        # pg1 = {k:(log_sum_exp([base_wp[k],v])-math.log(2) if k in base_wp else v) for k,v in pg1.items()}

        scores = {}
        for ii, (k, v) in enumerate(ljqpy.FreqDict2List(pg2), 1):
            if ii % 10000 == 0: print('%d/%d' % (ii, len(pg2)))
            # Skip bigrams whose two words are both rare.
            if max(ng1[k[0]], ng1[k[1]]) <= 3: continue
            pmi = v - pg1[k[0]] - pg1[k[1]]
            if pmi < 2: continue
            Hl, Hr = 0, 0
            Hlr, Hrl = 0, 0
            for ll in pdict.get(k, []):
                Hl += _HY((ll, k[0], k[1]), k)
                Hlr += _HY((ll, k[0], k[1]), (ll, k[0]))
            for rr in ndict.get(k, []):
                Hr += _HY((k[0], k[1], rr), k)
                Hrl += _HY((k[0], k[1], rr), (k[1], rr))
            score = pmi - min(Hlr, Hrl) + min(Hl, Hr)
            if not ljqpy.IsChsStr(k[0] + k[1]): continue
            # Weight the score by how often the bigram occurs.
            scores[k] = score * ng2[k]

        phrases = []
        for k, v in ljqpy.FreqDict2List(scores)[:numlim]:
            print(k, v)
            phrases.append(''.join(k))
        self.newtags = phrases
        self.newtagtrie = Trie({x: 1 for x in self.newtags})
        return phrases