def train(self, beginEpoch, endEpoch, batchSize, showSeg):
    """Train the language model from epoch `beginEpoch` (inclusive) to
    `endEpoch` (exclusive).

    Args:
        beginEpoch: epoch to start from; 0 means a fresh run (build the
            dataset and LM), otherwise resume from a saved checkpoint.
        endEpoch: epoch index to stop before.
        batchSize: number of lines per training batch.
        showSeg: passed through to batchProcess (presumably toggles
            printing of segmentations — verify against batchProcess).
    """
    if beginEpoch == 0:
        # Fresh start: construct the dataset and the language model.
        self.setupDS()
        self.setupLM()
    else:
        # Resuming: restore previously saved state.
        self.load()
    # The optimizer is (re)created on every call and attached to the LM.
    self.opt = optimizers.Adam()
    self.opt.setup(self.lm)
    for ep in range(beginEpoch, endEpoch):
        # Visit training lines in a fresh random order each epoch.
        indices = np.random.permutation(len(self.ds.idData))
        batches = module.pack(indices, batchSize)
        for i, batch in enumerate(batches):
            startTime = time()
            self.batchProcess(batch, showSeg)
            print('epoch:%d\tbatch:%d/%d\ttime:%f' %
                  (ep, i + 1, len(batches), time() - startTime))
        # NOTE(review): the original file's indentation is lost; this save
        # is placed at end-of-epoch (checkpoint each epoch) — confirm it
        # was not intended to run only once after all epochs.
        self.save()
def segmentate(self, textPath, batchSize): self.load() # set text data self.ds.setIdData(textPath) batches = module.pack(np.arange(len(self.ds.idData)), batchSize) for batch in batches: inVoc = self.ds.getInVoc() idLines = [self.ds.idData[b][1:-1] for b in batch] sampler.setNgram(idLines, self.lm, self.ds, inVoc) for b in batch: idLine = self.ds.idData[b] self.ds.segData[b] = sampler.track(idLine[1:-1], self.lm, self.ds) yield ' '.join(self.ds.getSegedLine(b))
def evaluate(self, epoch):
    """Evaluate the downstream classifier on the validation and test splits.

    Runs batched inference, prints micro-F1 and a classification report for
    each split, and logs "valid_f1<TAB>test_f1" to the eval logger.

    Args:
        epoch: current epoch index (currently unused; kept for caller
            compatibility).

    Returns:
        list[float]: [validation micro-F1, test micro-F1].
    """
    if self.gpuid >= 0:
        cuda.get_device(self.gpuid).use()
        self.model.to_gpu(self.gpuid)
    scores = []
    for ty in [1, 2]:  # ty=1,2 means valid, test respectively
        indices = list(range(len(self.ds.labels[ty])))
        batches = module.pack(indices, config.batchSize)
        preds = []
        print('eval...')
        startTime = time()
        for batch in tqdm(batches):
            lines = [self.ds.data[ty][b] for b in batch]
            # Deterministic id conversion: no sampling, no unk replacement.
            ids = [
                self.ds.getIdLine(indice=None,
                                  line=line,
                                  useChar=self.useChar,
                                  useWord=self.useWord,
                                  useCache=self.useCache,
                                  sampling=False,
                                  train=False,
                                  unkReplacingRate=0.0)
                for line in lines
            ]
            charIds = None
            charIds_cpu = None
            wordIds = None
            if self.useChar:
                # idpair[0] is the per-word character id sequence.
                charIds_cpu = [idpair[0] for idpair in ids]
                charIds_cpu = [[np.array(word, 'i') for word in idLine]
                               for idLine in charIds_cpu]
            if self.useWord and not self.useCache:
                # idpair[1] is the word id sequence.
                wordIds = [idpair[1] for idpair in ids]
                wordIds = [np.array(idLine, 'i') for idLine in wordIds]
            if self.gpuid >= 0:
                if charIds_cpu:
                    charIds = [cuda.to_gpu(idLine) for idLine in charIds_cpu]
                if wordIds:
                    wordIds = cuda.to_gpu(wordIds)
            else:
                charIds = charIds_cpu
            # downstream classifier forward pass; CPU char ids are passed
            # alongside the (possibly GPU) ids — presumably for cache
            # lookups inside the model; confirm against the model.
            zs = self.model(charIds, wordIds, charIds_cpu)
            preds += np.argmax(zs.data, axis=1).tolist()
        print('time:', time() - startTime)
        score = f1_score(self.ds.labels[ty], preds, average='micro')
        scores.append(score)
        print('valid' if ty == 1 else 'test')
        print(score)
        print(classification_report(self.ds.labels[ty], preds))
    scores_str = '%f\t%f' % (scores[0], scores[1])
    self.eval_logger.write(scores_str)
    return scores
def epochProcess(self, epoch, opt):
    """Run one training epoch over the train split (ds.idData[0]).

    Shuffles the training indices, iterates mini-batches, computes the
    classification loss, backpropagates, and applies one optimizer update
    per batch. Each batch loss is written to the loss logger.

    Args:
        epoch: epoch index, used only for progress printing.
        opt: a Chainer optimizer already set up on self.model.
    """
    print(epoch)
    indices = np.random.permutation(len(self.ds.idData[0]))
    batches = module.pack(indices, config.batchSize)
    st = time()
    for i, batch in enumerate(batches):
        # Clear accumulated gradients once per batch (the original also
        # called cleargrads() a second time right before backward(); one
        # call suffices).
        self.model.cleargrads()
        ids = [
            self.ds.getIdLine(
                indice=(0, b),  # indice=0 means train dataset
                line=None,
                useChar=self.useChar,
                useWord=self.useWord,
                useCache=self.useCache,
                sampling=self.sampling,
                train=self.uniTrain,
                unkReplacingRate=0.0)
            for b in batch
        ]
        charIds = None
        charIds_cpu = None
        wordIds = None
        if self.useChar:
            # idpair[0] is the per-word character id sequence.
            charIds_cpu = [idpair[0] for idpair in ids]
            charIds_cpu = [[np.array(word, 'i') for word in idLine]
                           for idLine in charIds_cpu]
        if self.useWord and not self.useCache:
            # idpair[1] is the word id sequence.
            wordIds = [idpair[1] for idpair in ids]
            wordIds = [np.array(idLine, 'i') for idLine in wordIds]
        ts_cpu = [self.ds.labels[0][b] for b in batch]
        ts_cpu = np.array(ts_cpu, 'i')
        if self.gpuid >= 0:
            if charIds_cpu:
                charIds = [cuda.to_gpu(idLine) for idLine in charIds_cpu]
            if wordIds:
                wordIds = cuda.to_gpu(wordIds)
            ts = cuda.to_gpu(ts_cpu)
        else:
            charIds = charIds_cpu
            ts = ts_cpu
        loss = self.model.getLoss(charIds, wordIds, ts, charIds_cpu)
        # Abort the whole run on a NaN loss (matches original behavior of
        # exiting the process rather than raising).
        if self.xp.isnan(loss.data):
            print('nan')
            exit()
        print('epoch:%d batch:(%d/%d) loss:%f' %
              (epoch, i + 1, len(batches), loss.data.tolist()))
        loss.backward()
        opt.update()
        self.loss_logger.write(loss.data)
    processTime = time() - st
    print('time:', processTime)
    print('time/sent', processTime / len(indices))
def evaluate(ds, model):
    """Evaluate `model` on the valid and test splits of `ds`.

    For each split, runs batched inference, prints micro-F1 and a
    classification report, and collects per-example artifacts:
    gold labels, underscore-joined segmented lines, and softmax
    distributions.

    NOTE(review): `gpuid`, `useChar`, `useWord`, `useCache` are read as
    module-level globals here (unlike the method variant, which uses
    self.*) — confirm they are defined at module scope.

    Args:
        ds: dataset object providing data/labels/getIdLine/ids2chars.
        model: the downstream classifier.

    Returns:
        tuple: (golds, segLines, dists) aggregated over both splits.
    """
    if gpuid >= 0:
        cuda.get_device(gpuid).use()
        model.to_gpu(gpuid)
    segLines = []
    dists = []
    golds = []
    scores = []
    for ty in [1, 2]:  # ty=1,2 means valid, test respectively
        indices = list(range(len(ds.labels[ty])))
        batches = module.pack(indices, config.batchSize)
        preds = []
        for batch in batches:
            lines = [ds.data[ty][b] for b in batch]
            # Deterministic id conversion: no sampling, no unk replacement.
            ids = [
                ds.getIdLine(indice=None,
                             line=line,
                             useChar=useChar,
                             useWord=useWord,
                             useCache=useCache,
                             sampling=False,
                             train=False,
                             unkReplacingRate=0.0)
                for line in lines
            ]
            charIds = None
            charIds_cpu = None
            wordIds = None
            if useChar:
                charIds_cpu = [idpair[0] for idpair in ids]
                charIds_cpu = [[np.array(word, 'i') for word in idLine]
                               for idLine in charIds_cpu]
            if useWord and not useCache:
                wordIds = [idpair[1] for idpair in ids]
                wordIds = [np.array(idLine, 'i') for idLine in wordIds]
            if gpuid >= 0:
                if charIds_cpu:
                    charIds = [cuda.to_gpu(idLine) for idLine in charIds_cpu]
                if wordIds:
                    wordIds = cuda.to_gpu(wordIds)
            else:
                charIds = charIds_cpu
            # downstream classifier forward pass.
            zs = model(charIds, wordIds, charIds_cpu)
            preds += np.argmax(zs.data, axis=1).tolist()
            zs = F.softmax(zs)
            for i, idLine in enumerate(ids):
                idLine = idLine[0]  # char ids
                # Reconstruct the segmented surface form, words joined by '_'.
                segLine = '_'.join([''.join(ds.ids2chars(w)) for w in idLine])
                segLines.append(segLine)
                dists.append(zs[i])
            golds += [ds.labels[ty][b] for b in batch]
        score = f1_score(ds.labels[ty], preds, average='micro')
        scores.append(score)
        print('valid' if ty == 1 else 'test')
        print(score)
        print(classification_report(ds.labels[ty], preds))
    scores_str = '%f\t%f' % (scores[0], scores[1])
    # BUG FIX: original printed undefined name `score_str` (NameError);
    # the variable is `scores_str`.
    print(scores_str)
    return golds, segLines, dists