Ejemplo n.º 1
0
    def train(self, beginEpoch, endEpoch, batchSize, showSeg):
        """Train the language model over epochs [beginEpoch, endEpoch).

        A run starting at epoch 0 builds the dataset and language model
        from scratch; any other starting epoch resumes from saved state.
        Model state is saved after every epoch.
        """
        # Fresh run builds everything; a resumed run restores saved state.
        if beginEpoch == 0:
            self.setupDS()
            self.setupLM()
        else:
            self.load()

        self.opt = optimizers.Adam()
        self.opt.setup(self.lm)

        for ep in range(beginEpoch, endEpoch):
            # Shuffle sentence indices each epoch, then chunk into batches.
            shuffled = np.random.permutation(len(self.ds.idData))
            batches = module.pack(shuffled, batchSize)
            for idx, batch in enumerate(batches):
                began = time()
                self.batchProcess(batch, showSeg)
                elapsed = time() - began
                print('epoch:%d\tbatch:%d/%d\ttime:%f' %
                      (ep, idx + 1, len(batches), elapsed))
            self.save()
Ejemplo n.º 2
0
    def segmentate(self, textPath, batchSize):
        """Segment the text file at textPath, yielding one segmented line at a time.

        Loads the saved model, converts the text to id sequences, and for each
        batch resamples the segmentation before yielding the space-joined
        segmented string for every line in the batch.
        """
        self.load()

        # set text data
        self.ds.setIdData(textPath)

        allIndices = np.arange(len(self.ds.idData))
        for batch in module.pack(allIndices, batchSize):
            inVoc = self.ds.getInVoc()
            # [1:-1] drops the first and last id of each line
            # (presumably sentence boundary markers — confirm in DataSet).
            trimmedLines = [self.ds.idData[b][1:-1] for b in batch]
            sampler.setNgram(trimmedLines, self.lm, self.ds, inVoc)

            for b in batch:
                idLine = self.ds.idData[b]
                segmentation = sampler.track(idLine[1:-1], self.lm, self.ds)
                self.ds.segData[b] = segmentation
                yield ' '.join(self.ds.getSegedLine(b))
Ejemplo n.º 3
0
    def evaluate(self, epoch):
        """Evaluate the downstream classifier on the valid and test splits.

        Args:
            epoch: current epoch number (accepted for caller symmetry; not
                used inside this method).

        Returns:
            list[float]: ``[valid_score, test_score]`` micro-averaged F1.

        Side effects: prints progress/scores and writes a tab-separated
        score line via ``self.eval_logger``.
        """
        if self.gpuid >= 0:
            cuda.get_device(self.gpuid).use()
            self.model.to_gpu(self.gpuid)

        scores = []
        for ty in [1, 2]:  # ty=1,2 means valid, test respectively
            indices = list(range(len(self.ds.labels[ty])))
            batches = module.pack(indices, config.batchSize)

            preds = []

            print('eval...')
            startTime = time()
            for batch in tqdm(batches):
                lines = [self.ds.data[ty][b] for b in batch]
                # Deterministic id conversion: no sampling, no unk replacement.
                ids = [
                    self.ds.getIdLine(indice=None,
                                      line=line,
                                      useChar=self.useChar,
                                      useWord=self.useWord,
                                      useCache=self.useCache,
                                      sampling=False,
                                      train=False,
                                      unkReplacingRate=0.0) for line in lines
                ]

                charIds = None
                charIds_cpu = None
                wordIds = None

                if self.useChar:
                    # getIdLine yields (char ids, word ids) pairs; keep the
                    # char side as per-word int32 arrays.
                    charIds_cpu = [idpair[0] for idpair in ids]
                    charIds_cpu = [[np.array(word, 'i') for word in idLine]
                                   for idLine in charIds_cpu]
                if self.useWord and not self.useCache:
                    wordIds = [idpair[1] for idpair in ids]
                    wordIds = [np.array(idLine, 'i') for idLine in wordIds]

                if self.gpuid >= 0:
                    if charIds_cpu:
                        charIds = [
                            cuda.to_gpu(idLine) for idLine in charIds_cpu
                        ]
                    if wordIds:
                        wordIds = cuda.to_gpu(wordIds)
                else:
                    # CPU path: model consumes host arrays directly.
                    charIds = charIds_cpu

                # downstream
                zs = self.model(charIds, wordIds, charIds_cpu)
                preds += np.argmax(zs.data, axis=1).tolist()

            print('time:', time() - startTime)

            score = f1_score(self.ds.labels[ty], preds, average='micro')
            scores.append(score)
            print('valid' if ty == 1 else 'test')
            print(score)
            print(classification_report(self.ds.labels[ty], preds))

        scores_str = '%f\t%f' % (scores[0], scores[1])
        self.eval_logger.write(scores_str)

        return scores
Ejemplo n.º 4
0
    def epochProcess(self, epoch, opt):
        """Run one training epoch over the train split (dataset index 0).

        Args:
            epoch: epoch number, used only for progress printing.
            opt: chainer optimizer already set up with ``self.model``.

        Side effects: updates model parameters, logs each batch loss via
        ``self.loss_logger``, and calls ``exit()`` if the loss becomes NaN.
        """
        print(epoch)

        indices = np.random.permutation(len(self.ds.idData[0]))
        batches = module.pack(indices, config.batchSize)
        st = time()

        for i, batch in enumerate(batches):
            ids = [
                self.ds.getIdLine(
                    indice=(0, b),  # indice=0 means train dataset
                    line=None,
                    useChar=self.useChar,
                    useWord=self.useWord,
                    useCache=self.useCache,
                    sampling=self.sampling,
                    train=self.uniTrain,
                    unkReplacingRate=0.0) for b in batch
            ]

            charIds = None
            charIds_cpu = None
            wordIds = None

            if self.useChar:
                # getIdLine yields (char ids, word ids) pairs; keep the char
                # side as per-word int32 arrays.
                charIds_cpu = [idpair[0] for idpair in ids]
                charIds_cpu = [[np.array(word, 'i') for word in idLine]
                               for idLine in charIds_cpu]
            if self.useWord and not self.useCache:
                wordIds = [idpair[1] for idpair in ids]
                wordIds = [np.array(idLine, 'i') for idLine in wordIds]

            ts_cpu = [self.ds.labels[0][b] for b in batch]
            ts_cpu = np.array(ts_cpu, 'i')

            if self.gpuid >= 0:
                if charIds_cpu:
                    charIds = [cuda.to_gpu(idLine) for idLine in charIds_cpu]
                if wordIds:
                    wordIds = cuda.to_gpu(wordIds)
                ts = cuda.to_gpu(ts_cpu)
            else:
                # CPU path: model consumes host arrays directly.
                charIds = charIds_cpu
                ts = ts_cpu

            loss = self.model.getLoss(charIds, wordIds, ts, charIds_cpu)

            # Abort immediately on numerical divergence instead of
            # continuing to train on NaN losses.
            if self.xp.isnan(loss.data):
                print('nan')
                exit()

            print('epoch:%d batch:(%d/%d) loss:%f' %
                  (epoch, i + 1, len(batches), loss.data.tolist()))

            # Clear stale gradients once, right before backprop (the second,
            # redundant cleargrads call at the top of the loop was removed).
            self.model.cleargrads()
            loss.backward()
            opt.update()
            self.loss_logger.write(loss.data)

        processTime = time() - st
        print('time:', processTime)
        print('time/sent', processTime / len(indices))
Ejemplo n.º 5
0
def evaluate(ds, model):
    """Evaluate ``model`` on the valid and test splits of ``ds``.

    Prints micro-averaged F1 and a classification report per split.

    Args:
        ds: dataset object providing ``data``, ``labels``, ``getIdLine`` and
            ``ids2chars``.
        model: downstream classifier callable as ``model(charIds, wordIds,
            charIds_cpu)``.

    Returns:
        tuple: ``(golds, segLines, dists)`` — gold labels, '_'-joined
        segmented lines, and per-line softmax distributions, accumulated
        over both splits.
    """
    if gpuid >= 0:
        cuda.get_device(gpuid).use()
        model.to_gpu(gpuid)

    segLines = []
    dists = []
    golds = []

    scores = []
    for ty in [1, 2]:  # ty=1,2 means valid, test respectively
        indices = list(range(len(ds.labels[ty])))
        batches = module.pack(indices, config.batchSize)

        preds = []

        for batch in batches:
            lines = [ds.data[ty][b] for b in batch]
            # Deterministic id conversion: no sampling, no unk replacement.
            ids = [ds.getIdLine(indice=None,
                                line=line,
                                useChar=useChar,
                                useWord=useWord,
                                useCache=useCache,
                                sampling=False,
                                train=False,
                                unkReplacingRate=0.0) for line in lines]

            charIds = None
            charIds_cpu = None
            wordIds = None

            if useChar:
                # getIdLine yields (char ids, word ids) pairs; keep the char
                # side as per-word int32 arrays.
                charIds_cpu = [idpair[0] for idpair in ids]
                charIds_cpu = [[np.array(word, 'i') for word in idLine]
                               for idLine in charIds_cpu]
            if useWord and not useCache:
                wordIds = [idpair[1] for idpair in ids]
                wordIds = [np.array(idLine, 'i') for idLine in wordIds]

            if gpuid >= 0:
                if charIds_cpu:
                    charIds = [cuda.to_gpu(idLine) for idLine in charIds_cpu]
                if wordIds:
                    wordIds = cuda.to_gpu(wordIds)
            else:
                # CPU path: model consumes host arrays directly.
                charIds = charIds_cpu

            # downstream
            zs = model(charIds, wordIds, charIds_cpu)
            preds += np.argmax(zs.data, axis=1).tolist()

            zs = F.softmax(zs)
            for i, idLine in enumerate(ids):
                idLine = idLine[0]  # char ids
                segLine = '_'.join([''.join(ds.ids2chars(w)) for w in idLine])
                segLines.append(segLine)
                dists.append(zs[i])
            golds += [ds.labels[ty][b] for b in batch]

        score = f1_score(ds.labels[ty], preds, average='micro')
        scores.append(score)
        print('valid' if ty == 1 else 'test')
        print(score)
        print(classification_report(ds.labels[ty], preds))

    # BUGFIX: was print(score_str), an undefined name -> NameError.
    scores_str = '%f\t%f' % (scores[0], scores[1])
    print(scores_str)

    return golds, segLines, dists