import random

import torch

# HyperParams, Alphabet, Reader, Feature, Example, Eval and the model classes
# (RNNLabeler, Encoder, CRF, DataSet, IndexSet, PretrainEmb, BatchBucket) used
# below are project-specific helpers assumed to be defined elsewhere in the
# package. The code targets the legacy Variable-based PyTorch API
# (torch.autograd.Variable, loss.data[0]).


class Labeler:
    def __init__(self):
        self.word_state = {}
        self.label_state = {}
        self.hyperParams = HyperParams()

    def createAlphabet(self, trainInsts, devInsts, testInsts):
        print("create alpha.................")
        for inst in trainInsts:
            for w in inst.words:
                if w not in self.word_state:
                    self.word_state[w] = 1
                else:
                    self.word_state[w] += 1

            for l in inst.labels:
                if l not in self.label_state:
                    self.label_state[l] = 1
                else:
                    self.label_state[l] += 1

        print("word state:", len(self.word_state))
        self.addTestAlpha(devInsts)
        print("word state:", len(self.word_state))
        self.addTestAlpha(testInsts)
        print("word state:", len(self.word_state))

        self.word_state[self.hyperParams.unk] = self.hyperParams.wordCutOff + 1
        self.hyperParams.wordAlpha.initial(self.word_state,
                                           self.hyperParams.wordCutOff)
        self.hyperParams.wordAlpha.set_fixed_flag(True)
        self.hyperParams.wordNum = self.hyperParams.wordAlpha.m_size
        self.hyperParams.unkWordID = self.hyperParams.wordAlpha.from_string(
            self.hyperParams.unk)

        self.hyperParams.labelAlpha.initial(self.label_state)
        self.hyperParams.labelAlpha.set_fixed_flag(True)
        self.hyperParams.labelSize = self.hyperParams.labelAlpha.m_size

        print("Label num: ", self.hyperParams.labelSize)
        print("Word num: ", self.hyperParams.wordNum)

    def addTestAlpha(self, insts):
        # When the word embeddings are not fine-tuned, dev/test words can be added
        # to the vocabulary as well: their pretrained vectors are not updated
        # during training.
        print("Add test alpha.............")
        if not self.hyperParams.wordFineTune:
            for inst in insts:
                for w in inst.words:
                    if w not in self.word_state:
                        self.word_state[w] = 1
                    else:
                        self.word_state[w] += 1

    def extractFeature(self, inst):
        feat = Feature()
        for w in inst.words:
            wordId = self.hyperParams.wordAlpha.from_string(w)
            if wordId == -1:
                feat.wordIndexs.append(self.hyperParams.unkWordID)
            else:
                feat.wordIndexs.append(wordId)
        feat.wordIndexs = torch.autograd.Variable(
            torch.LongTensor(feat.wordIndexs))
        return feat

    def instance2Example(self, insts):
        exams = []
        for inst in insts:
            example = Example()
            example.feat = self.extractFeature(inst)
            for l in inst.labels:
                labelId = self.hyperParams.labelAlpha.from_string(l)
                example.labelIndexs.append(labelId)
            example.labelIndexs = torch.autograd.Variable(
                torch.LongTensor(example.labelIndexs))
            exams.append(example)
        return exams

    def train(self, train_file, dev_file, test_file):
        self.hyperParams.show()
        torch.set_num_threads(self.hyperParams.thread)
        reader = Reader(self.hyperParams.maxInstance)

        trainInsts = reader.readInstances(train_file)
        devInsts = reader.readInstances(dev_file)
        testInsts = reader.readInstances(test_file)
        print("Training Instance: ", len(trainInsts))
        print("Dev Instance: ", len(devInsts))
        print("Test Instance: ", len(testInsts))

        self.createAlphabet(trainInsts, devInsts, testInsts)

        trainExamples = self.instance2Example(trainInsts)
        devExamples = self.instance2Example(devInsts)
        testExamples = self.instance2Example(testInsts)

        self.model = RNNLabeler(self.hyperParams)
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        optimizer = torch.optim.Adagrad(parameters,
                                        lr=self.hyperParams.learningRate)

        indexes = []
        for idx in range(len(trainExamples)):
            indexes.append(idx)
        for iter in range(self.hyperParams.maxIter):
            print('###Iteration' + str(iter) + "###")
            random.shuffle(indexes)
            for idx in range(len(trainExamples)):
                self.model.zero_grad()
                self.model.LSTMHidden = self.model.init_hidden()
                exam = trainExamples[indexes[idx]]
                lstm_feats = self.model(exam.feat)
                loss = self.model.crf.neg_log_likelihood(
                    lstm_feats, exam.labelIndexs)
                loss.backward()
                optimizer.step()
                if (idx + 1) % self.hyperParams.verboseIter == 0:
                    print('current: ', idx + 1, ", cost:", loss.data[0])

            eval_dev = Eval()
            for idx in range(len(devExamples)):
                predictLabels = self.predict(devExamples[idx])
                devInsts[idx].evalPRF(predictLabels, eval_dev)
            print('Dev: ', end="")
            eval_dev.getFscore()

            eval_test = Eval()
            for idx in range(len(testExamples)):
                predictLabels = self.predict(testExamples[idx])
                testInsts[idx].evalPRF(predictLabels, eval_test)
            print('Test: ', end="")
            eval_test.getFscore()

    def predict(self, exam):
        tag_hiddens = self.model(exam.feat)
        _, best_path = self.model.crf._viterbi_decode(tag_hiddens)
        predictLabels = []
        for idx in range(len(best_path)):
            predictLabels.append(
                self.hyperParams.labelAlpha.from_id(best_path[idx]))
        return predictLabels

    def getMaxIndex(self, tag_score):
        maxScore = tag_score.data[0]
        maxIndex = 0
        for idx in range(1, self.hyperParams.labelSize):
            if tag_score.data[idx] > maxScore:
                maxScore = tag_score.data[idx]
                maxIndex = idx
        return maxIndex
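
The Alphabet class is not shown in these examples; the following minimal sketch, inferred only from how it is called above (initial, set_fixed_flag, from_string, from_id, m_size), illustrates one plausible interface. It is an assumption, not the project's actual implementation (and note that Example #4 below constructs its Alphabet differently, from a DataSet).

class Alphabet:
    # Minimal string <-> id mapping matching the calls used in the examples (a sketch).
    def __init__(self):
        self.string2id = {}
        self.id2string = []
        self.m_size = 0
        self.fixed = False

    def initial(self, stat, cutoff=0):
        # Keep only entries whose frequency exceeds the cutoff.
        for key, freq in stat.items():
            if freq > cutoff:
                self.from_string(key)

    def set_fixed_flag(self, fixed):
        self.fixed = fixed

    def from_string(self, s):
        if s in self.string2id:
            return self.string2id[s]
        if self.fixed:
            return -1  # unknown entries map to -1 once the alphabet is frozen
        idx = self.m_size
        self.string2id[s] = idx
        self.id2string.append(s)
        self.m_size += 1
        return idx

    def from_id(self, idx):
        return self.id2string[idx]
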
Example #2
class Labeler:
    def __init__(self):
        self.word_state = {}
        self.label_state = {}
        self.hyperParams = HyperParams()
        self.wordAlpha = Alphabet()
        self.labelAlpha = Alphabet()

    def createAlphabet(self, trainInsts):
        for inst in trainInsts:
            for w in inst.words:
                if w not in self.word_state:
                    self.word_state[w] = 1
                else:
                    self.word_state[w] += 1

            for l in inst.labels:
                if l not in self.label_state:
                    self.label_state[l] = 1
                else:
                    self.label_state[l] += 1

        self.wordAlpha.initial(self.word_state, self.hyperParams.wordCutOff)
        self.labelAlpha.initial(self.label_state)

        self.labelAlpha.set_fixed_flag(True)
        self.wordAlpha.set_fixed_flag(True)

        self.hyperParams.wordNum = self.wordAlpha.m_size
        self.hyperParams.labelSize = self.labelAlpha.m_size

        print("word num: ", self.hyperParams.wordNum)
        print("label num: ", self.hyperParams.labelSize)

    def extractFeature(self, inst):
        # Note: unlike Example #1 there is no UNK fallback here, so every word is
        # assumed to be present in the word alphabet.
        feat = Feature()
        for w in inst.words:
            wordId = self.wordAlpha.from_string(w)
            feat.wordIndexs.append(wordId)
        feat.wordIndexs = torch.autograd.Variable(
            torch.LongTensor(feat.wordIndexs))
        return feat

    def instance2Example(self, insts):
        exams = []
        for inst in insts:
            example = Example()
            example.feat = self.extractFeature(inst)
            for l in inst.labels:
                labelId = self.labelAlpha.from_string(l)
                example.labelIndexs.append(labelId)
            example.labelIndexs = torch.autograd.Variable(
                torch.LongTensor(example.labelIndexs))
            exams.append(example)
        return exams

    def train(self, train_file, dev_file, test_file):
        self.hyperParams.show()
        torch.set_num_threads(self.hyperParams.thread)
        reader = Reader(self.hyperParams.maxInstance)

        trainInsts = reader.readInstances(train_file)
        devInsts = reader.readInstances(dev_file)

        print("Training Instance: ", len(trainInsts))
        print("Dev Instance: ", len(devInsts))

        # Build the alphabets before indexing the instances, so that word/label ids
        # (and the word cutoff) are in place when from_string is called.
        self.createAlphabet(trainInsts)

        trainExamples = self.instance2Example(trainInsts)
        devExamples = self.instance2Example(devInsts)

        self.model = RNNLabeler(self.hyperParams)
        optimizer = torch.optim.Adagrad(self.model.parameters(),
                                        lr=self.hyperParams.learningRate)

        indexes = []
        for idx in range(len(trainExamples)):
            indexes.append(idx)

        for iter in range(self.hyperParams.maxIter):
            print('###Iteration' + str(iter) + "###")
            random.shuffle(indexes)
            for idx in range(len(trainExamples)):
                self.model.zero_grad()
                self.model.LSTMHidden = self.model.init_hidden()
                exam = trainExamples[indexes[idx]]
                tag_scores = self.model(exam.feat)
                loss = torch.nn.functional.cross_entropy(
                    tag_scores, exam.labelIndexs)
                loss.backward()
                optimizer.step()
                if (idx + 1) % self.hyperParams.verboseIter == 0:
                    print('current: ', idx + 1, ", cost:", loss.data[0])

            eval_dev = Eval()
            for idx in range(len(devExamples)):
                predictLabels = self.predict(devExamples[idx])
                devInsts[idx].evalPRF(predictLabels, eval_dev)
            eval_dev.getFscore()

    def predict(self, exam):
        tag_scores = self.model(exam.feat)
        if len(tag_scores) != len(exam.labelIndexs) or len(
                tag_scores.data[0]) != self.hyperParams.labelSize:
            print("error: tag_scores shape does not match sentence length or label size")
        predictIndexs = []
        for idx in range(len(tag_scores)):
            pred_idx = self.getMaxIndex(tag_scores[idx])
            predictIndexs.append(pred_idx)
        predictLabels = []
        for idx in range(len(tag_scores)):
            predictLabels.append(self.labelAlpha.from_id(predictIndexs[idx]))
        return predictLabels

    def getMaxIndex(self, tag_score):
        maxScore = tag_score.data[0]
        maxIndex = 0
        for idx in range(1, self.hyperParams.labelSize):
            if tag_score.data[idx] > maxScore:
                maxScore = tag_score.data[idx]
                maxIndex = idx
        return maxIndex
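
The getMaxIndex loop can also be written with torch.max instead of a Python loop. A small sketch, equivalent for a 1-D score vector such as tag_scores[idx] in Example #2 (newer PyTorch versions additionally offer torch.argmax):

def get_max_index(tag_score):
    # torch.max over dimension 0 returns (values, indices); int() extracts the
    # argmax label id from the one-element index tensor.
    _, max_index = torch.max(tag_score.data, 0)
    return int(max_index)
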
Example #3
class Labeler:
    def __init__(self):
        self.word_state = {}
        self.label_state = {}
        self.hyperParams = HyperParams()

    def createAlphabet(self, trainInsts, devInsts, testInsts):
        print("create alpha.................")
        for inst in trainInsts:
            for w in inst.words:
                if w not in self.word_state:
                    self.word_state[w] = 1
                else:
                    self.word_state[w] += 1

            l = inst.label
            if l not in self.label_state:
                self.label_state[l] = 1
            else:
                self.label_state[l] += 1

        print("word state:", len(self.word_state))
        self.addTestAlpha(devInsts)
        print("word state:", len(self.word_state))
        self.addTestAlpha(testInsts)
        print("word state:", len(self.word_state))

        self.word_state[self.hyperParams.unk] = self.hyperParams.wordCutOff + 1
        self.word_state[self.hyperParams.padding] = self.hyperParams.wordCutOff + 1

        self.hyperParams.wordAlpha.initial(self.word_state, self.hyperParams.wordCutOff)
        self.hyperParams.wordAlpha.set_fixed_flag(True)

        self.hyperParams.wordNum = self.hyperParams.wordAlpha.m_size

        self.hyperParams.unkWordID = self.hyperParams.wordAlpha.from_string(self.hyperParams.unk)
        self.hyperParams.paddingID = self.hyperParams.wordAlpha.from_string(self.hyperParams.padding)

        self.hyperParams.labelAlpha.initial(self.label_state)
        self.hyperParams.labelAlpha.set_fixed_flag(True)
        self.hyperParams.labelSize = self.hyperParams.labelAlpha.m_size

        print("Label num: ", self.hyperParams.labelSize)
        print("Word num: ", self.hyperParams.wordNum)
        print("Padding ID: ", self.hyperParams.paddingID)
        print("UNK ID: ", self.hyperParams.unkWordID)

    def addTestAlpha(self, insts):
        print("Add test alpha.............")
        if not self.hyperParams.wordFineTune:
            for inst in insts:
                for w in inst.words:
                    if w not in self.word_state:
                        self.word_state[w] = 1
                    else:
                        self.word_state[w] += 1

    def extractFeature(self, inst):
        feat = Feature()
        feat.sentLen = len(inst.words)
        feat.wordIndexs = torch.autograd.Variable(torch.LongTensor(1, feat.sentLen))
        for idx in range(len(inst.words)):
            w = inst.words[idx]
            wordId = self.hyperParams.wordAlpha.from_string(w)
            if wordId == -1:
                wordId = self.hyperParams.unkWordID
            feat.wordIndexs.data[0][idx] = wordId
        return feat

    def instance2Example(self, insts):
        exams = []
        for inst in insts:
            example = Example()
            example.labelIndex = torch.autograd.Variable(torch.LongTensor(1))
            example.feat = self.extractFeature(inst)
            l = inst.label
            labelId = self.hyperParams.labelAlpha.from_string(l)
            example.labelIndex.data[0] = labelId
            exams.append(example)
        return exams

    def getBatchFeatLabel(self, exams):
        # Pad every sentence in the batch to the length of the longest one,
        # truncating at a hard-coded cap of 40 tokens.
        maxSentSize = 0
        for e in exams:
            if maxSentSize < e.feat.sentLen:
                maxSentSize = e.feat.sentLen
        if maxSentSize > 40:
            maxSentSize = 40
        batch_feats = torch.autograd.Variable(torch.LongTensor(self.hyperParams.batch, maxSentSize))
        batch_labels = torch.autograd.Variable(torch.LongTensor(self.hyperParams.batch))

        for idx in range(len(batch_feats.data)):
            e = exams[idx]
            batch_labels.data[idx] = e.labelIndex.data[0]
            for idy in range(maxSentSize):
                if idy < e.feat.sentLen:
                    batch_feats.data[idx][idy] = e.feat.wordIndexs.data[0][idy]
                else:
                    batch_feats.data[idx][idy] = self.hyperParams.paddingID
        return batch_feats, batch_labels

    def train(self, train_file, dev_file, test_file):
        self.hyperParams.show()
        torch.set_num_threads(self.hyperParams.thread)
        reader = Reader()

        trainInsts = reader.readInstances(train_file, self.hyperParams.maxInstance)
        devInsts = reader.readInstances(dev_file, self.hyperParams.maxInstance)
        testInsts = reader.readInstances(test_file, self.hyperParams.maxInstance)

        print("Training Instance: ", len(trainInsts))
        print("Dev Instance: ", len(devInsts))
        print("Test Instance: ", len(testInsts))

        self.createAlphabet(trainInsts, devInsts, testInsts)

        trainExamples = self.instance2Example(trainInsts)
        devExamples = self.instance2Example(devInsts)
        testExamples = self.instance2Example(testInsts)

        self.model = RNNLabeler(self.hyperParams)
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        optimizer = torch.optim.Adam(parameters, lr=self.hyperParams.learningRate)

        indexes = []
        for idx in range(len(trainExamples)):
            indexes.append(idx)

        batchBlock = len(trainExamples) // self.hyperParams.batch
        for iter in range(self.hyperParams.maxIter):
            print('###Iteration' + str(iter) + "###")
            random.shuffle(indexes)
            self.model.train()
            for updateIter in range(batchBlock):
                #self.model.zero_grad()
                optimizer.zero_grad()
                exams = []
                start_pos = updateIter * self.hyperParams.batch
                end_pos = (updateIter + 1) * self.hyperParams.batch
                for idx in range(start_pos, end_pos):
                    exams.append(trainExamples[indexes[idx]])
                feats, labels = self.getBatchFeatLabel(exams)
                output = self.model(feats, self.hyperParams.batch)
                loss = torch.nn.functional.cross_entropy(output, labels)
                loss.backward()
                optimizer.step()
                if (updateIter + 1) % self.hyperParams.verboseIter == 0:
                    print('current: ', end_pos, ", cost:", loss.data[0])

            self.model.eval()
            eval_dev = Eval()
            for idx in range(len(devExamples)):
                predictLabel = self.predict(devExamples[idx])
                devInsts[idx].evalACC(predictLabel, eval_dev)
            print("dev: ", end='')
            eval_dev.getACC()

            eval_test = Eval()
            for idx in range(len(testExamples)):
                predictLabel = self.predict(testExamples[idx])
                testInsts[idx].evalACC(predictLabel, eval_test)
            print("test: ", end='')
            eval_test.getACC()

    def predict(self, exam):
        # A single sentence is decoded here, so batch size 1 is passed explicitly
        # (assuming the model's forward takes the same (feats, batch) signature
        # used during training).
        output = self.model(exam.feat.wordIndexs, 1)
        labelID = self.getMaxIndex(output)
        return self.hyperParams.labelAlpha.from_id(labelID)

    def getMaxIndex(self, tag_score):
        maxScore = tag_score.data[0][0]
        maxIndex = 0
        for idx in range(1, self.hyperParams.labelSize):
            if tag_score.data[0][idx] > maxScore:
                maxScore = tag_score.data[0][idx]
                maxIndex = idx
        return maxIndex
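
A minimal driver for any of these Labeler variants might look like the sketch below. The file paths are placeholders, not paths from the original project.

if __name__ == '__main__':
    # Hypothetical corpus paths; substitute the real train/dev/test files.
    labeler = Labeler()
    labeler.train('train.txt', 'dev.txt', 'test.txt')
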
Example #4
class Labeler:
    def __init__(self):
        self.hyperParams = HyperParams()
    def train(self, trainFile, devFile, testFile):
        self.hyperParams.show()
        train = DataSet(trainFile)
        dev = DataSet(devFile)
        test = DataSet(testFile)

        vocab = Alphabet(train)
        self.hyperParams.vocabSize = vocab.size()
        self.hyperParams.labelSize = vocab.label_size()
        print('vocab_size:', self.hyperParams.vocabSize)
        print('label_size:', self.hyperParams.labelSize)

        train = IndexSet(train, vocab)
        dev = IndexSet(dev, vocab)
        test = IndexSet(test, vocab)

        print('trainset_size: ', train.size())
        print('devset_size: ', dev.size())
        print('testset_size: ', test.size())
        if self.hyperParams.embedFile != '':
            pretrain = PretrainEmb(self.hyperParams.embedFile, vocab.word2id)
        else:
            pretrain = None
            
        # Encoder (feature extractor) and CRF decoder, each with its own Adam optimizer.
        self.model = Encoder(self.hyperParams, pretrain)
        self.crf = CRF(self.hyperParams.labelSize, vocab.label2id['<start>'],
                       vocab.label2id['<padding>'], vocab)

        parameters = filter(lambda p: p.requires_grad, self.model.parameters())

        optimizer_rnn = torch.optim.Adam(parameters, lr=self.hyperParams.learningRate)
        optimizer_crf = torch.optim.Adam(self.crf.parameters(), lr=self.hyperParams.learningRate)
        
        indexes = []
        for idx in range(train.size()):
            indexes.append(idx)

        batchBlock = len(train.word_mat) // self.hyperParams.batch
        for iter in range(self.hyperParams.maxIter):
            print('###Iteration' + str(iter) + "###")
            random.shuffle(indexes)

            self.model.train()

            for updateIter in range(batchBlock):
                optimizer_rnn.zero_grad()
                optimizer_crf.zero_grad()

                start_pos = updateIter * self.hyperParams.batch
                end_pos = (updateIter + 1) * self.hyperParams.batch
                feats = []
                labels = []
                for idx in range(start_pos, end_pos):
                    feats.append(train.word_mat[indexes[idx]])
                    labels.append(train.label_mat[indexes[idx]])
                batch = BatchBucket(len(feats), self.hyperParams.maxSentSize, feats, labels,
                                    vocab.word2id['<padding>'], vocab.label2id['<padding>'])
                tag_scores = self.model(batch.batch_words, self.hyperParams.batch)
                loss = self.crf.neg_log_likelihood(tag_scores, batch.batch_labels, batch.masks)
                loss.backward()

                optimizer_rnn.step()
                optimizer_crf.step()

                if (updateIter + 1) % self.hyperParams.verboseIter == 0:
                    print('current: ', end_pos, ", cost:", loss.data[0])

            self.model.eval()
            self.eval_predict(dev, vocab)
            self.eval_predict(test, vocab)
            
    def eval_predict(self, indexset, vocab):
        correct_num = 0
        total_num = 0
        batchBlock = len(indexset.label_mat) // self.hyperParams.batch
        for updateIter in range(batchBlock):
            start_pos = updateIter * self.hyperParams.batch
            end_pos = (updateIter + 1) * self.hyperParams.batch
            feats = []
            labels = []
            for idx in range(start_pos, end_pos):
                feats.append(indexset.word_mat[idx])
                labels.append(indexset.label_mat[idx])
            batch = BatchBucket(len(feats), self.hyperParams.maxSentSize, feats, labels,
                                vocab.word2id['<padding>'], vocab.label2id['<padding>'])
            tag_scores = self.model(batch.batch_words, self.hyperParams.batch)
            predict_labels = self.crf.viterbi_decode(tag_scores, batch.masks)
            predict_labels = predict_labels.masked_select(batch.masks)
            gold_labels = batch.batch_labels.masked_select(batch.masks)
            # Count positions where the predicted label equals the gold label
            # (torch.eq, not torch.gt, which would count "greater than").
            correct_num += torch.sum(torch.eq(predict_labels, gold_labels)).data[0]
            total_num += torch.sum(batch.masks).data[0]

        rate = correct_num / total_num
        print('total_num: {} , correct_num: {}'.format(total_num, correct_num))
        print('rate: ', rate)
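
As a sanity check of the masked accuracy computed in eval_predict, here is a self-contained toy computation. It uses the modern tensor API (torch.tensor, boolean masks) and hypothetical label values, purely for illustration:

import torch

# Two padded sentences (0 is the padding label); the mask marks real tokens.
gold = torch.tensor([[1, 2, 3, 0],
                     [4, 5, 0, 0]])
pred = torch.tensor([[1, 2, 9, 0],
                     [4, 5, 0, 0]])
mask = torch.tensor([[True, True, True, False],
                     [True, True, False, False]])

correct_num = torch.sum(torch.eq(pred.masked_select(mask),
                                 gold.masked_select(mask)))
total_num = torch.sum(mask)
print(int(correct_num), int(total_num))  # 4 correct out of 5 real tokens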