Esempio n. 1
0
    def train(self,N=1,k=1):
        """Run N passes of k-best online training over the loaded sentences.

        For every training sentence the gold dependencies and the sum of
        their feature vectors are collected, the sentence is parsed into k
        trees scored with the current weights, and new weights are obtained
        from self._update_weight.  The weights in effect after each sentence
        are accumulated into self.v.

        N -- number of passes over self._training_data.
        k -- number of trees requested from the parser per sentence; when
             fewer are returned the last one is repeated to reach k.
        """
        from dig_parser import DIGParser
        self._dig_parser = DIGParser()
        Q = spdiag([2]*self.features_size)
        G = spmatrix(0,[0],[0],(k,self.features_size))
        Wc = spmatrix(0,[0],[0],(self.features_size,1))
        self.v = spmatrix(0,[0],[0],(self.features_size,1))
        complete = N * len(self._training_data)
        step = 0
        for _ in range(N):
            for tid,(W,T,A,H) in enumerate(self._training_data):
                step += 1
                print('round %d/%d'%(step,complete))
                F = self.features_instance[tid]

                # H[c] is the head index of token c; -1 marks "no head".
                gold_deps,gold_features = [],Features()
                for c,h in enumerate(H):
                    if h == -1:
                        continue
                    gold_deps.append((h,c))
                    # Feature keys are always "<smaller>-<larger>-<dir>":
                    # 'L' when the head follows the child, 'R' otherwise.
                    if h > c:
                        gold_features += F['%d-%d-L'%(c,h)]
                    else:
                        gold_features += F['%d-%d-R'%(h,c)]
                gold_deps.sort()

                print('creating score table...')
                Sc = self._create_Sc(W,T,F,Wc)

                # Index 0 is skipped: only tokens 1.. are sent to the parser.
                text = ' '.join('%s/%s'%(W[i],T[i]) for i in range(1,len(W)))
                print('input length = %d'%(len(W)))
                print('parsing %d best...'%(k))
                trees = self._dig_parser.parse_mira_tagged_text(text,Sc,k,format='naist')

                if not trees:
                    # Nothing to build constraints from (and trees[-1] would
                    # raise IndexError): keep the current weights for this
                    # sentence but still accumulate them.
                    print('no parse returned; weights left unchanged')
                    self.v = self.v + Wc
                    print('done\n')
                    continue

                # Pad to exactly k trees by repeating the last one (a
                # non-positive repeat count yields an empty list).
                trees.extend([trees[-1]]*(k-len(trees)))

                print('updating weight...')
                Wc = self._update_weight(Q,G,Wc,F,gold_deps,gold_features,trees)
                self.v = self.v + Wc
                print('done\n')
Esempio n. 2
0
class MIRA:
    """k-best online trainer for dependency-parsing edge weights.

    Training sentences are read with read_training_file, per-edge feature
    vectors are cached in self.features_instance, and train() repeatedly
    parses each sentence and re-estimates the weights by solving a
    quadratic program.  The accumulated weights are kept in self.v and can
    be stored/restored with save_model/load_model.
    """

    def __init__(self,mode='normal'):
        """Create an empty trainer; `mode` is passed to FeaturesModel."""
        self._training_data = []
        self._f_model = FeaturesModel(mode)
        self.features_size = 0
        self.v = None

    def save_model(self,file_name='mira.model'):
        """Write the feature table and the normalised weights to disk.

        Two files are produced:
        file_name + '.f' -- bz2-compressed text, one "<feature> <id>" line
                            per entry of the training feature table.
        file_name + '.b' -- self.v divided by its Euclidean norm, stored as
                            consecutive native 4-byte floats.
        """
        import bz2,struct
        output = bz2.BZ2File(file_name+'.f','w')
        try:
            for k,v in self._f_model.training_features.table.items():
                output.write('%s %d\n'%(k,v))
        finally:
            output.close()

        norm = math.sqrt(sum(math.pow(self.v[i],2)
                             for i in range(self.features_size)))
        if norm == 0:
            # All weights are zero: write them unchanged instead of
            # dividing by zero.
            norm = 1.0

        # The with-block closes the file even when packing fails.
        with open(file_name+'.b','wb') as output:
            for i in range(self.features_size):
                output.write(struct.pack('f',self.v[i]/norm))

    def load_model(self,file_name):
        """Load a model previously written by save_model.

        Reads file_name + '.f' into the training feature table and
        file_name + '.b' into self.v (a column vector with one row per
        stored float).  Progress messages go to sys.__stderr__.
        """
        import bz2,struct
        sys.__stderr__.write('Loading model...\n')
        table_file = bz2.BZ2File(file_name+'.f','r')
        try:
            for line in table_file.readlines():
                k,v = line.strip().split()
                self._f_model.training_features.table[k] = int(v)
        finally:
            table_file.close()
        self._f_model.training_features.fid = len(self._f_model.training_features.table)

        self.features_size = self._f_model.get_size_of_features()
        sys.__stderr__.write('%d features are loaded\n'%(self.features_size))

        w = []
        with open(file_name+'.b','rb') as weight_file:
            while True:
                # Each weight was packed with struct format 'f' (4 bytes).
                chunk = weight_file.read(4)
                if not chunk:
                    break
                w.append(struct.unpack('f',chunk)[0])

        self.v = spmatrix(0,[0],[0],(len(w),1))
        for i,f in enumerate(w):
            # Only non-zero entries are assigned into the sparse vector.
            if f != 0:
                self.v[i] = f
        sys.__stderr__.write('weight vector (%d) are loaded\n'%(len(w)))

    def read_training_file(self,input_file):
        """Read training sentences and register their gold-edge features.

        Sentences are groups of non-blank lines separated by blank lines.
        Within a group the first four lines are tab-separated fields:
        words, POS tags, types and integer head indices.  A root entry
        ('<root>', '<root-POS>', '<no-type>', head -1) is prepended to each.
        Groups whose first line has fewer than two fields are skipped.

        Every accepted sentence is appended to self._training_data as
        (W, T, A, H); afterwards self.features_size is refreshed and the
        per-sentence feature instances are built.
        """
        with open(input_file) as training_file:
            lines = training_file.readlines()
        print('Reading Training Data...')
        total = 0
        # save words and tags
        tmp = []
        # The extra '' acts as a final blank line so that the last sentence
        # is processed even when the file does not end with one.
        for line in lines + ['']:
            if line.strip() != '':
                tmp.append(line.strip())
                continue
            if not tmp:
                # Consecutive blank lines: nothing collected yet.
                continue

            W = tmp[0].strip().split('\t')
            if len(W) < 2:
                tmp = []
                continue
            W = ['<root>'] + [x.strip().replace(' ','_') for x in W]
            T = ['<root-POS>'] + tmp[1].strip().split('\t')
            A = ['<no-type>'] + tmp[2].strip().split('\t')
            H = [-1] + [int(x) for x in tmp[3].strip().split('\t')]

            # Tokens tagged 'npn' are replaced by a placeholder; all other
            # tokens go through encode_number.
            for i,w in enumerate(W):
                if T[i] == 'npn':
                    W[i] = '<npn>'
                else:
                    W[i] = encode_number(w)

            self._training_data.append((W,T,A,H))
            b_table = create_between_pos_table(T)
            total += 1
            if total % 10 == 0: print(total)

            # j is the token index, i its head; -1 means "no head".
            for j,i in enumerate(H):
                if i == -1: continue
                if i > j:
                    self._f_model.insert_data(j,i,W,T,'L',b_table)
                else:
                    self._f_model.insert_data(i,j,W,T,'R',b_table)

            tmp = []
        self.features_size = self._f_model.get_size_of_features()
        print(total)
        print('Number of Features: %s'%(self.features_size,))

        self._create_features_instance()

    def _create_F(self,W,T):
        """Return a dict of feature vectors for every candidate edge.

        Keys are '<i>-<j>-<dir>' with i < j and dir in 'L'/'R'; values are
        whatever self._f_model.convert_data returns for that edge.
        """
        b_table = create_between_pos_table(T)
        F = {}
        for i in range(len(W)):
            for j in range(i+1,len(W)):
                for di in ('L','R'):
                    F['%d-%d-%s'%(i,j,di)] = self._f_model.convert_data(i,j,W,T,di,b_table)
        return F

    def _extract_features(self,s,t,di,tokens,b_table):
        """Convert one edge of a (word, POS) token sequence to features."""
        pairs = list(tokens)
        W = [w for w,p in pairs]
        T = [p for w,p in pairs]
        return self._f_model.convert_data(s,t,W,T,di,b_table)

    def _create_Sc(self,W,T,F,Wc):
        """Return a dict mapping every edge key of F to its score under Wc.

        The score of an edge is F[key].prod(Wc); keys follow the same
        '<i>-<j>-<dir>' scheme as _create_F.
        """
        Sc = {}
        for i in range(len(W)):
            for j in range(i+1,len(W)):
                for di in ('L','R'):
                    key = '%d-%d-%s'%(i,j,di)
                    Sc[key] = F[key].prod(Wc)
        return Sc

    def _create_features_instance(self):
        """Build and cache the edge feature dict of every training sentence."""
        self.features_instance = []
        for count,(W,T,A,H) in enumerate(self._training_data):
            self.features_instance.append(self._create_F(W,T))
            print('Create Features Instance: %d'%(count+1))

    def train(self,N=1,k=1):
        """Run N passes of k-best online training over the loaded sentences.

        For every training sentence the gold dependencies and the sum of
        their feature vectors are collected, the sentence is parsed into k
        trees scored with the current weights, and new weights are obtained
        from self._update_weight.  The weights in effect after each sentence
        are accumulated into self.v.

        N -- number of passes over self._training_data.
        k -- number of trees requested from the parser per sentence; when
             fewer are returned the last one is repeated to reach k.
        """
        from dig_parser import DIGParser
        self._dig_parser = DIGParser()
        Q = spdiag([2]*self.features_size)
        G = spmatrix(0,[0],[0],(k,self.features_size))
        Wc = spmatrix(0,[0],[0],(self.features_size,1))
        self.v = spmatrix(0,[0],[0],(self.features_size,1))
        complete = N * len(self._training_data)
        step = 0
        for _ in range(N):
            for tid,(W,T,A,H) in enumerate(self._training_data):
                step += 1
                print('round %d/%d'%(step,complete))
                F = self.features_instance[tid]

                # H[c] is the head index of token c; -1 marks "no head".
                gold_deps,gold_features = [],Features()
                for c,h in enumerate(H):
                    if h == -1:
                        continue
                    gold_deps.append((h,c))
                    # Feature keys are always "<smaller>-<larger>-<dir>":
                    # 'L' when the head follows the child, 'R' otherwise.
                    if h > c:
                        gold_features += F['%d-%d-L'%(c,h)]
                    else:
                        gold_features += F['%d-%d-R'%(h,c)]
                gold_deps.sort()

                print('creating score table...')
                Sc = self._create_Sc(W,T,F,Wc)

                # Index 0 is skipped: only tokens 1.. are sent to the parser.
                text = ' '.join('%s/%s'%(W[i],T[i]) for i in range(1,len(W)))
                print('input length = %d'%(len(W)))
                print('parsing %d best...'%(k))
                trees = self._dig_parser.parse_mira_tagged_text(text,Sc,k,format='naist')

                if not trees:
                    # Nothing to build constraints from (and trees[-1] would
                    # raise IndexError): keep the current weights for this
                    # sentence but still accumulate them.
                    print('no parse returned; weights left unchanged')
                    self.v = self.v + Wc
                    print('done\n')
                    continue

                # Pad to exactly k trees by repeating the last one (a
                # non-positive repeat count yields an empty list).
                trees.extend([trees[-1]]*(k-len(trees)))

                print('updating weight...')
                Wc = self._update_weight(Q,G,Wc,F,gold_deps,gold_features,trees)
                self.v = self.v + Wc
                print('done\n')

    def _loss_score(self,t1,t2):
        """Return, as a float, how many items of t2 do not occur in t1.

        Prints an error and exits the process with status 1 when the two
        sequences differ in length.
        """
        if len(t1) != len(t2):
            print('error: two trees mismatch')
            sys.exit(1)

        return 1.0*len(set(t2)-set(t1))

    def _update_weight(self,Q,G,Wc,F,gold_deps,gold_features,trees):
        """Solve the quadratic program for one sentence; return new weights.

        One inequality row is built per tree from the features that differ
        between the gold analysis and that tree, with the negated loss of
        the tree as right-hand side.  The objective is defined by Q and
        p = -2*Wc.
        NOTE(review): spmatrix/sparse/matrix/solvers look like CVXOPT, in
        which case this minimises ||w - Wc||^2 subject to G*w <= h -- confirm
        against the file's imports.

        Q, G          -- objective matrix and a (k x features) template; G
                         is zeroed into a fresh local matrix, the caller's
                         object is not modified.
        Wc            -- current weight column vector.
        F             -- edge feature dict of the sentence (see _create_F).
        gold_deps     -- sorted (head, child) pairs of the gold tree.
        gold_features -- summed features of the gold edges.
        trees         -- parser output; each tree is iterated as
                         (s, x, t) triples where only s and t are used.
        """
        constraints,L = [],[]
        print('creating constraints...')
        for i,tree in enumerate(trees):
            print('\tfor tree %d'%(i))
            deps = sorted((edge[0],edge[2]) for edge in tree)
            L.append(-self._loss_score(gold_deps,deps))

            # Collect the features of every edge of this tree.
            fc = []
            for s,x,t in tree:
                if s > t:
                    fc += F['%d-%d-L'%(t,s)]
                else:
                    fc += F['%d-%d-R'%(s,t)]

            # Cancel features shared with the gold analysis: what is left in
            # fg occurs only on the gold side, what is left in fc only in
            # this tree.
            fg = []
            for f in gold_features:
                if f in fc:
                    fc.remove(f)
                else:
                    fg.append(f)
            constraints.extend((fg.count(f),i,f) for f in fg)
            constraints.extend((-fc.count(f),i,f) for f in fc)

        print('preparing parameters...')
        G = sparse(G*0)
        for s,i,j in constraints:
            # Row i belongs to tree i, column j to the feature; the sign is
            # flipped so gold-only features enter negatively.
            G[(i,j)] = -s
        h = matrix(L)
        p = matrix(-2*Wc)
        sol = solvers.qp(Q,p,G,h)
        Wn = sparse(sol['x'])
        print(L)

        # Diagnostic output: gold score minus tree score under the new
        # weights, one line per tree.
        sg = gold_features.prod(Wn)
        for tree in trees:
            fc = Features()
            for s,x,t in tree:
                if s > t:
                    fc += F['%d-%d-L'%(t,s)]
                else:
                    fc += F['%d-%d-R'%(s,t)]
            sc = fc.prod(Wn)
            print('%s - %s = %s'%(sg,sc,sg-sc))

        return Wn