    def __init__(self, tgt_vocab, args, tgt_w2v):
        super(GRUDecoder, self).__init__()
        with self.init_scope():
            self.word2embedding = chainLinks.EmbedID(tgt_vocab.size,
                                                     args.edim,
                                                     ignore_label=-1)
            self.gru = BahadanauGRU(args.edim, args.nhid)
            self.U_o = chainLinks.Linear(args.nhid, args.nhid)
            self.V_o = chainLinks.Linear(args.edim, args.nhid)
            self.C_o = chainLinks.Linear(2 * args.nhid, args.nhid)
            self.W_o = chainLinks.Linear(args.nhid // 2, tgt_vocab.size)
            self.attention = additiveAttention(args.nhid)

        if tgt_w2v is not None:
            for i in range(tgt_vocab.size):
                word = tgt_vocab.id2word[i]
                if word in tgt_w2v:
                    self.word2embedding.W.data[i] = tgt_w2v[word]
        self.vocab_size = tgt_vocab.size
        self.embedding_size = args.edim
        self.hidden_size = args.nhid
        self.gen_limit = args.genlimit
        if args.useDropout:
            self.use_dropout = args.useDropout
            self.dropoutr = args.dlr
        else:
            self.use_dropout = None
            self.dropoutr = 0
            util.trace('{}'.format(chainer.global_config.__dict__))
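
For orientation, a minimal sketch (not the author's forward code) of how the four projections above can combine in a Bahdanau-style deep-output step; the attention and BahadanauGRU call signatures are assumptions, and the maxout pooling is what explains why W_o takes an input of size nhid // 2.

import chainer.functions as F

def decoder_step_sketch(decoder, prev_word_ids, prev_hidden, enc_states):
    # Hypothetical single decoding step for a GRUDecoder instance as defined above.
    emb = decoder.word2embedding(prev_word_ids)              # (batch, edim)
    context = decoder.attention(enc_states, prev_hidden)     # (batch, 2 * nhid); assumed signature
    hidden = decoder.gru(emb, prev_hidden, context)          # assumed BahadanauGRU signature
    # Deep output: sum the three projections, then maxout with pool size 2.
    t_tilde = decoder.U_o(hidden) + decoder.V_o(emb) + decoder.C_o(context)
    t = F.maxout(t_tilde, pool_size=2)                       # (batch, nhid // 2)
    return decoder.W_o(t), hidden                            # logits over the target vocabulary
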
    def __init__(self, src_vocab, args, src_w2v):
        super(biGRU_encoder, self).__init__()
        with self.init_scope():
            self.word2embed = chainLinks.EmbedID(src_vocab.size,
                                                 args.edim,
                                                 ignore_label=-1)
            self.embed2hiddenf = chainLinks.GRU(args.edim, args.nhid)
            self.embed2hiddenb = chainLinks.GRU(args.edim, args.nhid)

        # embedding weights are initialized from word2vec.
        if src_w2v is not None:
            for i in range(src_vocab.size):
                word = src_vocab.id2word[i]
                if word in src_w2v:
                    self.word2embed.W.data[i] = src_w2v[word]
        self.vocab_size = src_vocab.size
        self.embedding_size = args.edim
        self.hidden_size = args.nhid
        if args.useDropout:
            self.use_dropout = args.useDropout
            self.dropoutr = args.dlr
        else:
            self.use_dropout = None
            self.dropoutr = 0
            util.trace('{}'.format(chainer.global_config.__dict__))
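
A minimal sketch (not the author's code) of how this encoder can produce the 2 * nhid states that the decoder's C_o above expects: run one GRU left-to-right and the other right-to-left, then concatenate the two hidden states at each position. It assumes the stateful chainer.links.GRU interface; the function name and argument layout are hypothetical.

import chainer.functions as F

def encode_sketch(encoder, word_ids_seq):
    # word_ids_seq: a list of (batch,) int arrays, one per source position.
    encoder.embed2hiddenf.reset_state()
    encoder.embed2hiddenb.reset_state()
    embs = [encoder.word2embed(ids) for ids in word_ids_seq]
    forward = [encoder.embed2hiddenf(e) for e in embs]
    backward = [encoder.embed2hiddenb(e) for e in reversed(embs)]
    backward.reverse()
    # One 2 * nhid state per source position: [forward; backward].
    return [F.concat((f, b), axis=1) for f, b in zip(forward, backward)]
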
Example n. 3
def make_matrix(co_occur, word_occur, context_occur, model, th=1):
    """
    this function is making the matrix expresses word-graph.
    word_name: word to id
    num_pairs: sum of the all co-occurrence
    feature_list: a list the elemint of which is dict (key; co-occurrence, value: occurrence or pmi).
    co_word_dict: key is a word, values are the co-occurrence words around the word.
    word_dict: key is the co-occurrence word, value is occurrence (Freq) or PPMI value (PPMI).
    """
    vectorizer = sklearn.feature_extraction.DictVectorizer()
    word_name = dict()
    feature_list = list()
    co_word_dict = collections.defaultdict(list)
    num_pairs = 0

    for value in co_occur.values():
        num_pairs += value

    for word in co_occur.keys():
        w, c = word.split('\t')

        # if the co-occurrence count is at or below the threshold,
        # the corresponding matrix element is set to 0.
        if co_occur[word] <= th:
            c = c+'@@@@@None'
        co_word_dict[w].append(c)

    counter = 0

    # calculation of the matrix elements
    for word, co_words in sorted(co_word_dict.items()):
        if counter % 10000 == 0:
            util.trace('add word num :'+str(counter))
        
        word_name[word] = counter
        word_dict = dict()

        for c in co_words:
            if c.find('@@@@@None') != -1:
                word_dict[c.rsplit('@@@@@None')[0]] = 0
            else:
                if model == 'Freq':
                    word_dict[c] = co_occur['{}\t{}'.format(word, c)]
                elif model == 'PPMI':
                    pmi_value = returnPmi(word, c, co_occur, word_occur, context_occur, num_pairs)
                    pmi_value += np.log2(co_occur['{}\t{}'.format(word, c)])
                    
                    word_dict[c] = max(0, pmi_value)
        
        feature_list.append(word_dict)
        counter += 1
    
    matrix = (word_name, vectorizer.fit_transform(feature_list))

    return matrix, vectorizer
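
The returnPmi helper called above is not part of this excerpt; below is a minimal sketch consistent with its call signature, assuming the standard definition PMI(w, c) = log2(count(w, c) * N / (count(w) * count(c))) with N = num_pairs.

import numpy as np

def return_pmi_sketch(word, context, co_occur, word_occur, context_occur, num_pairs):
    # PMI(w, c) = log2( P(w, c) / (P(w) * P(c)) )
    #           = log2( count(w, c) * N / (count(w) * count(c)) )
    joint = co_occur['{}\t{}'.format(word, context)]
    return np.log2(joint * num_pairs / (word_occur[word] * context_occur[context]))
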
Example n. 4
    def __init__(self, src_vocab, tgt_vocab, args):
        super().__init__()
        with self.init_scope():
            self.word2embed = chainLinks.EmbedID(src_vocab.size,
                                                 args.nhid,
                                                 ignore_label=-1)
            # W_1, U
            self.embed2hidden = chainLinks.GRU(args.nhid, args.nhid)
            # W_2
            self.W_2 = chainLinks.Linear(args.nhid, tgt_vocab.size)

        self.vocab_size = src_vocab.size
        self.hidden_size = args.nhid
        if args.useDropout:
            self.use_dropout = args.useDropout
            self.dropoutr = args.dlr
        else:
            self.use_dropout = None
            self.dropoutr = 0
            util.trace('{}'.format(chainer.global_config.__dict__))
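
For context, a minimal sketch (not the author's code) of a forward step such a model could perform, assuming the stateful chainer.links.GRU interface and -1 as the padding id.

import chainer.functions as F

def lm_step_sketch(model, word_ids, target_ids):
    # word_ids, target_ids: (batch,) int arrays; -1 marks padding (ignore_label).
    emb = model.word2embed(word_ids)
    if model.use_dropout:
        emb = F.dropout(emb, ratio=model.dropoutr)
    hidden = model.embed2hidden(emb)     # the stateful GRU keeps its own hidden state
    logits = model.W_2(hidden)
    return F.softmax_cross_entropy(logits, target_ids, ignore_label=-1)
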
Example n. 5
def extract_context(input_file, window_size):
    """
    this function is counting the co-occurrence in the words at the input file.
    co_occur: one word to co-occurrence word
    word_occur: occur word (as token)
    context_occur: co-occurrence word
    This function is a very naive implementation; I want to devise a little more.
    """

    co_occur = collections.defaultdict(int)
    word_occur = collections.defaultdict(int)
    context_occur = collections.defaultdict(int)

    i_f = open(input_file)
    
    for num, line in enumerate(i_f):
        if num % 10000 == 0:
            util.trace('look at sent: '+str(num))
        words = line.strip().split()

        for i in range(len(words)):
            # left context word
            for j in range(1, window_size+1):
                if i - j > -1:
                    co_occur['{}\t{}'.format(words[i], words[i-j])] += 1
                    word_occur[words[i]] += 1
                    context_occur[words[i-j]] += 1
                else:
                    continue
            
            # right context word
            for j in range(1, window_size+1):
                if i + j < len(words):
                    co_occur['{}\t{}'.format(words[i], words[i+j])] += 1
                    word_occur[words[i]] += 1
                    context_occur[words[i+j]] += 1
                else:
                    continue
    i_f.close()
    
    return co_occur, word_occur, context_occur
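
As a concrete example of the three returned dictionaries: for a file whose single line is "a b c" and window_size=1 (the file name is hypothetical), the counts come out as below. Note that word_occur is incremented once per co-occurrence pair, not once per token.

co_occur, word_occur, context_occur = extract_context('toy.txt', 1)
# co_occur      == {'a\tb': 1, 'b\ta': 1, 'b\tc': 1, 'c\tb': 1}
# word_occur    == {'a': 1, 'b': 2, 'c': 1}
# context_occur == {'a': 1, 'b': 2, 'c': 1}
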
def test(args):
    src_word2vec = None
    tgt_word2vec = None
    util.trace('start testing ...')
    corpus_file = args.vocabdir + '/test.' + args.sourcelang
    output_file = args.savedir + '/{}.generate.{}-{}'.format(
        args.name, args.sourcelang, args.targetlang)
    args.name = args.datadir + '/{}.{:03d}'.format(args.name, args.epochNum)

    chainer.global_config.train = False
    #chainer.global_config.debug = True
    util.trace('chainer config: {}'.format(chainer.global_config.__dict__))

    util.trace('load vocab...')
    vocab_file = args.vocabdir + '/vocabulary.'
    source_vocab = util.Vocabulary.load(vocab_file + args.sourcelang)
    target_vocab = util.Vocabulary.load(vocab_file + args.targetlang)

    util.trace('Loading Model ...')

    NMTmodel = nmt_model.BahdanauNMT(source_vocab, target_vocab, args,
                                     src_word2vec, tgt_word2vec)

    if args.useGPU >= 0:
        import cupy as xp
        chainer.cuda.check_cuda_available()
        chainer.cuda.get_device(args.useGPU).use()
        NMTmodel.to_gpu()
        util.trace('use GPU id: {}'.format(args.useGPU))
    else:
        import numpy as xp
        args.useGPU = -1
        util.trace('without GPU')

    chainer.serializers.load_npz('{}.weights'.format(args.name), NMTmodel)

    util.trace('Generating translation ...')
    finished = 0

    with open(output_file, 'w') as o_f:
        for src_sent in util.monoBatch(corpus_file, source_vocab, args):
            util.trace('Sample {} ...'.format(finished + 1))
            prds = NMTmodel.generate(src_sent)
            for predict in util.convert_b2w(prds, target_vocab):
                o_f.write('{}\n'.format(predict))
                finished += 1

def main():
    parser = argparse.ArgumentParser(description='arg description',
                                     epilog='end')
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-datadir',
                        help='saved the weight file in this folder',
                        default='')
    parser.add_argument('-vocabdir',
                        help='saved the vocab file in this folder',
                        default='')
    parser.add_argument('-savedir',
                        help='save the output sentence in this folder',
                        default='')
    parser.add_argument('-useGPU', type=int, default=-1)
    parser.add_argument('-epochNum',
                        type=int,
                        help='point at the model you want to use')
    parser.add_argument('-genlimit', type=int, help='generation limit')
    parser.add_argument('-name', default='sample', help='model name')
    parser.add_argument('-edim',
                        default=512,
                        type=int,
                        help='embedding size for model')
    parser.add_argument('-nhid',
                        default=1024,
                        type=int,
                        help='hidden size for model')
    parser.add_argument('-useDropout', action='store_true')
    args = parser.parse_args()
    test(args)
    util.trace('finish generation')
Example n. 8
def main():
    parser = argparse.ArgumentParser(usage='sorry, look at readme...', \
            description='arg description', epilog='end')
    parser.add_argument('inputF',
                        help='write the file name of the input text.')
    parser.add_argument('-model',
                        help='select Freq or PPMI.',
                        default='PPMI',
                        choices=['Freq', 'PPMI'])
    parser.add_argument('-outF',
                        help='write the output file name.',
                        default='sample')
    parser.add_argument('-window',
                        help='define the window size.',
                        type=int,
                        default=2)
    parser.add_argument('-iter',
                        help='the number of HITS iteration.',
                        type=int,
                        default=300)
    parser.add_argument('-vocabSize',
                        help='define the vocabulary size. default is all.',
                        type=int,
                        default=None)
    args = parser.parse_args()

    # counting co-occurrence
    util.trace('count the co-occurrence')
    co_occur, word_occur, context_occur = word_graph.extract_context(
        args.inputF, args.window)

    util.trace('vocabulary size of the input data is {}.'.format(
        len(word_occur)))
    if args.vocabSize:
        vocabSize = args.vocabSize
    else:
        vocabSize = len(word_occur)

    # calculate matrix
    util.trace('make matrix (word-graph)')
    matrix, vec = word_graph.make_matrix(co_occur, word_occur, context_occur,
                                         args.model)

    # save data (matrix)
    util.trace('save the matrix')
    util.save_data(matrix,
                   args.outF + '/pmi_matrix_{}.pickle'.format(args.model))
    util.save_data(vec,
                   args.outF + '/pmi_vectorizer_{}.pickle'.format(args.model))

    # get the initial vector
    HITS_obj = hits.HITS(matrix)

    # the matrix is symmetric, so the authority score equals the hub score.
    util.trace('start HITS')
    i = HITS_obj.startHITS(args.iter).toarray()
    util.trace('finish HITS')

    # write the words ranked by HITS
    util.trace('write the vocabulary')
    util.writeVocab(HITS_obj, i, vocabSize, args.outF + '/vocab_file.hits')

    util.trace('finish program')
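
The hits.HITS class used above is not part of this excerpt. Since the word graph is symmetric (so authority and hub scores coincide), here is a minimal power-iteration sketch of what startHITS presumably computes; the function and argument names are hypothetical.

import numpy as np

def hits_scores_sketch(adjacency, num_iter=300):
    # adjacency: the (symmetric) word-graph matrix, dense or scipy.sparse.
    n = adjacency.shape[0]
    authority = np.ones((n, 1)) / n
    for _ in range(num_iter):
        hub = adjacency @ authority            # hub update
        hub = hub / np.linalg.norm(hub)
        authority = adjacency.T @ hub          # authority update
        authority = authority / np.linalg.norm(authority)
    return authority
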
Example n. 9
def train(args):
    start_epoch = 0
    corpus_file = args.datadir+'/train.'

    util.trace('start training...')

    chainer.global_config.train = True
    chainer.global_config.use_cudnn = 'always'
    chainer.global_config.type_check = True
    util.trace('chainer config: {}'.format(chainer.global_config.__dict__))

    util.trace('load vocab...')
    vocab_file = args.datadir+'/vocabulary.'
    source_vocab = util.Vocabulary.load(vocab_file+args.sourcelang)
    target_vocab = util.Vocabulary.load(vocab_file+args.targetlang)
    """
    util.trace('make vocab...')
    source_vocab = util.Vocabulary.make(corpus_file+args.sourcelang, 3000)
    target_vocab = util.Vocabulary.make(corpus_file+args.targetlang, 3000)
    """

    if args.gensim_mode == 'make':
        util.trace('making word2vec...')
        src_word2vec = util.make_word2vec(corpus_file+args.sourcelang, args.edim)
        tgt_word2vec = util.make_word2vec(corpus_file+args.targetlang, args.edim)
        util.save(src_word2vec, args.datadir+'/src_word2vec.'+args.sourcelang)
        util.save(tgt_word2vec, args.datadir+'/tgt_word2vec.'+args.targetlang)
    elif args.gensim_mode == 'load':
        util.trace('loading word2vec...')
        src_word2vec = util.load_word2vec(args.datadir+'/src_word2vec.'+args.sourcelang)
        tgt_word2vec = util.load_word2vec(args.datadir+'/tgt_word2vec.'+args.targetlang)
    elif args.gensim_mode == 'not':
        util.trace('do not use word2vec')
        src_word2vec = None
        tgt_word2vec = None
    
    util.trace('making model...')
    #initialize model
    NMTmodel = nmt_model.BahdanauNMT(source_vocab, target_vocab, args, src_word2vec, tgt_word2vec)

    if args.gpunum >= 0:
        import cupy as xp
        chainer.cuda.check_cuda_available() 
        chainer.cuda.get_device(args.gpunum).use()
        NMTmodel.to_gpu()
        util.trace('use GPU id: {}'.format(args.gpunum))
    else:
        import numpy as xp
        args.gpunum = -1
        util.trace('without GPU')
    
    util.trace('random seed: {}'.format(args.seed_num))
    np.random.seed(args.seed_num)
    xp.random.seed(args.seed_num)
    random.seed(args.seed_num)

    # NOTE: args.optim is currently ignored; the optimizer is hard-coded to AdaGrad.
    optim = chainer.optimizers.AdaGrad(lr=args.lr)
    optim.setup(NMTmodel)
    optim.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))

    for epoch in range(start_epoch, args.epoch):
        util.trace('Epoch {}/{}'.format(epoch+1, args.epoch))
        accum_loss = 0.0
        num_sent = 0
        for batch_src, batch_tgt in util.miniBatch(corpus_file+args.sourcelang, corpus_file+args.targetlang,\
                                    source_vocab, target_vocab, args.batch, args.pooling):
            NMTmodel.zerograds()
            loss, batch_hyp = NMTmodel(batch_src, batch_tgt)
            accum_loss += loss.data
            loss.backward()
            optim.update()

            for src, tgt, hyp in zip(util.convert_b2w(batch_src, source_vocab), util.convert_b2w(batch_tgt, target_vocab), \
                util.convert_b2w(batch_hyp, target_vocab)):
                util.trace('Epoch {}/{}, {} sent'.format(epoch+1, args.epoch, num_sent+1))
                util.trace('src: {}'.format(src))
                util.trace('tgt: {}'.format(tgt))
                util.trace('hyp: {}'.format(hyp))
                num_sent += 1
        util.trace('accum_loss: {}'.format(accum_loss))
        util.trace('Save model ...')
        model_name = '{}.{:03d}'.format(args.name, epoch+1)
        chainer.serializers.save_npz(args.savedir+'/{}.weights'.format(model_name), NMTmodel)
        chainer.serializers.save_npz(args.savedir+'/{}.optimizer'.format(model_name), optim)
Example n. 10
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-datadir', help='data directory to use corpus and vocab', default='')
    parser.add_argument('-savedir', help='save directory for weight', default='')
    #parser.add_argument('-model', help='model for neural MT', default='bahdanau')
    parser.add_argument('-edim', help='embedding size for model', type=int, default=512)
    parser.add_argument('-nhid', help='hidden size for model', type=int, default=512)
    parser.add_argument('-gensim_mode', help='use gensim for embedding, make, load, or not?', default='not', choices=['make', 'load', 'not'])
    #parser.add_argument('-gensimfileS', help='gensim file for source'. default='')
    #parser.add_argument('-gensimfileT', help='gensim file for target'. default='')
    #parser.add_argument('-nlayer', help='hidden layer for model, attention: 1layer using gensim is 2layer without gensim'. type=int, default=2)
    parser.add_argument('-optim', help='select optimizer', default='AdaGrad')
    parser.add_argument('-lr', help='learning rate for optimizer', type=float, default=0.01)
    parser.add_argument('-gpunum', help='GPU number (negative value is using CPU)', type=int, default=-1)
    parser.add_argument('-epoch', help='max epoch during training', type=int, default=50)
    parser.add_argument('-useDropout', help='use dropout or not', action='store_true')
    parser.add_argument('-dlr', help='dropout rate', type=float, default=0.2)
    parser.add_argument('-batch', help='batch size', type=int, default=100)
    parser.add_argument('-pooling', help='pooling size', type=int, default=100)
    parser.add_argument('-genlimit', help='generation limit', type=int, default=60)
    #parser.add_argument('-useBeam', help='use beamsearch or not?', action='store_true')
    #parser.add_argument('-beamsize', help='beam size', type=int, default=2)
    parser.add_argument('-grad_clip', help='gradient cliping', type=float, default=5.0)
    parser.add_argument('-useSeed', help='use random seed or not?', action='store_true')
    parser.add_argument('-seed_num', help='random seed number', type=int, default=2434)
    parser.add_argument('-name', help='model name, default is "sample"', default='sample') 
    args = parser.parse_args()
    
    train(args)
    util.trace('finish training!')
Example n. 11
    tgt_vocab = util.Vocabulary.make(args.savedir+'/train.'+args.targetlang, args.tgtvocab_size)
    #tgt_vocab = util.Vocabulary.make(tgt_corpus, args.tgtvocab_size)
    src_vocab_file = args.savedir+'/vocabulary.'+args.sourcelang
    tgt_vocab_file = args.savedir+'/vocabulary.'+args.targetlang
    src_vocab.save(src_vocab_file)
    tgt_vocab.save(tgt_vocab_file)
    # word2vec
    if args.word2vec:
        # default: 5 workers
        src_w2v = util.make_word2vec(args.savedir+'/train.'+args.sourcelang, args.edim)
        tgt_w2v = util.make_word2vec(args.savedir+'/train.'+args.targetlang, args.edim)
        util.save_word2vec(src_w2v, args.savedir+'/src_word2vec.'+args.sourcelang)
        util.save_word2vec(tgt_w2v, args.savedir+'/tgt_word2vec.'+args.targetlang)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(usage='sorry, see the readme.', description='arg description', epilog='end')
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-srcfile', help='source corpus file path')
    parser.add_argument('-tgtfile', help='target corpus file path')
    parser.add_argument('-savedir', help='save directory for training corpus')
    parser.add_argument('-word2vec', help='use word2vec or not', action='store_true')
    parser.add_argument('-edim', help='embedding size', type=int)
    parser.add_argument('-srcvocab_size', help='vocabulary size for src', type=int)
    parser.add_argument('-tgtvocab_size', help='vocabulary size for tgt', type=int)
    #parser.add_argument('-vocab_thred', help='under threthold, not use word for vocabulary', type=int)
    args = parser.parse_args()
    
    preprocess(args)
    util.trace('finish preprocess')