Ejemplo n.º 1
0
    def __init__(self, args):
        """Restore the saved state and model and set up the decoding backend.

        ``args`` must provide ``state`` (path of a pickled state dict),
        ``changes`` (text spliced into ``dict(...)`` and evaluated to
        override state entries), ``model_path`` and ``beam_search``.
        Exactly one of ``self.beam_search`` / ``self.sampler`` is set
        afterwards; the other stays ``None``.
        """
        self.args = args
        self.state = prototype_state()
        with open(self.args.state) as src:
            self.state.update(cPickle.load(src))
        # NOTE(review): eval() executes whatever is in args.changes; this is
        # only acceptable while that value comes from a trusted operator.
        self.state.update(eval("dict({})".format(self.args.changes)))

        logging.basicConfig(level=getattr(logging, self.state['level']),
                            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

        rng = numpy.random.RandomState(self.state['seed'])
        enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
        enc_dec.build()
        self.lm_model = enc_dec.create_lm_model()
        self.lm_model.load(self.args.model_path)
        # Context managers close the dictionary files right after loading
        # instead of leaving the handles to the garbage collector.
        with open(self.state['word_indx'], 'rb') as word_indx_file:
            self.indx_word = cPickle.load(word_indx_file)

        self.sampler = None
        self.beam_search = None
        if self.args.beam_search:
            self.beam_search = BeamSearch(enc_dec)
            self.beam_search.compile()
        else:
            self.sampler = enc_dec.create_sampler(many_samples=True)

        # NOTE(review): kept in text mode ('r') as before; pickles are
        # normally read in binary mode -- confirm how this file is written
        # before changing it.
        with open(self.state['indx_word'], 'r') as indx_word_file:
            self.idict_src = cPickle.load(indx_word_file)
Ejemplo n.º 2
0
    def __init__(self, args):
        """Restore the saved state and model and set up the decoding backend.

        ``args`` must provide ``state`` (path of a pickled state dict),
        ``changes`` (text spliced into ``dict(...)`` and evaluated to
        override state entries), ``model_path`` and ``beam_search``.
        Exactly one of ``self.beam_search`` / ``self.sampler`` is set
        afterwards; the other stays ``None``.
        """
        self.args = args
        self.state = prototype_state()
        with open(self.args.state) as src:
            self.state.update(cPickle.load(src))
        # NOTE(review): eval() executes whatever is in args.changes; this is
        # only acceptable while that value comes from a trusted operator.
        self.state.update(eval("dict({})".format(self.args.changes)))

        logging.basicConfig(level=getattr(logging, self.state['level']),
                            format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

        rng = numpy.random.RandomState(self.state['seed'])
        enc_dec = RNNEncoderDecoder(self.state, rng, skip_init=True)
        enc_dec.build()
        self.lm_model = enc_dec.create_lm_model()
        self.lm_model.load(self.args.model_path)
        # Context managers close the dictionary files right after loading
        # instead of leaving the handles to the garbage collector.
        with open(self.state['word_indx'], 'rb') as word_indx_file:
            self.indx_word = cPickle.load(word_indx_file)

        self.sampler = None
        self.beam_search = None
        if self.args.beam_search:
            self.beam_search = BeamSearch(enc_dec)
            self.beam_search.compile()
        else:
            self.sampler = enc_dec.create_sampler(many_samples=True)

        # NOTE(review): kept in text mode ('r') as before; pickles are
        # normally read in binary mode -- confirm how this file is written
        # before changing it.
        with open(self.state['indx_word'], 'r') as indx_word_file:
            self.idict_src = cPickle.load(indx_word_file)
Ejemplo n.º 3
0
def main():
    args = parse_args()

    state = prototype_phrase_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    server_address = ('', args.port)
    httpd = ThreadedHTTPServer(server_address, MTReqHandler)
    #httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    tokenizer_cmd = [os.getcwd() + '/tokenizer.perl', '-l', 'en', '-q', '-']
    detokenizer_cmd = [
        os.getcwd() + '/detokenizer.perl', '-l', 'fr', '-q', '-'
    ]
    sampler = Sampler(state,
                      lm_model,
                      indx_word,
                      idict_src,
                      beam_search=beam_search,
                      tokenizer_cmd=tokenizer_cmd,
                      detokenizer_cmd=detokenizer_cmd)
    httpd.sampler = sampler

    print 'Server starting..'
    httpd.serve_forever()
    '''
Ejemplo n.º 4
0
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state["level"]), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s"
    )

    server_address = ("", args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, MTReqHandler)

    rng = numpy.random.RandomState(state["seed"])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state["word_indx"], "rb"))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state["indx_word"], "r"))

    tokenizer_cmd = [os.getcwd() + "/tokenizer.perl", "-l", "en", "-q", "-"]
    detokenizer_cmd = [os.getcwd() + "/detokenizer.perl", "-l", "fr", "-q", "-"]
    sampler = Sampler(
        state,
        lm_model,
        indx_word,
        idict_src,
        beam_search=beam_search,
        tokenizer_cmd=tokenizer_cmd,
        detokenizer_cmd=detokenizer_cmd,
    )
    httpd.sampler = sampler

    print "Server starting.."
    httpd.serve_forever()

    """
Ejemplo n.º 5
0
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in
            trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            try:
                best = numpy.argmin(costs)
                print >>ftrans, trans[best]
                total_cost += costs[best]
            except:
                print >> ftrans, "FAIL"
            if args.verbose:
                print "Translation:", trans[best]
            if (i + 1)  % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)
Ejemplo n.º 6
0
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])

    ###########################################################
    # by He Wei
    #enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    ###########################################################

    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        #assert beam_search
        #assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        #n_samples = args.beam_size
        total_cost = 0.0
        #logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if args.verbose:
                print "Parsed Input:", parsed_in

            if args.beam_search:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 args.beam_size,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            else:
                trans, costs, _, aligns = sample(lm_model,
                                                 seq,
                                                 1,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)
            best = numpy.argmin(costs)
            out_str = trans[best]
            align_str = []

            if args.beam_search and args.alignment:
                for (idx, _a) in enumerate(aligns[best]):
                    align_str.append("[%s]" % ' '.join(map(str, _a)))
                    #align_str.append("[%d-%d:%f,%d-%d:%f]" % (idx, _a[0], _a[1], idx, _a[2], _a[3]))
                out_str += "\t" + ' '.join(align_str)

            if args.beam_search and args.nbest:
                nbest_trans = trans
                nbest_costs = costs
                nbest_trans = numpy.array(nbest_trans)[numpy.argsort(
                    nbest_costs)]
                nbest_costs = numpy.array(sorted(nbest_costs))
                nbest_str = ' ||| '.join(
                    "%s | %f" % (t, c)
                    for (t, c) in zip(nbest_trans, nbest_costs))
                out_str += "\t" + nbest_str

            print >> ftrans, out_str

            if args.verbose:
                print "[Translation]%s\t[Align]%s" % (trans[best],
                                                      ' '.join(align_str))
            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)
        print "Total used time: {}".format(time.time() - start_time)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
Ejemplo n.º 7
0
def main():
    args = parse_args()

    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state,
                                rng,
                                skip_init=True,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'], 'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'], 'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state,
                                         indx_word,
                                         seqin,
                                         idx2word=idict_src)
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(
                        lm_model,
                        seq,
                        n_samples,
                        sampler=sampler,
                        beam_search=beam_search,
                        ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(
                        lm_model,
                        seq,
                        n_samples,
                        sampler=sampler,
                        beam_search=beam_search,
                        ignore_unk=args.ignore_unk,
                        normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(lm_model,
                                                 seq,
                                                 n_samples,
                                                 sampler=sampler,
                                                 beam_search=beam_search,
                                                 ignore_unk=args.ignore_unk,
                                                 normalize=args.normalize)

            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >> ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape:  (target_len, source_len)
                # we reverse it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()

                if lm_model.maintain_coverage:
                    # since we filtered <eos> from trans[best], thus the index adds 1
                    coverage = coverages[best]
                    print "Coverage:",
                    words = parsed_in.split()
                    for k in xrange(len(words)):
                        print '%s/%.2f' % (words[k], coverage[k]),
                    print ''
                    if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                        print 'Fertility:  ',
                        for k in xrange(len(words)):
                            print '%s/%.2f' % (words[k], fertility[k]),
                        print ''
                print

            total_cost += costs[best]
            if (i + 1) % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".format(
                    (time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq, parsed_in = parse_input(state,
                                             indx_word,
                                             seqin,
                                             idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model,
                   seq,
                   n_samples,
                   sampler=sampler,
                   beam_search=beam_search,
                   ignore_unk=args.ignore_unk,
                   normalize=args.normalize,
                   alpha=alpha,
                   verbose=True)
Ejemplo n.º 8
0
def main():
    args = parse_args()

    state = prototype_search_with_coverage_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True, compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if lm_model.maintain_coverage:
                if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                    trans, aligns, costs, coverages, fertility, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                            beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
                else:
                    trans, aligns, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                            beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            else:
                trans, aligns, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            
            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]
                print "Aligns:"
                # aligns shape:  (target_len, source_len)
                # we reverse it to the shape (source_len, target_len) to show the matrix
                print numpy.array(aligns[best]).transpose().tolist()

                if lm_model.maintain_coverage:
                    # since we filtered <eos> from trans[best], thus the index adds 1
                    coverage = coverages[best]
                    print "Coverage:", 
                    words = parsed_in.split()
                    for k in xrange(len(words)):
                        print '%s/%.2f'%(words[k], coverage[k]),
                    print ''
                    if lm_model.use_linguistic_coverage and lm_model.use_fertility_model:
                        print 'Fertility:  ',
                        for k in xrange(len(words)):
                            print '%s/%.2f'%(words[k], fertility[k]),
                        print ''
                print 

            total_cost += costs[best]
            if (i + 1)  % 100 == 0:
                ftrans.flush()
                logger.debug("Current speed is {} per sentence".
                        format((time.time() - start_time) / (i + 1)))
        print "Total cost of the translations: {}".format(total_cost)

        fsrc.close()
        ftrans.close()
    else:
        while True:
            try:
                seqin = raw_input('Input Sequence: ')
                n_samples = int(raw_input('How many samples? '))
                alpha = None
                if not args.beam_search:
                    alpha = float(raw_input('Inverse Temperature? '))
                seq,parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
                print "Parsed Input:", parsed_in
            except Exception:
                print "Exception while parsing your input:"
                traceback.print_exc()
                continue

            sample(lm_model, seq, n_samples, sampler=sampler,
                    beam_search=beam_search,
                    ignore_unk=args.ignore_unk, normalize=args.normalize,
                    alpha=alpha, verbose=True)
Ejemplo n.º 9
0
def main():
    args = parse_args()

    state = prototype_state()
    with open(args.state) as src:
        state.update(cPickle.load(src))
    state.update(eval("dict({})".format(args.changes)))

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, skip_init=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()
    lm_model.load(args.model_path)
    indx_word = cPickle.load(open(state['word_indx'],'rb'))

    sampler = None
    beam_search = None
    if args.beam_search:
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
    else:
        sampler = enc_dec.create_sampler(many_samples=True)

    idict_src = cPickle.load(open(state['indx_word'],'r'))

    if args.source and args.trans:
        # Actually only beam search is currently supported here
        assert beam_search
        assert args.beam_size

        fsrc = open(args.source, 'r')
        ftrans = open(args.trans, 'w')

        start_time = time.time()

        n_samples = args.beam_size
        total_cost = 0.0
        logging.debug("Beam size: {}".format(n_samples))
        for i, line in enumerate(fsrc):
            seqin = line.strip()
            seq, parsed_in = parse_input(state, indx_word, seqin, idx2word=idict_src)
            if lm_model.maintain_coverage:
                trans, costs, coverages, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            else:
                trans, costs, _ = sample(lm_model, seq, n_samples, sampler=sampler,
                        beam_search=beam_search, ignore_unk=args.ignore_unk, normalize=args.normalize)
            
            if args.verbose:
                print "Parsed Input:", parsed_in

            if len(trans) == 0:
                trans = ['Failed']
                costs = [0.0]

            best = numpy.argmin(costs)
            print >>ftrans, trans[best]
            if args.verbose:
                print "Translation:", trans[best]

                if lm_model.maintain_coverage: