Ejemplo n.º 1
0
    def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
        """Start the forward aligner, reverse aligner and symmetrizer processes.

        Args:
            fwd_params: value passed to the forward fast_align via '-f'.
            fwd_err: passed to self.read_err to obtain the forward '-T' and
                '-m' values.
            rev_params: value passed to the reverse fast_align via '-f'.
            rev_err: passed to self.read_err to obtain the reverse '-T' and
                '-m' values.
        """
        # Anchor on the absolute path of this module: with a relative
        # __file__ the dirname chain collapses to '' and the binaries would
        # be looked up relative to the current working directory.
        cdec_root = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
        atools = os.path.join(cdec_root, 'utils', 'atools')

        # NOTE(review): read_err is defined elsewhere; its two results are
        # joined into the command line below, so they are presumably strings.
        (fwd_T, fwd_m) = self.read_err(fwd_err)
        (rev_T, rev_m) = self.read_err(rev_err)

        fwd_cmd = [
            fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f',
            fwd_params
        ]
        # The reverse aligner takes the same options plus '-r'.
        rev_cmd = [
            fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f',
            rev_params, '-r'
        ]
        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']

        logging.info('Executing: {}'.format(' '.join(fwd_cmd)))
        self.fwd_align = util.popen_io(fwd_cmd)

        logging.info('Executing: {}'.format(' '.join(rev_cmd)))
        self.rev_align = util.popen_io(rev_cmd)

        logging.info('Executing: {}'.format(' '.join(tools_cmd)))
        self.tools = util.popen_io(tools_cmd)
Ejemplo n.º 2
0
Archivo: rt.py Proyecto: pauldb89/cdec
    def __init__(self, configdir, tmpdir="/tmp", cache_size=5, norm=False):
        """Set up the shared tools and the empty per-context bookkeeping.

        Args:
            configdir: directory holding a.fwd_params, a.fwd_err,
                a.rev_params, a.rev_err and sa.ini.
            tmpdir: parent directory for the temporary work directory.
            cache_size: stored as self.cache_size.
            norm: when true, tokenizer and detokenizer processes are started.
        """
        # Command name -> (handler method, set of accepted argument counts)
        self.COMMANDS = {
            "TR": (self.translate, {1}),
            "LEARN": (self.learn, {2}),
            "SAVE": (self.save_state, {0, 1}),
            "LOAD": (self.load_state, {0, 1}),
            "DROP": (self.drop_ctx, {0}),
            "LIST": (self.list_ctx, {0}),
        }

        # cdec root is three directory levels above this module
        cdec_root = os.path.abspath(__file__)
        for _ in range(3):
            cdec_root = os.path.dirname(cdec_root)

        ### Shared by every context

        self.config = configdir
        # Scratch directory for generated files
        self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix="realtime.")
        logger.info("Using temp dir {}".format(self.tmp))

        # Optional text normalization
        self.norm = norm
        if self.norm:
            corpus_dir = os.path.join(cdec_root, "corpus")
            tokenizer_cmd = [os.path.join(corpus_dir, "tokenize-anything.sh"), "-u"]
            self.tokenizer = util.popen_io(tokenizer_cmd)
            self.tokenizer_lock = util.FIFOLock()
            detokenizer_cmd = [os.path.join(corpus_dir, "untok.pl")]
            self.detokenizer = util.popen_io(detokenizer_cmd)
            self.detokenizer_lock = util.FIFOLock()

        # Word aligner, configured from the four a.* files in configdir
        aligner_files = ("a.fwd_params", "a.fwd_err", "a.rev_params", "a.rev_err")
        aligner_paths = [os.path.join(configdir, name) for name in aligner_files]
        self.aligner = aligner.ForceAligner(*aligner_paths)

        # Grammar extractor: load sa.ini, adapt it, and write the result
        # into the scratch directory
        sa_ini = cdec.configobj.ConfigObj(os.path.join(configdir, "sa.ini"), unrepr=True)
        sa_ini.filename = os.path.join(self.tmp, "sa.ini")
        util.sa_ini_for_realtime(sa_ini, os.path.abspath(configdir))
        sa_ini.write()
        self.extractor = cdec.sa.GrammarExtractor(sa_ini.filename, online=True)
        self.cache_size = cache_size

        ### Per-context state

        self.ctx_names = set()
        # One lock per context, created on first access
        self.ctx_locks = collections.defaultdict(util.FIFOLock)
        # ctx -> list of (source, target, alignment)
        self.ctx_data = {}

        # The grammar extractor is not threadsafe, so it gets its own lock
        self.extractor_lock = util.FIFOLock()
        # ctx -> deque of file
        self.grammar_files = {}
        # ctx -> dict of {sentence: file}
        self.grammar_dict = {}

        self.decoders = {}
Ejemplo n.º 3
0
    def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
        """Set up the temp dir, normalizers, aligner, extractor and decoder.

        Args:
            configdir: directory holding a.fwd_params, a.fwd_err,
                a.rev_params, a.rev_err, sa.ini, cdec.ini and weights.final.
            tmpdir: parent directory for the temporary work directory.
            cache_size: stored as self.cache_size.
            norm: when true, tokenizer and detokenizer processes are started.
        """
        # Anchor on the absolute path of this module: with a relative
        # __file__ the dirname chain collapses to '' and the helper scripts
        # would be looked up relative to the current working directory.
        cdec_root = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        # Temporary work dir
        self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
        logging.info('Using temp dir {}'.format(self.tmp))

        # Normalization
        self.norm = norm
        if self.norm:
            self.tokenizer = util.popen_io([
                os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'
            ])
            self.detokenizer = util.popen_io(
                [os.path.join(cdec_root, 'corpus', 'untok.pl')])

        # Word aligner
        fwd_params = os.path.join(configdir, 'a.fwd_params')
        fwd_err = os.path.join(configdir, 'a.fwd_err')
        rev_params = os.path.join(configdir, 'a.rev_params')
        rev_err = os.path.join(configdir, 'a.rev_err')
        self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params,
                                            rev_err)

        # Grammar extractor: load sa.ini, adapt it, and write the adapted
        # copy into the temporary work dir
        sa_config = ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True)
        sa_config.filename = os.path.join(self.tmp, 'sa.ini')
        util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
        sa_config.write()
        self.extractor = cdec.sa.GrammarExtractor(sa_config.filename,
                                                  online=True)
        self.grammar_files = collections.deque()
        self.grammar_dict = {}
        self.cache_size = cache_size

        # HPYPLM reference stream
        ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
        os.mkfifo(ref_fifo_file)
        # NOTE(review): opened in 'w+' mode, presumably so that opening the
        # FIFO does not block waiting for a reader -- confirm.
        self.ref_fifo = open(ref_fifo_file, 'w+')
        # Start with empty line (do not learn prior to first input)
        self.ref_fifo.write('\n')
        self.ref_fifo.flush()

        # Decoder: read cdec.ini as a list of '='-separated, stripped fields.
        # The file is read inside a with block so its handle is closed
        # instead of being left to the garbage collector.
        with open(os.path.join(configdir, 'cdec.ini')) as cdec_ini:
            decoder_config = [[f.strip() for f in line.split('=')]
                              for line in cdec_ini]
        util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir),
                                   ref_fifo_file)
        # Write the adapted configuration into the temporary work dir
        decoder_config_file = os.path.join(self.tmp, 'cdec.ini')
        with open(decoder_config_file, 'w') as output:
            for (k, v) in decoder_config:
                output.write('{}={}\n'.format(k, v))
        decoder_weights = os.path.join(configdir, 'weights.final')
        self.decoder = decoder.MIRADecoder(decoder_config_file,
                                           decoder_weights)
Ejemplo n.º 4
0
 def __init__(self, config, weights):
     """Launch the cdec decoder binary as a child process.

     Args:
         config: value passed to the binary's '-c' option.
         weights: value passed to the binary's '-w' option.
     """
     # cdec root is three directory levels above this module
     here = os.path.abspath(__file__)
     cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
     decoder_cmd = [os.path.join(cdec_root, 'decoder', 'cdec')]
     decoder_cmd += ['-c', config, '-w', weights]
     logger.info('Executing: {}'.format(' '.join(decoder_cmd)))
     self.decoder = util.popen_io(decoder_cmd)
     # NOTE(review): presumably serializes access to the decoder process --
     # confirm against the methods that use it.
     self.lock = util.FIFOLock()
Ejemplo n.º 5
0
 def __init__(self, config, weights):
     """Start a cdec decoder child process and create its lock.

     Args:
         config: value passed to the binary's '-c' option.
         weights: value passed to the binary's '-w' option.
     """
     # Walk three directory levels up from this module to the cdec root
     cdec_root = os.path.abspath(__file__)
     for _ in range(3):
         cdec_root = os.path.dirname(cdec_root)
     binary = os.path.join(cdec_root, 'decoder', 'cdec')
     decoder_cmd = [binary, '-c', config, '-w', weights]
     command_line = ' '.join(decoder_cmd)
     logger.info('Executing: {}'.format(command_line))
     self.decoder = util.popen_io(decoder_cmd)
     # NOTE(review): presumably guards concurrent use of self.decoder --
     # confirm against callers.
     self.lock = util.FIFOLock()
Ejemplo n.º 6
0
 def __init__(self, config, weights, metric="ibm_bleu"):
     """Launch the kbest_cut_mira binary as a child process.

     Args:
         config: value passed to the binary's "-c" option.
         weights: value passed to the binary's "-w" option.
         metric: value passed to the binary's "-m" option.
     """
     # cdec root is three directory levels above this module
     here = os.path.abspath(__file__)
     cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
     mira = os.path.join(cdec_root, "training", "mira", "kbest_cut_mira")
     # Fixed settings: optimizer=2, step=0.001, best=500, k=500, uniq, stream
     fixed_opts = ["-o", "2", "-C", "0.001", "-b", "500", "-k", "500", "-u", "-t"]
     mira_cmd = [mira, "-c", config, "-w", weights] + fixed_opts + ["-m", metric]
     logger.info("Executing: {}".format(" ".join(mira_cmd)))
     self.decoder = util.popen_io(mira_cmd)
     # NOTE(review): presumably serializes access to the learner process --
     # confirm against callers.
     self.lock = util.FIFOLock()
Ejemplo n.º 7
0
 def __init__(self, config, weights):
     """Spawn the cdec decoder as a child process.

     Args:
         config: value passed to the binary's "-c" option.
         weights: value passed to the binary's "-w" option.
     """
     # Resolve <cdec root>/decoder/cdec; the root is three directory
     # levels above this module.
     module_path = os.path.abspath(__file__)
     realtime_dir = os.path.dirname(module_path)
     cdec_root = os.path.dirname(os.path.dirname(realtime_dir))
     decoder_cmd = [
         os.path.join(cdec_root, "decoder", "cdec"),
         "-c", config,
         "-w", weights,
     ]
     logger.info("Executing: {}".format(" ".join(decoder_cmd)))
     self.decoder = util.popen_io(decoder_cmd)
     # NOTE(review): presumably serializes access to self.decoder --
     # confirm against callers.
     self.lock = util.FIFOLock()
Ejemplo n.º 8
0
 def __init__(self, config, weights, metric='ibm_bleu'):
     """Start the kbest_cut_mira child process and create its lock.

     Args:
         config: value passed to the binary's '-c' option.
         weights: value passed to the binary's '-w' option.
         metric: value passed to the binary's '-m' option.
     """
     # Walk three directory levels up from this module to the cdec root
     cdec_root = os.path.abspath(__file__)
     for _ in range(3):
         cdec_root = os.path.dirname(cdec_root)
     mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
     mira_cmd = [
         mira,
         '-c', config,
         '-w', weights,
         '-o', '2',      # optimizer=2
         '-C', '0.001',  # step=0.001
         '-b', '500',    # best=500
         '-k', '500',    # k=500
         '-u',           # uniq
         '-t',           # stream
         '-m', metric,   # metric
     ]
     logger.info('Executing: {}'.format(' '.join(mira_cmd)))
     self.decoder = util.popen_io(mira_cmd)
     # NOTE(review): presumably serializes access to the learner process --
     # confirm against callers.
     self.lock = util.FIFOLock()
Ejemplo n.º 9
0
 def __init__(self, config, weights):
     """Launch the kbest_cut_mira binary as a child process.

     Args:
         config: value passed to the binary's '-c' option.
         weights: value passed to the binary's '-w' option.
     """
     # cdec root is three directory levels above this module
     here = os.path.abspath(__file__)
     cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
     mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
     # Fixed settings: optimizer=2, step=0.001, best=500, k=500, uniq, stream
     fixed_opts = ['-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
     mira_cmd = [mira, '-c', config, '-w', weights] + fixed_opts
     logger.info('Executing: {}'.format(' '.join(mira_cmd)))
     self.decoder = util.popen_io(mira_cmd)
     # NOTE(review): presumably serializes access to the learner process --
     # confirm against callers.
     self.lock = util.FIFOLock()
Ejemplo n.º 10
0
 def __init__(self, config, weights):
     """Launch the kbest_cut_mira binary as a child process.

     Args:
         config: value passed to the binary's '-c' option.
         weights: value passed to the binary's '-w' option.
     """
     # Anchor on the absolute path of this module: with a relative
     # __file__ the dirname chain collapses to '' and the binary would be
     # looked up relative to the current working directory.
     cdec_root = os.path.dirname(
         os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
     mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
     #                                              optimizer=2 step=0.001    best=500,    k=500,       uniq, stream
     mira_cmd = [
         mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b',
         '500', '-k', '500', '-u', '-t'
     ]
     logging.info('Executing: {}'.format(' '.join(mira_cmd)))
     self.decoder = util.popen_io(mira_cmd)
Ejemplo n.º 11
0
    def __init__(self,
                 fwd_params,
                 fwd_err,
                 rev_params,
                 rev_err,
                 heuristic='grow-diag-final-and'):
        """Start the forward aligner, reverse aligner and atools processes.

        Args:
            fwd_params: value passed to the forward fast_align via '-f'.
            fwd_err: passed to self.read_err to obtain the forward '-T' and
                '-m' values.
            rev_params: value passed to the reverse fast_align via '-f'.
            rev_err: passed to self.read_err to obtain the reverse '-T' and
                '-m' values.
            heuristic: value passed to atools via '-c'.
        """

        def launch(cmd):
            # Log the command line, then start the process.
            logger.info('Executing: {}'.format(' '.join(cmd)))
            return util.popen_io(cmd)

        # cdec root is three directory levels above this module
        here = os.path.abspath(__file__)
        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
        atools = os.path.join(cdec_root, 'utils', 'atools')

        fwd_T, fwd_m = self.read_err(fwd_err)
        rev_T, rev_m = self.read_err(rev_err)

        # Both directions share the leading options; the reverse adds '-r'
        common = [fast_align, '-i', '-', '-d']
        fwd_cmd = common + ['-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
        rev_cmd = common + ['-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic]

        self.fwd_align = launch(fwd_cmd)
        self.rev_align = launch(rev_cmd)
        self.tools = launch(tools_cmd)

        # Used to guarantee thread safety
        self.lock = util.FIFOLock()
Ejemplo n.º 12
0
    def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
        """Start the forward aligner, reverse aligner and atools processes.

        Args:
            fwd_params: value passed to the forward fast_align via '-f'.
            fwd_err: passed to self.read_err to obtain the forward '-T' and
                '-m' values.
            rev_params: value passed to the reverse fast_align via '-f'.
            rev_err: passed to self.read_err to obtain the reverse '-T' and
                '-m' values.
        """

        def launch(cmd):
            # Log the command line, then start the process.
            logger.info('Executing: {}'.format(' '.join(cmd)))
            return util.popen_io(cmd)

        # Walk three directory levels up from this module to the cdec root
        cdec_root = os.path.abspath(__file__)
        for _ in range(3):
            cdec_root = os.path.dirname(cdec_root)
        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
        atools = os.path.join(cdec_root, 'utils', 'atools')

        fwd_T, fwd_m = self.read_err(fwd_err)
        rev_T, rev_m = self.read_err(rev_err)

        # Both directions share the leading options; the reverse adds '-r'
        shared_opts = [fast_align, '-i', '-', '-d']
        fwd_cmd = shared_opts + ['-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
        rev_cmd = shared_opts + ['-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']

        self.fwd_align = launch(fwd_cmd)
        self.rev_align = launch(rev_cmd)
        self.tools = launch(tools_cmd)

        # Used to guarantee thread safety
        self.lock = util.FIFOLock()
Ejemplo n.º 13
0
    def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
        """Set up the shared tools and the empty per-context bookkeeping.

        Args:
            configdir: directory holding rt.ini, a.fwd_params, a.fwd_err,
                a.rev_params, a.rev_err and sa.ini.
            tmpdir: parent directory for the temporary work directory.
            cache_size: stored as self.cache_size.
            norm: when true, tokenizer and detokenizer processes are started.

        Raises:
            ValueError: if a non-blank rt.ini line contains no '='.
        """

        # name -> (method, set of possible nargs)
        self.COMMANDS = {
            'TR': (self.translate, set((1,))),
            'LEARN': (self.learn, set((2,))),
            'SAVE': (self.save_state, set((0, 1))),
            'LOAD': (self.load_state, set((0, 1))),
            'DROP': (self.drop_ctx, set((0,))),
            'LIST': (self.list_ctx, set((0,))),
        }

        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        # rt.ini options, one key=value pair per line.  The file is read
        # inside a with block so its handle is closed; blank lines are
        # skipped, and only the first '=' separates key from value so a
        # value may itself contain '='.
        ini = {}
        with open(os.path.join(configdir, 'rt.ini')) as ini_file:
            for line in ini_file:
                line = line.strip()
                if not line:
                    continue
                key, value = line.split('=', 1)
                ini[key] = value
        self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
        self.metric = ini.get('metric', 'ibm_bleu')

        ### Single instance for all contexts

        self.config = configdir
        # Temporary work dir
        self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
        logger.info('Using temp dir {}'.format(self.tmp))

        # Normalization
        self.norm = norm
        if self.norm:
            self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
            self.tokenizer_lock = util.FIFOLock()
            self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
            self.detokenizer_lock = util.FIFOLock()

        # Word aligner
        fwd_params = os.path.join(configdir, 'a.fwd_params')
        fwd_err = os.path.join(configdir, 'a.fwd_err')
        rev_params = os.path.join(configdir, 'a.rev_params')
        rev_err = os.path.join(configdir, 'a.rev_err')
        self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err)

        # Grammar extractor: load sa.ini, adapt it, and write the adapted
        # copy into the temporary work dir
        sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True)
        sa_config.filename = os.path.join(self.tmp, 'sa.ini')
        util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
        sa_config.write()
        self.extractor = ExtractorWrapper(sa_config.filename)
        self.cache_size = cache_size

        ### One instance per context

        self.ctx_names = set()
        # All context-dependent operations are atomic
        self.ctx_locks = collections.defaultdict(util.FIFOLock)
        # ctx -> list of (source, target, alignment)
        self.ctx_data = {}

        # Grammar extractor is not threadsafe
        self.extractor_lock = util.FIFOLock()
        # ctx -> deque of file
        self.grammar_files = {}
        # ctx -> dict of {sentence: file}
        self.grammar_dict = {}

        self.decoders = {}
Ejemplo n.º 14
0
    def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
        """Set up the shared tools and the empty per-context bookkeeping.

        Args:
            configdir: directory holding rt.ini, a.fwd_params, a.fwd_err,
                a.rev_params, a.rev_err and sa.ini.
            tmpdir: parent directory for the temporary work directory.
            cache_size: stored as self.cache_size.
            norm: when true, tokenizer and detokenizer processes are started.

        Raises:
            ValueError: if a non-blank rt.ini line contains no '='.
        """

        # name -> (method, set of possible nargs)
        self.COMMANDS = {
            'TR': (self.translate, set((1, ))),
            'LEARN': (self.learn, set((2, ))),
            'SAVE': (self.save_state, set((0, 1))),
            'LOAD': (self.load_state, set((0, 1))),
            'DROP': (self.drop_ctx, set((0, ))),
            'LIST': (self.list_ctx, set((0, ))),
        }

        cdec_root = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        # rt.ini options, one key=value pair per line.  The file is read
        # inside a with block so its handle is closed; blank lines are
        # skipped, and only the first '=' separates key from value so a
        # value may itself contain '='.
        ini = {}
        with open(os.path.join(configdir, 'rt.ini')) as ini_file:
            for line in ini_file:
                line = line.strip()
                if not line:
                    continue
                key, value = line.split('=', 1)
                ini[key] = value
        self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
        self.metric = ini.get('metric', 'ibm_bleu')

        ### Single instance for all contexts

        self.config = configdir
        # Temporary work dir
        self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
        logger.info('Using temp dir {}'.format(self.tmp))

        # Normalization
        self.norm = norm
        if self.norm:
            self.tokenizer = util.popen_io([
                os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'
            ])
            self.tokenizer_lock = util.FIFOLock()
            self.detokenizer = util.popen_io(
                [os.path.join(cdec_root, 'corpus', 'untok.pl')])
            self.detokenizer_lock = util.FIFOLock()

        # Word aligner
        fwd_params = os.path.join(configdir, 'a.fwd_params')
        fwd_err = os.path.join(configdir, 'a.fwd_err')
        rev_params = os.path.join(configdir, 'a.rev_params')
        rev_err = os.path.join(configdir, 'a.rev_err')
        self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params,
                                            rev_err)

        # Grammar extractor: load sa.ini, adapt it, and write the adapted
        # copy into the temporary work dir
        sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, 'sa.ini'),
                                             unrepr=True)
        sa_config.filename = os.path.join(self.tmp, 'sa.ini')
        util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
        sa_config.write()
        self.extractor = ExtractorWrapper(sa_config.filename)
        self.cache_size = cache_size

        ### One instance per context

        self.ctx_names = set()
        # All context-dependent operations are atomic
        self.ctx_locks = collections.defaultdict(util.FIFOLock)
        # ctx -> list of (source, target, alignment)
        self.ctx_data = {}

        # Grammar extractor is not threadsafe
        self.extractor_lock = util.FIFOLock()
        # ctx -> deque of file
        self.grammar_files = {}
        # ctx -> dict of {sentence: file}
        self.grammar_dict = {}

        self.decoders = {}