def __init__(self,
             context: mx.context.Context,
             inputs: str,
             references: str,
             model: str,
             max_input_len: int,
             beam_size: int = C.DEFAULT_BEAM_SIZE,
             limit: int = -1) -> None:
    """
    Reads the input/reference sentence pairs, optionally subsamples them, and writes the
    selected pairs into the model folder.

    :param context: Context object stored on the instance.
    :param inputs: Path to the file with input sentences (one per line).
    :param references: Path to the file with reference sentences (one per line).
    :param model: Model folder. The selected inputs and references are written into it.
    :param max_input_len: Maximum input length, stored on the instance.
    :param beam_size: Beam size, stored on the instance.
    :param limit: Number of sentence pairs to sample at random. Values <= 0 keep all pairs.
    :raises AssertionError: If inputs and references do not have the same number of lines.
    """
    self.context = context
    self.max_input_len = max_input_len
    self.beam_size = beam_size
    self.model = model

    with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
        input_sentences = inputs_fin.readlines()
        target_sentences = references_fin.readlines()

    # Raised explicitly instead of via `assert` so the check is not stripped under `python -O`;
    # the exception type is unchanged for callers.
    if len(input_sentences) != len(target_sentences):
        raise AssertionError("Number of sentence pairs do not match")

    if limit <= 0:
        limit = len(input_sentences)
    if limit < len(input_sentences):
        # Sample aligned pairs so that inputs and references stay in sync.
        sampled_pairs = random.sample(list(zip(input_sentences, target_sentences)), limit)
        self.input_sentences, self.target_sentences = zip(*sampled_pairs)
    else:
        self.input_sentences, self.target_sentences = input_sentences, target_sentences

    logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                max_input_len, beam_size, model, len(self.input_sentences))

    # Lines still carry their trailing newlines (readlines), so they are written back unchanged.
    with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
            smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
        trg_out.writelines(self.target_sentences)
        src_out.writelines(self.input_sentences)
def __init__(self,
             context: mx.context.Context,
             inputs: str,
             references: str,
             model: str,
             max_input_len: int,
             beam_size: int = C.DEFAULT_BEAM_SIZE,
             bucket_width_source: int = 10,
             bucket_width_target: int = 10,
             length_penalty_alpha: float = 1.0,
             length_penalty_beta: float = 0.0,
             softmax_temperature: Optional[float] = None,
             max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
             ensemble_mode: str = 'linear',
             sample_size: int = -1,
             random_seed: int = 42) -> None:
    """
    Stores the decoding settings, reads the input/reference sentence pairs, optionally
    subsamples them reproducibly, and writes the selected pairs into the model folder.

    :param context: Context object stored on the instance.
    :param inputs: Path to the file with input sentences (one per line).
    :param references: Path to the file with reference sentences (one per line).
    :param model: Model folder. The selected inputs and references are written into it.
    :param max_input_len: Maximum input length, stored on the instance.
    :param beam_size: Beam size, stored on the instance.
    :param bucket_width_source: Source bucket width, stored on the instance.
    :param bucket_width_target: Target bucket width, stored on the instance.
    :param length_penalty_alpha: Length penalty alpha, stored on the instance.
    :param length_penalty_beta: Length penalty beta, stored on the instance.
    :param softmax_temperature: Optional softmax temperature, stored on the instance.
    :param max_output_length_num_stds: Stored on the instance for later decoding.
    :param ensemble_mode: Ensemble mode, stored on the instance.
    :param sample_size: Number of sentence pairs to sample. Values <= 0 keep all pairs.
    :param random_seed: Seed of the private random number generator used for sampling.
    """
    self.context = context
    self.max_input_len = max_input_len
    self.max_output_length_num_stds = max_output_length_num_stds
    self.ensemble_mode = ensemble_mode
    self.beam_size = beam_size
    self.bucket_width_source = bucket_width_source
    self.bucket_width_target = bucket_width_target
    self.length_penalty_alpha = length_penalty_alpha
    self.length_penalty_beta = length_penalty_beta
    self.softmax_temperature = softmax_temperature
    self.model = model

    with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
        input_sentences = inputs_fin.readlines()
        target_sentences = references_fin.readlines()

    check_condition(len(input_sentences) == len(target_sentences), "Number of sentence pairs do not match")

    if sample_size <= 0:
        sample_size = len(input_sentences)
    if sample_size < len(input_sentences):
        # custom random number generator to guarantee the same samples across runs in order to be able to
        # compare metrics across independent runs
        random_gen = random.Random(random_seed)
        sampled_pairs = random_gen.sample(list(zip(input_sentences, target_sentences)), sample_size)
        self.input_sentences, self.target_sentences = zip(*sampled_pairs)
    else:
        self.input_sentences, self.target_sentences = input_sentences, target_sentences

    logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                max_input_len, beam_size, model, len(self.input_sentences))

    # Lines still carry their trailing newlines (readlines), so they are written back unchanged.
    with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
            smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
        trg_out.writelines(self.target_sentences)
        src_out.writelines(self.input_sentences)
def decode_and_evaluate(self, checkpoint: int) -> Dict[str, float]:
    """
    Translates the stored input sentences with the parameters of the given checkpoint,
    writes the translations into the model folder, and scores them against the references.

    :param checkpoint: Checkpoint to load parameters from.
    :return: Mapping of metric names to scores.
    """
    loaded = sockeye.inference.load_models(self.context,
                                           self.max_input_len,
                                           self.beam_size,
                                           [self.model],
                                           [checkpoint])
    translator = sockeye.inference.Translator(self.context, 'linear', *loaded)

    output_name = os.path.join(self.model, C.DECODE_OUT_NAME % checkpoint)
    translations = []
    with smart_open(output_name, 'w') as out_file:
        writer = sockeye.output_handler.StringOutputHandler(out_file)
        for index, sentence in enumerate(self.input_sentences):
            request = translator.make_input(index, sentence)
            result = translator.translate(request)
            writer.handle(request, result)
            translations.append(result.translation)

    logger.info("Checkpoint [%d] %d translations saved to '%s'", checkpoint, len(translations), output_name)

    # TODO(fhieber): eventually add more metrics (METEOR etc.)
    bleu = sockeye.bleu.corpus_bleu(translations, self.target_sentences)
    return {"bleu-val": bleu}
def read_lexicon(path: str, vocab_source: Dict[str, int], vocab_target: Dict[str, int]) -> np.ndarray:
    """
    Reads a tab-separated translation table (src, trg, logprob per line) into a dense
    array of lexical translation probabilities.

    Entries whose source word is not in vocab_source are skipped. Entries whose target
    word is not in vocab_target have their probability accumulated into p(unk|source_word).
    See Incorporating Discrete Translation Lexicons into Neural Machine Translation,
    Section 3.1 & Equation 5 (https://arxiv.org/pdf/1606.02006.pdf).

    :param path: Path to lexicon file.
    :param vocab_source: Source vocabulary.
    :param vocab_target: Target vocabulary.
    :return: Lexicon array. Shape: (vocab_source_size, vocab_target_size).
    """
    assert C.UNK_SYMBOL in vocab_source
    assert C.UNK_SYMBOL in vocab_target
    unk_source = vocab_source[C.UNK_SYMBOL]
    unk_target = vocab_target[C.UNK_SYMBOL]

    table = np.zeros((len(vocab_source), len(vocab_target)))
    num_entries = 0
    with smart_open(path) as lexicon_file:
        for raw_line in lexicon_file:
            source_word, target_word, logprob = raw_line.rstrip('\n').split("\t")
            prob = np.exp(float(logprob))

            row = vocab_source.get(source_word, unk_source)
            if row == unk_source:
                # Unknown source words carry no usable information.
                continue

            col = vocab_target.get(target_word, unk_target)
            if col != unk_target:
                table[row, col] = prob
            else:
                # Several unknown target words may share this cell, so accumulate.
                table[row, unk_target] += prob
            num_entries += 1

    logger.info("Loaded lexicon from '%s' with %d entries", path, num_entries)
    return table
def make_inputs(
        input_file: Optional[str],
        translator: inference.Translator,
        input_is_json: bool,
        input_factors: Optional[List[str]] = None
) -> Generator[inference.TranslatorInput, None, None]:
    """
    Generates TranslatorInput instances from input.

    If input_file is None, reads from stdin and no factor files may be given. In that case
    each non-JSON line must have the form ``surface<TAB>graph``; the surface part is passed
    on as a factored string (the original documentation states that factors are attached
    to each token, separated by '|').

    If input_file is not None, reads from input_file and the factor files in parallel. The
    total number of files must equal translator.num_source_factors.

    In JSON mode each line (of stdin or of input_file) is passed on as a JSON string.

    :param input_file: The source file (possibly None).
    :param translator: Translator that will translate each line of input.
    :param input_is_json: Whether the input is in json format.
    :param input_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if input_file is None:
        check_condition(input_factors is None,
                        "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if input_is_json:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=line)
            else:
                # GRN: only plain-text lines carry the tab-separated graph. Splitting here,
                # rather than before the JSON check, keeps JSON lines from failing on unpacking.
                surface, graph = line.split('\t')
                yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                                factored_string=surface,
                                                                graph=graph,
                                                                translator=translator)
    else:
        factor_files = [] if input_factors is None else input_factors
        input_paths = [input_file] + factor_files
        check_condition(translator.num_source_factors == len(input_paths),
                        "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                            translator.num_source_factors, len(input_paths)))
        with ExitStack() as exit_stack:
            streams = [exit_stack.enter_context(data_io.smart_open(path)) for path in input_paths]
            # zip stops at the shortest stream; each tuple holds one line per file.
            for sentence_id, parallel_lines in enumerate(zip(*streams), 1):
                if input_is_json:
                    yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                                json_string=parallel_lines[0])
                else:
                    yield inference.make_input_from_multiple_strings(sentence_id=sentence_id,
                                                                     strings=list(parallel_lines))
def __init__(self,
             context: mx.context.Context,
             inputs: str,
             references: str,
             model: str,
             max_input_len: int,
             beam_size: int = C.DEFAULT_BEAM_SIZE,
             limit: int = -1) -> None:
    """
    Reads the input/reference sentence pairs, optionally subsamples them reproducibly,
    and writes the selected pairs into the model folder.

    :param context: Context object stored on the instance.
    :param inputs: Path to the file with input sentences (one per line).
    :param references: Path to the file with reference sentences (one per line).
    :param model: Model folder. The selected inputs and references are written into it.
    :param max_input_len: Maximum input length, stored on the instance.
    :param beam_size: Beam size, stored on the instance.
    :param limit: Number of sentence pairs to sample. Values <= 0 keep all pairs.
    """
    self.context = context
    self.max_input_len = max_input_len
    self.beam_size = beam_size
    self.model = model

    with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
        input_sentences = inputs_fin.readlines()
        target_sentences = references_fin.readlines()

    check_condition(len(input_sentences) == len(target_sentences),
                    "Number of sentence pairs do not match")

    if limit <= 0:
        limit = len(input_sentences)
    if limit < len(input_sentences):
        # custom random number generator to guarantee the same samples across runs in order to be able to
        # compare metrics across independent runs
        random_gen = random.Random(42)
        sampled_pairs = random_gen.sample(list(zip(input_sentences, target_sentences)), limit)
        self.input_sentences, self.target_sentences = zip(*sampled_pairs)
    else:
        self.input_sentences, self.target_sentences = input_sentences, target_sentences

    logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                max_input_len, beam_size, model, len(self.input_sentences))

    # Lines still carry their trailing newlines (readlines), so they are written back unchanged.
    with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
            smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
        trg_out.writelines(self.target_sentences)
        src_out.writelines(self.input_sentences)
def build_from_paths(paths: List[str], num_words: int = 50000, min_count: int = 1) -> Dict[str, int]:
    """
    Builds a single vocabulary over several sentence-per-line files.

    Each sentence is a whitespace delimited list of tokens. Note that special symbols
    like the beginning of sentence (BOS) symbol will be added to the vocabulary.

    :param paths: List of paths to files with one sentence per line.
    :param num_words: Maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :return: Word-to-id mapping.
    """
    with ExitStack() as stack:
        logger.info("Building vocabulary from dataset(s): %s", paths)
        # All files are opened up front; the stack closes them once the vocabulary is built.
        handles = [stack.enter_context(smart_open(file_path)) for file_path in paths]
        all_sentences = chain.from_iterable(handles)
        vocab = build_vocab(all_sentences, num_words, min_count)
    return vocab
def build_from_path(path: str, num_words: int = 50000, min_count: int = 1) -> Dict[str, int]:
    """
    Builds a vocabulary from one sentence-per-line file.

    Each sentence is a whitespace delimited list of tokens. Note that special symbols
    like the beginning of sentence (BOS) symbol will be added to the vocabulary.

    :param path: Path to file with one sentence per line.
    :param num_words: Maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :return: Word-to-id mapping.
    """
    with smart_open(path) as sentences:
        logger.info("Building vocabulary from dataset: %s", path)
        vocab = build_vocab(sentences, num_words, min_count)
    return vocab
def decode_and_evaluate(self,
                        checkpoint: Optional[int] = None,
                        output_name: str = os.devnull,
                        speed_percentile: int = 99) -> Dict[str, float]:
    """
    Translates the stored input sentences with the parameters of the given checkpoint and
    reports BLEU against the references together with a per-sentence speed percentile.

    :param checkpoint: Checkpoint to load parameters from.
    :param output_name: Filename to write translations to. Defaults to /dev/null.
    :param speed_percentile: Percentile to compute for sec/sent. Default: p99.
    :return: Mapping of metric names to scores.
    """
    models, vocab_source, vocab_target = load_models(
        self.context,
        self.max_input_len,
        self.beam_size,
        [self.model],
        [checkpoint],
        softmax_temperature=self.softmax_temperature,
        max_output_length_num_stds=self.max_output_length_num_stds)
    length_penalty = LengthPenalty(self.length_penalty_alpha, self.length_penalty_beta)
    translator = Translator(self.context,
                            self.ensemble_mode,
                            self.bucket_width_source,
                            self.bucket_width_target,
                            length_penalty,
                            models,
                            vocab_source,
                            vocab_target)

    # One wall-clock measurement per input sentence.
    seconds_per_sentence = np.zeros((len(self.input_sentences),))
    translations = []
    with smart_open(output_name, 'w') as out_file:
        writer = sockeye.output_handler.StringOutputHandler(out_file)
        for index, sentence in enumerate(self.input_sentences):
            started = time.time()
            request = translator.make_input(index, sentence)
            result = translator.translate(request)
            writer.handle(request, result)
            # The measured span covers input creation, translation and output handling.
            seconds_per_sentence[index] = time.time() - started
            translations.append(result.translation)

    percentile_sec_per_sent = np.percentile(seconds_per_sentence, speed_percentile)

    # TODO(fhieber): eventually add more metrics (METEOR etc.)
    return {C.BLEU_VAL: sockeye.bleu.corpus_bleu(translations, self.target_sentences),
            C.SPEED_PCT % speed_percentile: percentile_sec_per_sent}