def score_line(self, line, vocabulary):
    """Scores a line of text.

    Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``)
    will be inserted at the beginning and the end of the line, if
    they're missing. If the line is empty, ``None`` will be returned,
    instead of interpreting it as the empty sentence ``<s> </s>``.

    :type line: str
    :param line: a sequence of words

    :type vocabulary: Vocabulary
    :param vocabulary: vocabulary for converting the words to word IDs

    :rtype: float
    :returns: log probability of the word sequence, or None if the line
              is empty
    """
    tokens = utterance_from_line(line)
    if not tokens:
        # An empty line is skipped rather than scored as "<s> </s>".
        return None

    word_ids = vocabulary.words_to_ids(tokens)

    # Keep running totals for perplexity / OOV statistics.
    self.num_words += word_ids.size
    self.num_unks += numpy.count_nonzero(
        word_ids == vocabulary.word_to_id['<unk>'])

    class_ids = []
    membership_probs = []
    for word_id in word_ids:
        class_ids.append(vocabulary.word_id_to_class_id[word_id])
        membership_probs.append(vocabulary.get_word_prob(word_id))

    return self.score_sequence(word_ids, class_ids, membership_probs)
def __init__(self, input_files, vocabulary=None, count_type='int32'):
    """Reads word statistics from corpus files and creates the
    ``unigram_counts`` and ``bigram_counts`` attributes.

    Leaves the input files pointing to the beginning of the file.

    :type input_files: list of file or mmap objects
    :param input_files: input text files

    :type vocabulary: Vocabulary
    :param vocabulary: restrict to these words; words outside the
                       vocabulary are counted as ``<unk>``

    :type count_type: str
    :param count_type: NumPy dtype used for the count arrays
    """
    if vocabulary is None:
        # The parameter has a default value only for signature
        # compatibility; the counts cannot be computed without a
        # vocabulary, so fail with a clear message instead of an
        # AttributeError below.
        raise ValueError("A vocabulary is needed for computing word "
                         "statistics.")

    vocabulary_size = vocabulary.num_words()
    unk_id = vocabulary.word_to_id['<unk>']
    self.unigram_counts = numpy.zeros(vocabulary_size, count_type)
    # A sparse matrix keeps memory bounded for large vocabularies;
    # dok_matrix supports efficient incremental updates.
    self.bigram_counts = dok_matrix((vocabulary_size, vocabulary_size),
                                    dtype=count_type)

    for subset_file in input_files:
        for line in subset_file:
            # Map each word to its ID, replacing out-of-vocabulary
            # words with the ID of <unk>.
            sequence = [vocabulary.word_to_id[word]
                        if word in vocabulary else unk_id
                        for word in utterance_from_line(line)]
            for word_id in sequence:
                self.unigram_counts[word_id] += 1
            for left_word_id, right_word_id in zip(sequence[:-1],
                                                   sequence[1:]):
                self.bigram_counts[left_word_id, right_word_id] += 1
        # Rewind so the caller can re-read the file.
        subset_file.seek(0)
def score_line(self, line, vocabulary):
    """Scores a line of text.

    Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``)
    will be inserted at the beginning and the end of the line, if
    they're missing. If the line is empty, ``None`` will be returned,
    instead of interpreting it as the empty sentence ``<s> </s>``.

    ``<unk>`` tokens will be excluded from the probability computation,
    if the constructor was given ``exclude_unk=True``. When using a
    shortlist, OOV words are always excluded, and if
    ``exclude_unk=True`` was given, OOS words are also excluded. Words
    with zero class membership probability are always excluded.

    :type line: str
    :param line: a sequence of words

    :type vocabulary: Vocabulary
    :param vocabulary: vocabulary for converting the words to word IDs

    :rtype: float
    :returns: log probability of the word sequence, or None if the line
              is empty
    """
    words = utterance_from_line(line)
    if not words:
        # Don't interpret an empty line as "<s> </s>".
        return None

    unk_id = vocabulary.word_to_id['<unk>']
    word_ids = vocabulary.words_to_ids(words)

    # Update the counters used for reporting OOV statistics.
    self.num_words += word_ids.size
    self.num_unks += numpy.count_nonzero(word_ids == unk_id)

    class_ids = [vocabulary.word_id_to_class_id[wid] for wid in word_ids]
    probs = [vocabulary.get_word_prob(wid) for wid in word_ids]
    return self.score_sequence(word_ids, class_ids, probs)
def compute_word_counts(input_files):
    """Computes word unigram counts using word strings.

    This method does not expect a vocabulary. Start and end of sentence
    markers are not added. Leaves the input files pointing to the
    beginning of the file.

    :type input_files: list of file or mmap objects
    :param input_files: input text files

    :rtype: dict
    :returns: a mapping from word strings to counts
    """
    result = dict()
    for subset_file in input_files:
        for line in subset_file:
            for word in utterance_from_line(line):
                # dict.get with a default replaces the manual
                # if-not-in-dict initialization.
                result[word] = result.get(word, 0) + 1
        # Rewind so the caller can re-read the file.
        subset_file.seek(0)
    return result
def compute_probs(self, input_files):
    """Recomputes unigram class membership probabilities from text
    files. Probabilities are updated only for classes whose words occur
    in the text. Ensures that special tokens will always have nonzero
    probabilities.

    :type input_files: list of file or mmap objects
    :param input_files: input text files
    """
    # NOTE(review): unlike the statistics constructors, this method
    # does not rewind the input files afterwards — confirm callers
    # expect the files to be left at EOF.
    counts = numpy.zeros(self.num_words(), dtype='int64')
    for subset_file in input_files:
        for line in subset_file:
            for word in utterance_from_line(line):
                if word in self.word_to_id:
                    counts[self.word_to_id[word]] += 1

    # The special tokens must never end up with a zero count, or their
    # class membership probabilities would become zero.
    for token in ('<s>', '</s>', '<unk>'):
        token_id = self.word_to_id[token]
        if counts[token_id] < 1:
            counts[token_id] = 1

    for cls in self._word_classes:
        # Collect per-word counts keyed by word ID (duplicates within a
        # class collapse to a single entry, as before).
        cls_counts = {word_id: counts[word_id] for word_id, _ in cls}
        cls_total = sum(cls_counts.values())
        if cls_total > 0:
            for word_id, count in cls_counts.items():
                cls.set_prob(word_id, float(count) / cls_total)
        else:
            # None of the class's words occurred in the text —
            # distribute the probability mass uniformly.
            uniform_prob = 1.0 / len(cls)
            for word_id, _ in cls:
                cls.set_prob(word_id, uniform_prob)
def _score_utterances(input_file, vocabulary, scorer, output_file,
                      log_base=None):
    """Reads utterances from ``input_file``, computes LM scores using
    ``scorer``, and writes one score per line to ``output_file``.

    Start-of-sentence and end-of-sentence tags (``<s>`` and ``</s>``)
    will be inserted at the beginning and the end of each utterance, if
    they're missing. Empty lines will be ignored, instead of
    interpreting them as the empty sentence ``<s> </s>``.

    :type input_file: file object
    :param input_file: a file that contains the input sentences in
                       SRILM n-best format

    :type vocabulary: Vocabulary
    :param vocabulary: vocabulary that provides mapping between words
                       and word IDs

    :type scorer: TextScorer
    :param scorer: a text scorer for rescoring the input sentences

    :type output_file: file object
    :param output_file: a file where to write the score of each
                        utterance, one score per line

    :type log_base: int
    :param log_base: if set to other than None, convert log
                     probabilities to this base
    """
    # Scores come out in natural log; dividing by log(base) converts
    # them to the requested base.
    log_scale = 1.0 if log_base is None else numpy.log(log_base)

    unk_id = vocabulary.word_to_id['<unk>']
    num_words = 0
    num_unks = 0
    for line_num, line in enumerate(input_file):
        words = utterance_from_line(line)
        if not words:
            continue

        word_ids = vocabulary.words_to_ids(words)
        num_words += word_ids.size
        num_unks += numpy.count_nonzero(word_ids == unk_id)

        class_ids = [vocabulary.word_id_to_class_id[word_id]
                     for word_id in word_ids]
        probs = [vocabulary.get_word_prob(word_id)
                 for word_id in word_ids]

        lm_score = scorer.score_sequence(word_ids, class_ids, probs)
        lm_score /= log_scale
        output_file.write(str(lm_score) + '\n')

        if (line_num + 1) % 1000 == 0:
            print("{0} sentences scored.".format(line_num + 1))
            sys.stdout.flush()

    if num_words == 0:
        print("The input file contains no words.")
    else:
        # num_unks / num_words is a fraction in [0, 1]; multiply by 100
        # so the "{2:.1f} %" slot actually reports a percentage.
        print("{0} words processed, including start-of-sentence and "
              "end-of-sentence tags, and {1} ({2:.1f} %) out-of-vocabulary "
              "words".format(num_words, num_unks,
                             100.0 * num_unks / num_words))