def _compile_languagemodel(self, text, output_file):
    """
    Compiles the languagemodel from a text.

    Arguments:
        text -- the text the languagemodel will be generated from
        output_file -- the path of the file this languagemodel will
                       be written to

    Returns:
        A list of all unique words this vocabulary contains.
    """
    # delete=False: only the path is needed here, the file itself is
    # filled in by cmuclmtk and removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
        vocab_file = f.name

    try:
        # Create vocab file from text
        self._logger.debug("Creating vocab file: '%s'", vocab_file)
        cmuclmtk.text2vocab(text, vocab_file)

        # Create language model from text
        self._logger.debug("Creating languagemodel file: '%s'",
                           output_file)
        cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)

        # Get words from vocab file, skipping comment lines and the
        # sentence boundary markers
        self._logger.debug("Getting words from vocab file and removing it "
                           "afterwards...")
        with open(vocab_file, 'r') as f:
            words = [word for word in (line.strip() for line in f)
                     if not word.startswith('#') and
                     word not in ('<s>', '</s>')]
    finally:
        # Remove the vocab file even if one of the steps above failed,
        # so that errors do not leave temporary files behind
        os.remove(vocab_file)

    return words
def compile_languagemodel(text, output_file):
    """
    Compiles the languagemodel from a text.

    Arguments:
        text -- the text the languagemodel will be generated from
        output_file -- the path of the file this languagemodel will
                       be written to

    Returns:
        A list of all unique words this vocabulary contains.

    Raises:
        ValueError -- if text is empty or consists only of whitespace
    """
    if not text.strip():
        raise ValueError('No text to compile into languagemodel!')

    logger = logging.getLogger(__name__)

    # delete=False: only the path is needed here, the file itself is
    # filled in by cmuclmtk and removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
        vocab_file = f.name

    try:
        # Create vocab file from text
        logger.debug("Creating vocab file: '%s'", vocab_file)
        cmuclmtk.text2vocab(text, vocab_file)

        # Get words from vocab file, skipping comment lines and the
        # sentence boundary markers
        logger.debug("Getting words from vocab file and removing it "
                     "afterwards...")
        with open(vocab_file, 'r') as f:
            words = [word for word in (line.strip() for line in f)
                     if not word.startswith('#') and
                     word not in ('<s>', '</s>')]

        if not words:
            logger.warning('Vocab file seems to be empty!')

        # Create language model from text
        logger.debug("Creating languagemodel file: '%s'", output_file)
        cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)
    finally:
        # Remove the vocab file even if one of the steps above failed,
        # so that errors do not leave temporary files behind
        os.remove(vocab_file)

    return words