Example #1
0
    def _compile_languagemodel(self, text, output_file):
        """
        Compiles the languagemodel from a text.

        A temporary vocabulary file is generated from the text, passed to
        the languagemodel build and then read back to collect the
        vocabulary. The temporary file is removed before this method
        returns, even when one of the steps raises.

        Arguments:
            text -- the text the languagemodel will be generated from
            output_file -- the path of the file this languagemodel will
                           be written to

        Returns:
            A list of the words found in the vocabulary file, in file
            order. Comment lines (starting with '#'), blank lines and the
            sentence markers '<s>' and '</s>' are left out.
        """
        # Only a unique path is needed here: the handle is closed right away
        # and the path is handed to cmuclmtk below (hence delete=False).
        with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
            vocab_file = f.name

        try:
            # Create vocab file from text
            self._logger.debug("Creating vocab file: '%s'", vocab_file)
            cmuclmtk.text2vocab(text, vocab_file)

            # Create language model from text
            self._logger.debug("Creating languagemodel file: '%s'",
                               output_file)
            cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)

            # Get words from vocab file
            self._logger.debug("Getting words from vocab file and " +
                               "removing it afterwards...")
            words = []
            with open(vocab_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    # Skip blank lines so '' never shows up as a word.
                    if not line:
                        continue
                    if line.startswith('#') or line in ('<s>', '</s>'):
                        continue
                    words.append(line)
        finally:
            # Always clean up, otherwise a failing cmuclmtk call would leave
            # the temporary vocab file behind.
            os.remove(vocab_file)

        return words
    def _compile_languagemodel(self, text, output_file):
        """
        Compiles the languagemodel from a text.

        The text is first turned into a temporary vocabulary file, which is
        then used for building the languagemodel and finally parsed to
        obtain the vocabulary. The temporary file is deleted in every case,
        including when a step fails with an exception.

        Arguments:
            text -- the text the languagemodel will be generated from
            output_file -- the path of the file this languagemodel will
                           be written to

        Returns:
            A list of the words found in the vocabulary file, in file
            order. Comment lines (starting with '#'), blank lines and the
            sentence markers '<s>' and '</s>' are left out.
        """
        # delete=False keeps the file on disk after the handle is closed;
        # only its path is used from here on.
        with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
            vocab_file = f.name

        try:
            # Create vocab file from text
            self._logger.debug("Creating vocab file: '%s'", vocab_file)
            cmuclmtk.text2vocab(text, vocab_file)

            # Create language model from text
            self._logger.debug("Creating languagemodel file: '%s'",
                               output_file)
            cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)

            # Get words from vocab file
            self._logger.debug("Getting words from vocab file and " +
                               "removing it afterwards...")
            words = []
            with open(vocab_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    # Blank lines carry no word; do not report '' as one.
                    if not line:
                        continue
                    if line.startswith('#') or line in ('<s>', '</s>'):
                        continue
                    words.append(line)
        finally:
            # Runs on success and on failure so the temporary vocab file is
            # never leaked.
            os.remove(vocab_file)

        return words
Example #3
0
def compile_languagemodel(text, output_file):
    """
    Compiles the languagemodel from a text.

    A temporary vocabulary file is generated from the text, read back to
    collect the vocabulary and then passed to the languagemodel build. The
    temporary file is removed before returning, even when a step raises.

    Arguments:
        text -- the text the languagemodel will be generated from
        output_file -- the path of the file this languagemodel will
                       be written to

    Returns:
        A list of the words found in the vocabulary file, in file order.
        Comment lines (starting with '#'), blank lines and the sentence
        markers '<s>' and '</s>' are left out.

    Raises:
        ValueError -- if text is empty or contains only whitespace
    """
    if not text.strip():
        raise ValueError('No text to compile into languagemodel!')

    logger = logging.getLogger(__name__)

    # Only a unique path is needed: the handle is closed right away and the
    # path is handed to cmuclmtk below (hence delete=False).
    with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
        vocab_file = f.name

    try:
        # Create vocab file from text
        logger.debug("Creating vocab file: '%s'", vocab_file)
        cmuclmtk.text2vocab(text, vocab_file)

        # Get words from vocab file
        logger.debug("Getting words from vocab file and removing it " +
                     "afterwards...")
        words = []
        with open(vocab_file, 'r') as f:
            for line in f:
                line = line.strip()
                # Skip blank lines so '' never shows up as a word (and the
                # emptiness warning below stays meaningful).
                if not line:
                    continue
                if line.startswith('#') or line in ('<s>', '</s>'):
                    continue
                words.append(line)

        if not words:
            logger.warning('Vocab file seems to be empty!')

        # Create language model from text
        logger.debug("Creating languagemodel file: '%s'", output_file)
        cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)
    finally:
        # Remove the vocab file, also when one of the steps above failed
        os.remove(vocab_file)

    return words
Example #4
0
def create_languagemodel(text, output_file):
    """
    Creates the languagemodel from text.

    A temporary vocabulary file is generated from the text, used for the
    languagemodel build and then read back. The temporary file is removed
    before returning, even when one of the steps raises.

    Arguments:
        text -- the text the languagemodel will be generated from
        output_file -- the path the languagemodel will be written to

    Returns:
        A list of the words found in the vocabulary file, in file order.
        Comment lines (starting with '#'), blank lines and the sentence
        markers '<s>' and '</s>' are left out.
    """
    # Only a unique path is needed: the handle is closed right away and the
    # path is handed to cmuclmtk below (hence delete=False).
    with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
        vocab_file = f.name

    try:
        # Create vocab file from text
        cmuclmtk.text2vocab_file(text, vocab_file)
        # Create language model from text
        cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)

        words = []
        with open(vocab_file, 'r') as f:
            for line in f:
                line = line.strip()
                # Skip blank lines so '' never shows up as a word.
                if not line:
                    continue
                if line.startswith('#') or line in ('<s>', '</s>'):
                    continue
                words.append(line)
    finally:
        # Always clean up, otherwise a failing cmuclmtk call would leave the
        # temporary vocab file behind.
        os.remove(vocab_file)

    # return used vocabulary
    return words
Example #5
0
def create_languagemodel(text, output_file):
    """
    Creates the languagemodel from text.

    The text is first turned into a temporary vocabulary file, which is
    then used for building the languagemodel and finally parsed to obtain
    the vocabulary. The temporary file is deleted in every case, including
    when a step fails with an exception.

    Arguments:
        text -- the text the languagemodel will be generated from
        output_file -- the path the languagemodel will be written to

    Returns:
        A list of the words found in the vocabulary file, in file order.
        Comment lines (starting with '#'), blank lines and the sentence
        markers '<s>' and '</s>' are left out.
    """
    # delete=False keeps the file on disk after the handle is closed; only
    # its path is used from here on.
    with tempfile.NamedTemporaryFile(suffix='.vocab', delete=False) as f:
        vocab_file = f.name

    try:
        # Create vocab file from text
        cmuclmtk.text2vocab_file(text, vocab_file)
        # Create language model from text
        cmuclmtk.text2lm(text, output_file, vocab_file=vocab_file)

        words = []
        with open(vocab_file, 'r') as f:
            for line in f:
                line = line.strip()
                # Blank lines carry no word; do not report '' as one.
                if not line:
                    continue
                if line.startswith('#') or line in ('<s>', '</s>'):
                    continue
                words.append(line)
    finally:
        # Runs on success and on failure so the temporary vocab file is
        # never leaked.
        os.remove(vocab_file)

    # return used vocabulary
    return words