# synth.py

__author__ = 'B083126'
# Developed with monophones
import sys
import os
import SimpleAudio as SA
import argparse
from nltk.corpus import cmudict
import numpy as np
import re
import datetime as dt
# Arguments
parser = argparse.ArgumentParser(
    description='A basic text - to - speech app that synthesises an input phrase using monophone unit selection.'
)

# Core options: where the monophone recordings live, and what to do with the result.
parser.add_argument(
    '--monophones',
    help="Folder containing monophone wavs",
    default="monophones",
)
parser.add_argument(
    '--play', '-p',
    help="Play the output audio",
    action="store_true",
    default=False,
)
parser.add_argument(
    '--outfile', '-o',
    help="Save the output audio to a file",
    action="store",
    dest="outfile",
    type=str,
    default=None,
)

# The phrase itself: one or more words, collected into a list.
parser.add_argument(
    'phrase',
    help="The phrase to be synthesised",
    nargs='+',
)

# Arguments for extensions
parser.add_argument(
    '--spell', '-s',
    help="Spell the phrase instead of pronouncing it",
    action="store_true",
    default=False,
)
parser.add_argument(
    '--volume', '-v',
    help="A float between 0.0 and 1.0 representing the desired volume",
    type=float,
    default=None,
)

# Parsed once at module level; the script entry point reads from `args`.
args = parser.parse_args()


def type_check(var_types):
    """ Validate a batch of values against their required types.

    Each entry is a (value, required type, accept_none) triple. A value passes
    when it is an instance of the required type, or when it is None and the
    entry's accept_none flag is True. An empty list is valid and checks nothing.

    Args:
        var_types (list of tuple<object, type, bool>): The values to check, each
            paired with its required type and whether None is also acceptable.

    Raises:
        TypeError (if a value does not match its required type)
    """
    # Walk the triples directly instead of unzipping them first: unzipping an
    # empty list leaves nothing to unpack and would raise a ValueError.
    for value, required_type, accept_none in var_types:
        # Identity test, so values with an overloaded == (e.g. numpy arrays)
        # are never compared element-wise against None.
        if accept_none and value is None:
            continue
        if not isinstance(value, required_type):
            raise TypeError(str(value) + " is not " + str(required_type))


class Synth(object):
    """ Class for a speech synthesiser object which generates, plays and saves a speech sequence, given a list of phonemes.

    Stores an output audio representation, a dictionary of phoneme recordings
    and a word-phoneme dictionary; with additional methods for speech synthesis.

    Attributes:
        out (Audio): The output generated by the synthesiser, as an Audio object.
        phones (dict of str : Audio): Audio for each phoneme, keyed by wav filename (without extension),
            plus the silent 'comma - break' and 'sentence - break' entries.
        word_phones_dict (dict of str : list): The Carnegie Mellon (CMU) Pronouncing Dictionary,
            mapping each word to a list of phoneme sequences.
    """

    def __init__(self, wav_folder):
        """ Load the phoneme recordings, add the pause entries and load the CMU dictionary.

        Args:
            wav_folder (str): Folder containing one wav file per phoneme.
        """
        self.out = SA.Audio(rate=16000)  # Blank output audio at 16000 Hz

        # One Audio object per phoneme wav found in the folder
        self.phones = self.get_wavs(wav_folder)

        # Silent pseudo-phonemes used for punctuation pauses
        self.add_phone_break('comma - break', 250)
        self.add_phone_break('sentence - break', 500)

        self.word_phones_dict = cmudict.dict()

    def get_wavs(self, wav_folder):
        """ Reads and stores wave files from a given folder

        Walks the folder (including sub-folders) and loads every file with a
        .wav extension into its own Audio object.

        Args:
            wav_folder (str): The filepath to read from.

        Returns:
            wavs: A dictionary of all waves as Audio objects, in format {filename without extension: Audio}

        Raises:
            TypeError (if wav_folder is not a str)
        """
        type_check([(wav_folder, str, False)])

        wavs = {}

        for root, _dirs, filenames in os.walk(wav_folder, topdown=False):
            for filename in filenames:
                name, ext = os.path.splitext(filename)
                if ext.lower() == '.wav':
                    audio = SA.Audio()
                    # Join against the directory currently being walked, so
                    # files found in sub-folders are loaded from where they
                    # actually live rather than from the top-level folder.
                    audio.load(os.path.join(root, filename))
                    wavs[name] = audio
        return wavs

    def add_phone_break(self, name, length, frequency=16000):
        """ Creates an Audio object representing a pause of a given length, adding it as a dictionary element to the phones.

        The pause is an int16 numpy array of zeros holding
        length * (samples per millisecond) samples.

        Args:
            name (str): A string name to be the dictionary index
            length (int): The length of the break in ms
            frequency (int - optional) : Sample rate in Hz, defaults to 16000

        Raises:
            TypeError (if an argument does not have the type listed above)
        """
        type_check([(name, str, False), (length, int, False), (frequency, int, False)])

        # Floor division keeps the sample count an integer on Python 2 and 3
        # alike (np.zeros rejects a float length).
        samples_per_ms = frequency // 1000
        audio = SA.Audio()
        audio.data = np.zeros(length * samples_per_ms, np.int16)

        self.phones[name] = audio

    def concat_phone_seq(self, phone_seq):
        """ Takes a sequence of phonemes and concatenates their audio as the output data.

        Args:
            phone_seq (list of str): A sequence of phoneme strings, each a key in the phones dict

        Raises:
            TypeError (if phone_seq is not a list)
            KeyError (if a phoneme has no entry in the phones dict)
        """
        type_check([(phone_seq, list, False)])
        datas = tuple([self.phones[p].data for p in phone_seq])
        self.out.data = np.concatenate(datas)

    def play_and_save(self, play=False, volume=None, saveout=None):
        """ Play or save the audio output

        Args:
            play (bool -optional): Boolean option whether to play the audio or not. Defaults to False
            volume (float): Float value for play volume. Will only be set if play is True.
            saveout (str -optional): Name passed to the output's save method. Defaults to None
                (in which case no file will be created)

        Raises:
            TypeError (if play is not a bool, or saveout is neither a str nor None)
            ValueError (if play is True and volume is invalid, see adjust_volume)
        """
        type_check([(play, bool, False), (saveout, str, True)])  # saveout may also accept None

        if play:
            self.adjust_volume(volume)
            self.out.play()

        if saveout is not None:
            self.out.save(saveout)
            print("Saved audio sequence as " + saveout + ".wav")

    def adjust_volume(self, volume):
        """ Adjust the volume of the audio output

        Does nothing when volume is None.

        Args:
            volume (float): Float value for new volume. Must range between 0.0 and 1.0 inclusive.
                Note: Integer and other number values may be accepted if they can be cast to float.

        Raises:
            ValueError (if volume cannot be cast to float, or lies outside 0.0 to 1.0)
        """
        if volume is None:
            return

        try:
            volume = float(volume)
        except (TypeError, ValueError):
            raise ValueError(str(volume) + " must be a float")

        # Reject anything outside [0.0, 1.0]. Written as a negated chained
        # comparison so negative values (and NaN) are refused as well.
        if not 0.0 <= volume <= 1.0:
            raise ValueError(str(volume) + " must range between 0.0 and 1.0 inclusive")

        self.out.rescale(volume)


''' Text processing and normalisation functions
These functions are outside the scope of the Synth class, as they process text before it is input into the synthesiser.
* The synthesiser will only take in a phoneme sequence (as a list), so any text must be converted to this format first.
* There are also functions to normalise text (including transforming numbers and dates into spoken text) before
the phoneme conversion.
'''

def normalise_text(tokens, spell=False):
    """ Given a phrase, normalise this into a standard format. Includes options for spelling, dates and numbers.

    Strips unwanted punctuation, lowercases all letters, rewrites date patterns
    (dd/mm, dd/mm/yy, dd/mm/yyyy) and numbers as words, then tokenises the
    result: by word and punctuation mark normally, or by character (with a
    comma pause between words) when spelling.

    Args:
        tokens (list of strings): An initial list of individual tokens which together form a phrase
        spell (bool -optional): Optional parameter to normalise using spelling rules
                                    - ie tokenising by character. Defaults to false
    Return:
        list of str: List of normalised tokens (never containing empty strings), which must be
                        converted into a phoneme sequence before the Synthesiser can read it.

    Raises:
        TypeError (if tokens is not a list or spell is not a bool)
    """
    type_check([(tokens, list, False), (spell, bool, False)])

    # The regex operations work on a single string, so join the tokens for now.
    phrase = ' '.join(tokens)

    # Keep only alphanumerics, whitespace, forward slashes, commas, full stops,
    # question marks, exclamation marks and word-internal apostrophes.
    # An apostrophe is dropped unless it has a word character on both sides;
    # only the apostrophe itself is removed, so neighbouring words stay apart.
    phrase = re.sub(r"(?<!\w)'|'(?!\w)", "", phrase)
    phrase = re.sub(r"[^\w\s\.,?!'/]", "", phrase).lower()

    # Date patterns, longest first so a full date is not cut short.
    ddmm = r'(\b\d{2}/\d{2}\b)'
    ddmmyy = r'(\b\d{2}/\d{2}/\d{2}\b)'
    ddmmyyyy = r'(\b\d{2}/\d{2}/\d{4}\b)'
    phrase = re.sub(ddmmyyyy + r'|' + ddmmyy + r'|' + ddmm,
                    lambda m: normalise_date(m.group(), spell), phrase)

    # Floating point numbers go first, while the decimal point is still attached.
    phrase = re.sub(r'\d+\.\d+', lambda m: normalise_number(m.group()), phrase)

    if spell:
        # Mark every word gap with a temporary ';' (the character was stripped
        # above, so it cannot clash with anything in the phrase).
        phrase = re.sub(r'\s+', ';', phrase)

        # Surround each letter and digit with spaces so it becomes its own token.
        phrase = re.sub(r'(\w)', r' \1 ', phrase)

        # Turn each word gap into a stand-alone comma token (a pause).
        phrase = re.sub(r';', ' , ', phrase)
    else:
        # Separate non-word punctuation (not apostrophes, they belong to a word).
        phrase = re.sub(r'([\.,!?/])', r' \1 ', phrase)

    # Integers last: this also picks up digits left over from invalid dates.
    phrase = re.sub(r'\d+', lambda m: normalise_number(m.group()), phrase)

    # Splitting on runs of whitespace discards any extra spaces, so no empty
    # tokens are produced.
    return phrase.split()


def normalise_number(match):
    """ Normalise integer and floating point numbers into a spoken word sequence.

    Two- and three-digit integers are read as a quantity ("forty two",
    "one hundred and five"). Anything else - a single digit, more than three
    digits, or a number with a leading zero - is read digit by digit.
    For floats the integer part follows those rules and the fractional part is
    always read digit by digit, joined by "point".

    Args:
        match (str/int): A string of digits (optionally with a decimal point). Will accept int, but it
            is cast to string.
            Note: Kept as a string so digits are easy to separate and leading/trailing 0s are preserved.

    Return:
        str: Spoken text normalisation of the number

    Raises:
        TypeError (if match is neither a str nor an int)
        ValueError (if match contains a character that is neither a digit nor a decimal point)
    """
    if isinstance(match, int):
        match = str(match)

    type_check([(match, str, False)])

    # Unit names, indexed by digit.
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

    # Irregular tens and teens; every other digit just takes 'ty' / 'teen'.
    xty = {0: '', 1: 'ten', 2: 'twenty', 3: 'thirty', 4: 'forty', 5: 'fifty', 8: 'eighty'}
    xteen = {1: 'eleven', 2: 'twelve', 3: 'thirteen', 5: 'fifteen', 8: 'eighteen'}

    def get_xty(t, pre=""):
        """ Return pre with the name of the tens digit t appended (nothing for t == 0). """
        type_check([(t, int, False), (pre, str, False)])
        pre += xty[t] if t in xty else units[t] + 'ty'
        return pre.rstrip()

    def get_xteen(u, pre=""):
        """ Return pre with the name of the teen number ending in digit u appended. """
        type_check([(u, int, False), (pre, str, False)])
        pre += xteen[u] if u in xteen else units[u] + 'teen'
        return pre.rstrip()

    def num_to_word(nums, pre="", split_digits=False):
        """ Recursively turn a string of digits into words, left to right.

        Args:
            nums (str): The digits still left to be evaluated.
            pre (str): The text built up so far.
            split_digits (bool -optional): Read every digit as a separate unit.

        Return:
            str: pre followed by the words for nums.
        """
        type_check([(nums, str, False), (pre, str, False), (split_digits, bool, False)])

        # A leading zero only counts on the outermost call (empty pre); inside
        # a recursion a zero tens digit is normal, as in "105".
        leading_zero = pre == '' and nums.startswith('0')

        if leading_zero or len(nums) not in (2, 3):
            split_digits = True

        if split_digits:
            return (pre + ' '.join(units[int(d)] for d in nums)).rstrip()

        if len(nums) == 2:
            t, u = int(nums[0]), int(nums[1])

            # Exact multiple of ten: nothing left to read after the tens.
            if u == 0:
                return get_xty(t, pre)

            # Teens are a single word.
            if t == 1:
                return get_xteen(u, pre)

            # Otherwise read the tens, then recurse for the remaining unit.
            return num_to_word(nums[1:], get_xty(t, pre) + ' ')

        # Three digits: hundreds first.
        h, t, u = int(nums[0]), int(nums[1]), int(nums[2])
        pre += units[h] + ' hundred'

        # Exact hundred: stop here (no "one hundred and zero").
        if t == u == 0:
            return pre.rstrip()

        return num_to_word(nums[1:], pre + ' and ')

    # A decimal point makes this a float: read each side separately.
    if '.' in match:
        parts = match.split('.')
        integ = num_to_word(parts[0])
        fract = num_to_word(parts[1], split_digits=True)
        return integ + ' point ' + fract

    return num_to_word(match)


def normalise_date(match, spell=True):
    """ Normalise dates into a spoken word sequence.

    Accepts 'dd/mm', 'dd/mm/yy' and 'dd/mm/yyyy'. A two digit year is expanded
    to the closest four digit year. The date is validated with the datetime
    library; a valid date becomes "the <ordinal day> of <month> [<year>]",
    with the year read following general English conventions.

    Args:
        match (str): The matched date text, in one of the formats above.
                    Note: Single digit days or months need a leading 0 to fit the 2-digit format.
        spell (bool): If spell is True, just separate each character with a space (no text conversion).

    Return:
        str: Spoken text normalisation of the date. An invalid date is returned unchanged, so that it
             can be normalised as plain numbers and slashes instead.

    Raises:
        TypeError (if match is not a str or spell is not a bool)
    """
    type_check([(match, str, False), (spell, bool, False)])

    def get_ordinal(num):
        """ Get the ordinal name for a number ("first", "twenty third", "thirtieth").

        Args:
            num (int): A number from 0 to 99; 0 gives an empty string.

        Return:
            str: The input as a written ordinal number
        """
        type_check([(num, int, False)])

        # Irregular ordinals; everything else is formed from the cardinal name.
        ordinals = {0: '', 1: 'first', 2: 'second', 3: 'third', 5: 'fifth', 8: 'eighth', 9: 'ninth',
                    12: 'twelfth'}

        # The number with its last digit replaced by 0.
        floor = num // 10 * 10

        if floor > 10 and num != floor:
            # e.g. 23 -> "twenty " + ordinal of 3
            prefix = normalise_number(floor) + ' '
            unit = num - floor
        else:
            # Below twenty, or an exact multiple of ten: suffix the whole number.
            prefix = ''
            unit = num

        if unit in ordinals:
            return prefix + ordinals[unit]

        word = normalise_number(unit)
        if word.endswith('y'):
            return prefix + word[:-1] + 'ieth'  # twenty -> twentieth
        return prefix + word + 'th'

    def get_closest_year(num):
        """ Given the last two digits of a year, find the closest 4-digit year

        Tries the current, next and previous century and keeps whichever year is
        nearest to today's year.

        Args:
            num (int): The two digit year.

        Return:
            str: The closest year, as a string of digits.
        """
        type_check([(num, int, False)])

        current_year = dt.datetime.now().year
        current_century = current_year // 100 * 100  # The last centurial year

        possibilities = [current_century + int(num)]
        possibilities.extend([possibilities[0] + 100, possibilities[0] - 100])

        diff = [abs(possible - current_year) for possible in possibilities]
        closest = diff.index(min(diff))

        return str(possibilities[closest])

    def normalise_year(year):
        """ Given a year, normalise this in spoken words following English language conventions.

        Generates forms such as:
            "two thousand", "two thousand and one", "nineteen hundred",
            "nineteen oh two", "nineteen eighty four", "twenty fifteen"
        Any year not made of four digits is normalised as a plain number.

        Args:
            year (str): The year as a string of digits
        Return:
            str: Spoken text normalisation of the year.
        """
        type_check([(year, str, False)])

        if len(year) != 4:
            return normalise_number(year)

        # Split into two halves: the century and the remaining two digits.
        century, units = divmod(int(year), 100)

        # "X thousand [and Y]": X is any thousand except 1000, Y is 0..9.
        if century > 10 and century % 10 == 0 and units < 10:
            words = normalise_number(century // 10) + ' thousand'
            if units != 0:
                words += ' and ' + normalise_number(units)
            return words

        words_century = normalise_number(century)
        if units == 0:
            return words_century + ' hundred'
        if units < 10:
            return words_century + ' oh ' + normalise_number(units)
        return words_century + ' ' + normalise_number(units)

    # If spelling is on, just separate each character and return (as we don't
    # want to spell out the text of dates)
    if spell:
        return " ".join(match)

    months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
              'september', 'october', 'november', 'december']

    # Tolerate stray spaces around the slashes.
    date_segments = [segment.strip() for segment in match.split('/')]
    if len(date_segments) < 2:
        return match

    day, month = date_segments[0], date_segments[1]

    # The year is optional; two digit years are expanded to four.
    year = date_segments[2] if len(date_segments) > 2 else None
    if year is not None and len(year) == 2 and year.isdigit():
        year = get_closest_year(int(year))

    # strptime always needs a year. Without one, use a leap year (2016) so
    # that the 29th of February is accepted.
    test_date = '/'.join([day, month, year if year is not None else '2016'])

    try:
        dt.datetime.strptime(test_date, '%d/%m/%Y')
    except ValueError:
        # Not a real date: hand the text back untouched.
        return match

    concat = 'the ' + get_ordinal(int(day)) + ' of ' + months[int(month) - 1]

    # Only add the year if we have one
    if year is not None:
        concat += ' ' + normalise_year(year)

    return concat


def get_phone_seq(tokens, word_phones_dict, override_phones=None):
    """ Converts a list of tokens into a phoneme sequence, which can be read into the Synthesiser

    Each token is resolved in this order:
      1. empty strings are skipped;
      2. an entry in override_phones, if there is one;
      3. ',' becomes the synthesiser's 'comma - break' pause;
      4. '.', '!' and '?' become the 'sentence - break' pause;
      5. the first pronunciation listed in word_phones_dict.
    If none of these apply, a message is printed and the program exits with status 1.

    Args:
        tokens (list of str): List of tokens, which have been previously normalised
        word_phones_dict (dict of str : list) : A dictionary of words with their phoneme sequences;
                                                 each value is a list of alternative sequences.
        override_phones (dict of str : list -optional) : Extra rules which take priority over
                                                          word_phones_dict. Defaults to no overrides.
    Return:
        list of str: A sequence of lowercase phoneme strings with stress digits removed.

    Raises:
        TypeError (if tokens is not a list or either dictionary is not a dict)
        SystemExit (if a token cannot be resolved)
    """
    if override_phones is None:
        override_phones = {}

    type_check([(tokens, list, False), (word_phones_dict, dict, False),
                (override_phones, dict, False)])

    terminating_punc = ('.', '!', '?')
    phone_seq = []

    for token in tokens:
        # Skip empty strings outright, so nothing stale gets appended for them.
        if token == "":
            continue

        if token in override_phones:
            phones = override_phones[token]
        elif token.strip() == ',':
            phones = ['comma - break']
        elif token in terminating_punc:
            phones = ['sentence - break']
        elif token in word_phones_dict:
            # Automatically just pick the first phoneme sequence
            phones = word_phones_dict[token][0]
        else:
            print("Sorry, '" + token + "' is not in the CMU dictionary.")
            sys.exit(1)

        phone_seq.extend(phones)

    # Clean out phoneme stress digits and lowercase.
    return [re.sub(r"[\d]+", "", phone).lower() for phone in phone_seq]

if __name__ == "__main__":

    # Special dictionary to override or add additional phoneme sequence rules
    # on top of the CMU dictionary stored in Synth class
    special_dict = {
        # Override default 'Z' sound
        'z': [u'Z', u'EH1', u'D'],

        # When pronouncing individual punctuation (during spelling)
        # NOTE(review): normalise_text strips '[' and ']' from the phrase, so
        # these bracketed keys look unreachable at present - confirm whether a
        # bracketing step is missing from the spelling branch.
        '[, ]': [u'K', u'AA1', u'M', u'AH0'],
        '[.]': [u'F', u'UH1', u'L', u'S', u'T', u'AA1', u'P'],
        '[?]': [u'K', u'W', u'EH1', u'S', u'CH', u'AH0', u'N', u'M', u'AA1', u'R', u'K'],
        '[!]': [u'IH0', u'K', u'S', u'K', u'L', u'AH', u'M', u'EY1', u'SH', u'AH0', u'N', u'M', u'AA1', u'R', u'K'],
        "[']": [u'AH0', u'P', u'AA1', u'S', u'T', u'R', u'AH0', u'F', u'IY0'],
        "[/]": [u'S', u'L', u'AE1', u'SH'],

        # Slashes are pronounced regardless of spelling, such as for invalid dates
        "/": [u'S', u'L', u'AE1', u'SH'],
    }

    # Create Synth object, with phonemes from monophones folder
    S = Synth(wav_folder=args.monophones)

    # Phrase -> normalised tokens -> phoneme sequence
    normalised_tokens = normalise_text(args.phrase, spell=args.spell)
    phone_seq = get_phone_seq(normalised_tokens, S.word_phones_dict, special_dict)

    # If not playing, just print the output.
    # (A single parenthesised argument prints identically on Python 2 and 3.)
    if not args.play:
        print("Phoneme sequence: \n \t" + str(phone_seq))

    # Pass phoneme sequence to S, then play and/or save as requested
    S.concat_phone_seq(phone_seq)
    S.play_and_save(args.play, args.volume, args.outfile)