Beispiel #1
0
def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
    """Extract every number found in a string.

    Resolves the primary language code of ``lang`` and forwards ``text``,
    ``short_scale`` and ``ordinals`` to the extractor for that language.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): interpret large numbers (over a million) using
            the "short scale" (True, the default) or the "long scale"
            (False). The short scale is now common in most English
            speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
        lang (str): the BCP-47 code for the language to use, None uses default

    Returns:
        list: list of extracted numbers as floats, or empty list if none
            found. An unsupported language is logged and also produces an
            empty list.
    """
    lang_code = get_primary_lang_code(lang)
    extractors = {
        "en": extract_numbers_en,
        "de": extract_numbers_de,
        "fr": extract_numbers_fr,
        "it": extract_numbers_it,
        "da": extract_numbers_da,
    }
    extractor = extractors.get(lang_code)
    if extractor is not None:
        return extractor(text, short_scale, ordinals)

    # TODO: extractnumbers_xx for other languages
    _log_unsupported_language(lang_code, ['en', 'it', 'fr', 'de', 'da'])
    return []
Beispiel #2
0
def get_gender(word, context="", lang=None):
    """Guess the grammatical gender of a word.

    Some languages assign genders to specific words. The lookup is handed
    to the implementation for the primary language code of ``lang``,
    optionally using the provided context sentence.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context
        lang (str): the BCP-47 code for the language to use, None uses default

    Returns:
        str: The code "m" (male), "f" (female) or "n" (neutral) for the gender,
             or None if unknown/or unused in the given language. An
             unsupported language is logged and yields None.
    """
    lang_code = get_primary_lang_code(lang)

    gender_lookups = {
        "pt": get_gender_pt,
        # Spanish follows the same rules as Portuguese
        "es": get_gender_pt,
        "it": get_gender_it,
    }
    lookup = gender_lookups.get(lang_code)
    if lookup is not None:
        return lookup(word, context)

    # TODO: get_gender_xx for other languages
    _log_unsupported_language(lang_code, ['pt', 'it', 'es'])
    return None
Beispiel #3
0
def extract_duration(text, lang=None):
    """Extract a duration from a phrase, along with the leftover text.

    Handles phrases like:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"

    The words used in the duration are consumed, and the remainder is
    returned. As an example, "set a timer for 5 minutes" would yield a
    duration of five minutes (300 seconds) and "set a timer for".

    Only English is implemented; the parsing itself is delegated to
    extract_duration_en().

    Args:
        text (str): string containing a duration
        lang (str): the BCP-47 code for the language to use, None uses default

    Returns:
        (timedelta, str) or None:
                    For English, whatever extract_duration_en() returns:
                    a tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    NOTE(review): the tuple layout is taken from the
                    existing documentation of the English parser, which is
                    not visible here -- confirm.

                    For any other language the problem is logged and a
                    bare None (not a tuple) is returned, so callers must
                    check the result before unpacking it.
    """
    lang_code = get_primary_lang_code(lang)

    if lang_code != "en":
        # TODO: extract_duration for other languages
        _log_unsupported_language(lang_code, ['en'])
        return None

    return extract_duration_en(text)
Beispiel #4
0
def pronounce_number(number,
                     lang=None,
                     places=2,
                     short_scale=True,
                     scientific=False):
    """
    Convert a number to its spoken equivalent

    For example, '5' would be 'five'

    Args:
        number: the number to pronounce
        lang (str): code for the language to use, None uses default
        places (int): forwarded to the language implementation
            (presumably the number of decimal places to pronounce --
            TODO confirm)
        short_scale (bool) : use short (True) or long scale (False)
            https://en.wikipedia.org/wiki/Names_of_large_numbers
            Only passed on to the English and Italian implementations.
        scientific (bool) : convert and pronounce in scientific notation
            Only passed on to the English and Italian implementations.
    Returns:
        (str): The pronounced number, or str(number) when the language
            is not supported (which is also logged)
    """
    lang_code = get_primary_lang_code(lang)

    # English and Italian additionally receive the scale and scientific
    # notation options; every other implementation only gets `places`.
    scale_options = {"short_scale": short_scale, "scientific": scientific}
    places_only = {}
    pronouncers = {
        "en": (pronounce_number_en, scale_options),
        "it": (pronounce_number_it, scale_options),
        "es": (pronounce_number_es, places_only),
        "fr": (pronounce_number_fr, places_only),
        "de": (pronounce_number_de, places_only),
        "hu": (pronounce_number_hu, places_only),
        "nl": (pronounce_number_nl, places_only),
        "da": (pronounce_number_da, places_only),
        "pt": (pronounce_number_pt, places_only),
    }
    if lang_code in pronouncers:
        pronounce, extra_options = pronouncers[lang_code]
        return pronounce(number, places=places, **extra_options)

    # Default to just returning the numeric value
    # TODO: Other languages
    _log_unsupported_language(
        lang_code, ['en', 'es', 'pt', 'it', 'fr', 'de', 'hu', 'nl', 'da'])
    return str(number)
Beispiel #5
0
def nice_number(number, lang=None, speech=True, denominators=None):
    """Format a float as a human readable fraction

    This function formats a float to human understandable fractions. Like
    4.5 becomes 4 and a half for speech and 4 1/2 for text
    Args:
        number (int or float): the float to format
        lang (str): code for the language to use, None uses default
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string, or str(number) when the language is
            not supported (which is also logged).
    """
    lang_code = get_primary_lang_code(lang)

    # One formatter per supported language; all share the same signature
    formatters = {
        "en": nice_number_en,
        "es": nice_number_es,
        "pt": nice_number_pt,
        "it": nice_number_it,
        "fr": nice_number_fr,
        "sv": nice_number_sv,
        "de": nice_number_de,
        "hu": nice_number_hu,
        "nl": nice_number_nl,
        "da": nice_number_da,
    }
    formatter = formatters.get(lang_code)
    if formatter is not None:
        return formatter(number, speech, denominators)

    # Fall back to the raw number for unsupported languages and leave it
    # to whatever consumes the string to render it understandably.
    # TODO: nice_number_XX for other languages
    _log_unsupported_language(
        lang_code,
        ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de', 'hu', 'nl', 'da'])
    return str(number)
Beispiel #6
0
def nice_time(dt, lang=None, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'five thirty' for speech or '5:30' for
    text display. The formatting is delegated to the implementation for
    the primary language code of ``lang``.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        lang (str): code for the language to use, None uses default
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string, or str(dt) when the language is
            not supported (which is also logged)
    """
    lang_code = get_primary_lang_code(lang)

    time_formatters = {
        "en": nice_time_en,
        "es": nice_time_es,
        "pt": nice_time_pt,
        "it": nice_time_it,
        "fr": nice_time_fr,
        "de": nice_time_de,
        "hu": nice_time_hu,
        "nl": nice_time_nl,
        "da": nice_time_da,
    }
    format_time = time_formatters.get(lang_code)
    if format_time is not None:
        return format_time(dt, speech, use_24hour, use_ampm)

    # TODO: Other languages
    _log_unsupported_language(
        lang_code, ['en', 'es', 'pt', 'it', 'fr', 'de', 'hu', 'nl', 'da'])
    return str(dt)
Beispiel #7
0
def extract_number(text, short_scale=True, ordinals=False, lang=None):
    """Takes in a string and extracts a number.

    The work is delegated to the extractor for the primary language code
    of ``lang``. Only the English and Italian extractors are given
    ``short_scale`` and ``ordinals``; the others receive ``text`` alone.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
        lang (str): the BCP-47 code for the language to use, None uses default
    Returns:
        (int, float or False): The number extracted or False if the input
                               text contains no numbers. False is also
                               returned (after logging) when the language
                               is not supported.
    """
    lang_code = get_primary_lang_code(lang)

    scale_options = {"short_scale": short_scale, "ordinals": ordinals}
    text_only = {}
    extractors = {
        "en": (extractnumber_en, scale_options),
        "es": (extractnumber_es, text_only),
        "pt": (extractnumber_pt, text_only),
        "it": (extractnumber_it, scale_options),
        "fr": (extractnumber_fr, text_only),
        "sv": (extractnumber_sv, text_only),
        "de": (extractnumber_de, text_only),
        "da": (extractnumber_da, text_only),
    }
    if lang_code in extractors:
        extractor, extra_options = extractors[lang_code]
        return extractor(text, **extra_options)

    # TODO: extractnumber_xx for other languages
    _log_unsupported_language(lang_code,
                              ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de', 'da'])
    # Report "no number" as documented. Echoing the input text back (as
    # this fallback used to do) hands callers a truthy string that they
    # would mistake for an extracted number.
    return False
Beispiel #8
0
def normalize(text, lang=None, remove_articles=True):
    """Prepare a string for parsing

    This function prepares the given text for parsing by making
    numbers consistent, getting rid of contractions, etc. The actual
    normalization is performed by the implementation for the primary
    language code of ``lang``.

    Args:
        text (str): the string to normalize
        lang (str): the BCP-47 code for the language to use, None uses default
        remove_articles (bool): whether to remove articles (like 'a', or
                                'the'). True by default.

    Returns:
        (str): The normalized string. For an unsupported language the
            problem is logged and ``text`` is returned unchanged.
    """
    lang_code = get_primary_lang_code(lang)

    normalizers = {
        "en": normalize_en,
        "es": normalize_es,
        "pt": normalize_pt,
        "it": normalize_it,
        "fr": normalize_fr,
        "sv": normalize_sv,
        "de": normalize_de,
        "da": normalize_da,
    }
    normalizer = normalizers.get(lang_code)
    if normalizer is not None:
        return normalizer(text, remove_articles)

    # TODO: Normalization for other languages
    _log_unsupported_language(lang_code,
                              ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de', 'da'])
    return text
Beispiel #9
0
def extract_datetime(text, anchorDate=None, lang=None, default_time=None):
    """
    Extracts date and time information from a sentence.  Parses many of the
    common ways that humans express dates and times, including relative dates
    like "5 days from today", "tomorrow", and "Tuesday".

    Vague terminology are given arbitrary values, like:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    If a time isn't supplied or implied, the function defaults to 12 AM

    The parsing itself is delegated to the implementation for the primary
    language code of ``lang``.

    Args:
        text (str): the text to be interpreted
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Defaults to the current local date/time.
        lang (str): the BCP-47 code for the language to use, None uses default
        default_time (datetime.time): time to use if none was found in
            the input string.

    Returns:
        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
            as a datetime object in the user's local timezone.
            'leftover_string' is the original phrase with all date and time
            related keywords stripped out. See examples for further
            clarification

            Returns 'None' if no date or time related text is found, and
            also (after logging) when the language is not supported.

    Examples:

        >>> extract_datetime(
        ... "What is the weather like the day after tomorrow?",
        ... datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']

        >>> extract_datetime(
        ... "Set up an appointment",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        None
    """

    lang_code = get_primary_lang_code(lang)

    if not anchorDate:
        anchorDate = now_local()

    extractors = {
        "en": extract_datetime_en,
        "es": extract_datetime_es,
        "pt": extract_datetime_pt,
        "it": extract_datetime_it,
        "fr": extract_datetime_fr,
        "sv": extract_datetime_sv,
        "de": extract_datetime_de,
        "da": extract_datetime_da,
    }
    extractor = extractors.get(lang_code)
    if extractor is not None:
        return extractor(text, anchorDate, default_time)

    # TODO: extract_datetime for other languages
    _log_unsupported_language(lang_code,
                              ['en', 'es', 'pt', 'it', 'fr', 'sv', 'de', 'da'])
    # Nothing could be extracted, so return the documented "not found"
    # value. Returning the raw input string (as this fallback used to do)
    # would be treated by callers as a [datetime, leftover] result.
    return None