def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from an English text string.

    Handles spoken numbers in both long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    string_num_en = {
        "half": 0.5,
        "halves": 0.5,
        "couple": 2,
        "hundred": 100,
        "hundreds": 100,
        "thousand": 1000,
        "thousands": 1000,
        "million": 1000000,
        'millions': 1000000}

    string_num_ordinal_en = {}

    for num in NUM_STRING_EN:
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_EN:
                num_string = SHORT_ORDINAL_STRING_EN[num]
                string_num_ordinal_en[num_string] = num
                string_num_en[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_EN:
                num_string = LONG_ORDINAL_STRING_EN[num]
                string_num_ordinal_en[num_string] = num
                string_num_en[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

    # sum the next number (twenty two = 20 + 2)
    sums = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety']

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ["hundred", "thousand", "hundreds", "thousands", "million",
                  "millions"]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [" and "]

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [" point ", " dot "]

    if short_scale:
        for num in SHORT_SCALE_EN:
            num_string = SHORT_SCALE_EN[num]
            string_num_en[num_string] = num
            string_num_en[num_string + "s"] = num
            multiplies.append(num_string)
            multiplies.append(num_string + "s")
    else:
        for num in LONG_SCALE_EN:
            num_string = LONG_SCALE_EN[num]
            string_num_en[num_string] = num
            string_num_en[num_string + "s"] = num
            multiplies.append(num_string)
            multiplies.append(num_string + "s")

    # 2 and 3/4
    for c in fraction_marker:
        components = text.split(c)

        if len(components) == 2:
            # ensure first is not a fraction and second is a fraction
            # (the caller's scale/ordinal settings apply to both halves)
            num1 = extractnumber_en(components[0], short_scale, ordinals)
            num2 = extractnumber_en(components[1], short_scale, ordinals)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2

    # 2 point 5
    for c in decimal_marker:
        components = text.split(c)

        if len(components) == 2:
            number = extractnumber_en(components[0], short_scale, ordinals)
            decimal = extractnumber_en(components[1], short_scale, ordinals)
            # "nothing found" comes back as False, which must not be spliced
            # into the "0.<digits>" string below ("0.False" is not a float)
            if number is not None and decimal is not None \
                    and decimal is not False:
                # TODO handle number dot number number number
                # only a non-negative whole number can act as decimal digits
                if isinstance(decimal, int) and decimal >= 0:
                    return number + float("0." + str(decimal))

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(aWords):

        if not word:
            continue
        prev_word = aWords[idx - 1] if idx > 0 else ""
        next_word = aWords[idx + 1] if idx + 1 < len(aWords) else ""

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_en:
            val = string_num_en[word]

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal_en and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in sums and word in string_num_en:
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = isFractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                # park the finished group and start a new one
                to_sum.append(val)
                val = 0
                prev_val = 0

    # add the parked groups one by one so that a "not found" False is
    # returned untouched when nothing was parked
    if val is not None:
        for v in to_sum:
            val = val + v
    return val
def extractnumber_it(text):
    """
    Extract a number from an Italian text string.

    Textual numbers are turned into integers or fractions. This is not a
    usual form in Italian but can be interesting for Mycroft. It is the
    Portuguese version adapted to Italian.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the value of the extracted number, or
                                   False if no number was found

    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in it_numbers:
            if word == "mila":
                # "mila" multiplies what was collected so far; a bare "mila"
                # has nothing to multiply, so count it once instead of
                # failing on None * number
                val = (1 if result is None else result) * it_numbers[word]
                result = 0
            else:
                val = it_numbers[word]

        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_it(word):
            fraction = isFractional_it(word)
            if not result:
                result = 1
            result = result * fraction
            # "un terzo" is 1/3 but "il terzo" is 3
            # (count > 0 keeps index -1 from wrapping to the last word)
            if count > 0 and aWords[count - 1] == "il":
                # round() rather than floor division: 1.0 // 0.2 is 4.0
                # assumes isFractional_it returns 1/n — TODO confirm
                result = round(1.0 / fraction)

            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if not val:
            # look for a compound number such as ventuno, ventitre,
            # centoventi
            val = extractnumber_long_it(word)

        if val:
            if result is None:
                result = 0
            result += val

        if next_word is None:
            break

        # number word and fraction
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_it(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the value after "e" as decimal digits
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "zero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_it(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["punto", "virgola", ".", ","]
        # without an integer part there is nothing to attach decimals to
        if next_word in decimals and result is not None:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "zero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_it(" ".join(newWords))
            # only a non-negative whole number can be spliced in as decimal
            # digits; False ("nothing found") or a float would make float()
            # raise ValueError, so the integer part is kept as it is
            if isinstance(afterDotVal, int) \
                    and not isinstance(afterDotVal, bool) \
                    and afterDotVal >= 0 \
                    and "." not in str(result):
                result = float(str(result) + "." + zeros * "0" +
                               str(afterDotVal))
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result
def extractnumber_sv(text):
    """
    Extract the first number found in a Swedish text string.

    Recognizes numerals, the ordinals "första".."sjätte", the cardinals
    "en"/"ett".."tio", fraction words, "a/b" fractions and a value joined
    to a following one with "och".

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number, or False when no
                                   (non-zero) number was found
    """
    ordinal_values = {
        "första": 1, "andra": 2, "tredje": 3,
        "fjärde": 4, "femte": 5, "sjätte": 6,
    }
    cardinal_values = {
        "en": 1, "ett": 1, "två": 2, "tre": 3, "fyra": 4, "fem": 5,
        "sex": 6, "sju": 7, "åtta": 8, "nio": 9, "tio": 10,
    }

    tokens = text.split()
    total = len(tokens)
    seen_and = False
    before_and = False
    number = False
    pos = 0
    while pos < total:
        token = tokens[pos]
        if is_numeric(token):
            number = float(token)
        elif token in ordinal_values:
            number = ordinal_values[token]
        elif is_fractional_sv(token):
            number = is_fractional_sv(token)
        else:
            if token in cardinal_values:
                number = cardinal_values[token]
            if number:
                # a cardinal may be the numerator of a following fraction
                following = tokens[pos + 1] if pos < total - 1 else ""
                multiplier = is_fractional_sv(following)
                if multiplier:
                    number = number * multiplier
                    tokens[pos + 1] = ""

        if not number:
            # look for fractions like "2/3"
            pieces = token.split('/')
            if look_for_fractions(pieces):
                number = float(pieces[0]) / float(pieces[1])
            elif seen_and:
                # nothing usable after "och": keep the value before it
                number = before_and
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if seen_and:
            tokens[pos - 1] = ''  # remove "och"
            number += before_and
        else:
            # "och" directly after the number, or one word further on
            if pos + 1 < total and tokens[pos + 1] == 'och':
                step = 2
            elif pos + 2 < total and tokens[pos + 2] == 'och':
                step = 3
            else:
                step = 0
            if step:
                seen_and = True
                before_and = number
                number = False
                pos += step
                continue

        break

    return number if number else False
def extractnumber_fr(text):
    """Takes in a string and extracts a number.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (str): The number extracted, or the text
            normalized with normalize_fr(text, True) when no (non-zero)
            number was found.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = ""
        wordPrev = ""
        if count < (len(aWords) - 1):
            wordNext = aWords[count + 1]
        if count > 0:
            wordPrev = aWords[count - 1]

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # no branch above consumed the word; testing for None (not
        # falsiness) keeps a literal 0 from advancing the cursor twice
        if val is None:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            zeros = 0
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            for following in newWords:
                if following == "zéro" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = None
            # extract the number after the zeros; "virgule" may be the last
            # word or be followed by zeros only, so check the bound first
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                countDot = count + zeros + 2
            # if a number was extracted (since comma is also a
            # punctuation sign)
            if afterDotVal:
                count = countDot
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)
        if val:
            if add:
                # "et"/"plus" may come before any number was found
                result = val if result is None else result + val
                add = False
            else:
                result = val

    if not result:
        return normalize_fr(text, True)

    return result
def extractnumber_da(text):
    """
    Extract the first number found in a Danish text string.

    The words "den" and "det" are dropped before parsing. Recognizes
    numerals, fraction words, ordinals, the number words in da_numbers,
    "a/b" fractions and a value joined to a following one with "og".

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: The value of extracted number, or False
                                   when no (non-zero) number was found
    """
    aWords = text.split()
    aWords = [word for word in aWords if word not in ["den", "det"]]
    and_pass = False
    valPreAnd = False
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # no isdigit() gate here: it rejects decimals such as "2.5",
            # which were then silently ignored
            val = float(word)
        elif isFractional_da(word):
            val = isFractional_da(word)
        elif isOrdinal_da(word):
            val = isOrdinal_da(word)
        else:
            if word in da_numbers:
                val = da_numbers[word]
                # a number word may be the numerator of a following
                # fraction word
                if count < (len(aWords) - 1):
                    wordNext = aWords[count + 1]
                else:
                    wordNext = ""
                valNext = isFractional_da(wordNext)

                if valNext:
                    val = val * valNext
                    aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # added to value, quit here
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "og"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'og':
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'og':
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val:
        return False

    return val
def extractnumber_pt(text):
    """
    Extract a number from a Portuguese text string.

    Number words from pt_numbers, digits, fraction words, "a/b" fractions,
    "avos" denominators, values joined with "e" and decimals introduced by
    "ponto"/"virgula" are combined into a single value.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: The value of extracted number, or False
                                   if no number was found

    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in pt_numbers:
            val = pt_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the value after "e" as decimal digits
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "zero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        # the third entry was a mis-encoded "vírgula" in the source
        decimals = ["ponto", "virgula", u"vírgula", ".", ","]
        # without an integer part there is nothing to attach decimals to
        if next_word in decimals and result is not None:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "zero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_pt(" ".join(newWords))
            # only a non-negative whole number can be spliced in as decimal
            # digits; False ("nothing found") or a float would make float()
            # raise ValueError, so the integer part is kept as it is
            if isinstance(afterDotVal, int) \
                    and not isinstance(afterDotVal, bool) \
                    and afterDotVal >= 0 \
                    and "." not in str(result):
                result = float(str(result) + "." + zeros * "0" +
                               str(afterDotVal))
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result
def extractnumber_es(text):
    """
    Extract a number from a Spanish text string.

    Number words from es_numbers, digits, fraction words, "a/b" fractions,
    "avos" denominators, values joined by a conjunction and decimals
    introduced by "punto"/"coma" are combined into a single value.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: The value of extracted number, or False
                                   if no number was found

    """
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_next_word = None
        if count + 1 < len(aWords):
            next_word = aWords[count + 1]
            if count + 2 < len(aWords):
                next_next_word = aWords[count + 2]
        else:
            next_word = None

        # is current word a number?
        if word in es_numbers:
            val = es_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_es(word):
            if not result:
                result = 1
            result = result * isFractional_es(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        # NOTE(review): "e" looks carried over from the Portuguese parser;
        # Spanish normally joins with "y" — confirm before changing
        ands = ["e"]
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_es(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat the value after the conjunction as decimal
                    # digits
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "cero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_es(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        decimals = ["punto", "coma", ".", ","]
        # without an integer part there is nothing to attach decimals to
        if next_word in decimals and result is not None:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "cero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_es(" ".join(newWords))
            # only a non-negative whole number can be spliced in as decimal
            # digits; False ("nothing found") or a float would make float()
            # raise ValueError, so the integer part is kept as it is
            if isinstance(afterDotVal, int) \
                    and not isinstance(afterDotVal, bool) \
                    and afterDotVal >= 0 \
                    and "." not in str(result):
                result = float(str(result) + "." + zeros * "0" +
                               str(afterDotVal))
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result
def extractnumber_en(text):
    """
    Extract the first number found in an English text string.

    The articles "the", "a" and "an" are dropped before parsing.
    Recognizes numerals, "first"/"second", the cardinals "one".."ten",
    fraction words, "a/b" fractions and a value joined to a following one
    with "and".

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number, or False when no
                                   (non-zero) number was found
    """
    ordinal_values = {"first": 1, "second": 2}
    cardinal_values = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    }

    tokens = [w for w in text.split() if w not in ("the", "a", "an")]
    total = len(tokens)
    seen_and = False
    before_and = False
    number = False
    pos = 0
    while pos < total:
        token = tokens[pos]
        if is_numeric(token):
            number = float(token)
        elif token in ordinal_values:
            number = ordinal_values[token]
        elif isFractional_en(token):
            number = isFractional_en(token)
        else:
            if token in cardinal_values:
                number = cardinal_values[token]
            if number:
                # a cardinal may be the numerator of a following fraction
                following = tokens[pos + 1] if pos < total - 1 else ""
                multiplier = isFractional_en(following)
                if multiplier:
                    number = number * multiplier
                    tokens[pos + 1] = ""

        if not number:
            # look for fractions like "2/3"
            pieces = token.split('/')
            if look_for_fractions(pieces):
                number = float(pieces[0]) / float(pieces[1])
            elif seen_and:
                # nothing usable after "and": keep the value before it
                number = before_and
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if seen_and:
            tokens[pos - 1] = ''  # remove "and"
            number += before_and
        else:
            # "and" directly after the number, or one word further on
            if pos + 1 < total and tokens[pos + 1] == 'and':
                step = 2
            elif pos + 2 < total and tokens[pos + 2] == 'and':
                step = 3
            else:
                step = 0
            if step:
                seen_and = True
                before_and = number
                number = False
                pos += step
                continue

        break

    if not number:
        return False

    # Rebuild the text without the consumed words (blanked above); the
    # rebuilt string is local only and is not returned.
    text = ' '.join(w for w in tokens if w)

    return number
def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from an English text string.

    Handles spoken numbers in both long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    string_num_en = {
        "half": 0.5,
        "halves": 0.5,
        "couple": 2,
        "hundred": 100,
        "hundreds": 100,
        "thousand": 1000,
        "thousands": 1000,
        "million": 1000000,
        'millions': 1000000
    }

    string_num_ordinal_en = {}

    for num in NUM_STRING_EN:
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_EN:
                num_string = SHORT_ORDINAL_STRING_EN[num]
                string_num_ordinal_en[num_string] = num
                string_num_en[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_EN:
                num_string = LONG_ORDINAL_STRING_EN[num]
                string_num_ordinal_en[num_string] = num
                string_num_en[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

    # sum the next number (twenty two = 20 + 2)
    sums = [
        'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
        'ninety'
    ]

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = [
        "hundred", "thousand", "hundreds", "thousands", "million", "millions"
    ]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [" and "]

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [" point ", " dot "]

    if short_scale:
        for num in SHORT_SCALE_EN:
            num_string = SHORT_SCALE_EN[num]
            string_num_en[num_string] = num
            string_num_en[num_string + "s"] = num
            multiplies.append(num_string)
            multiplies.append(num_string + "s")
    else:
        for num in LONG_SCALE_EN:
            num_string = LONG_SCALE_EN[num]
            string_num_en[num_string] = num
            string_num_en[num_string + "s"] = num
            multiplies.append(num_string)
            multiplies.append(num_string + "s")

    # 2 and 3/4
    for c in fraction_marker:
        components = text.split(c)

        if len(components) == 2:
            # ensure first is not a fraction and second is a fraction
            # (the caller's scale/ordinal settings apply to both halves)
            num1 = extractnumber_en(components[0], short_scale, ordinals)
            num2 = extractnumber_en(components[1], short_scale, ordinals)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2

    # 2 point 5
    for c in decimal_marker:
        components = text.split(c)

        if len(components) == 2:
            number = extractnumber_en(components[0], short_scale, ordinals)
            decimal = extractnumber_en(components[1], short_scale, ordinals)
            # "nothing found" comes back as False, which must not be spliced
            # into the "0.<digits>" string below ("0.False" is not a float)
            if number is not None and decimal is not None \
                    and decimal is not False:
                # TODO handle number dot number number number
                # only a non-negative whole number can act as decimal digits
                if isinstance(decimal, int) and decimal >= 0:
                    return number + float("0." + str(decimal))

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(aWords):

        if not word:
            continue
        prev_word = aWords[idx - 1] if idx > 0 else ""
        next_word = aWords[idx + 1] if idx + 1 < len(aWords) else ""

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_en:
            val = string_num_en[word]

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        if ordinals and prev_word in string_num_ordinal_en and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in sums and word in string_num_en:
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = isFractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                # park the finished group and start a new one
                to_sum.append(val)
                val = 0
                prev_val = 0

    # add the parked groups one by one so that a "not found" False is
    # returned untouched when nothing was parked
    if val is not None:
        for v in to_sum:
            val = val + v
    return val
def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    Extract a number from an Italian text string.

    Handles spoken numbers in both long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    # NOTE(review): the ordinal and scale names below are written into the
    # module-level STRING_NUM_ITA, so they stay registered for later calls
    # (including calls made with ordinals=False) — confirm this is intended.

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_IT:
                num_string = SHORT_ORDINAL_STRING_IT[num]
                STRING_NUM_ITA[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_IT:
                num_string = LONG_ORDINAL_STRING_IT[num]
                STRING_NUM_ITA[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = [
        'decina', 'decine', 'dozzina', 'dozzine', 'centinaia', 'centinaio',
        'migliaia', 'migliaio', 'mila'
    ]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    if short_scale:
        for num in SHORT_SCALE_IT:
            num_string = SHORT_SCALE_IT[num]
            STRING_NUM_ITA[num_string] = num
            multiplies.append(num_string)
    else:
        for num in LONG_SCALE_IT:
            num_string = LONG_SCALE_IT[num]
            STRING_NUM_ITA[num_string] = num
            multiplies.append(num_string)

    # 2 e 3/4 and other cases
    for separator in fraction_marker:
        components = text.split(separator)
        zeros = 0

        if len(components) == 2:
            # count zeros in fraction part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break
            # ensure first is not a fraction and second is a fraction
            # (the caller's scale/ordinal settings apply to both halves)
            num1 = extractnumber_it(components[0], short_scale, ordinals)
            num2 = extractnumber_it(components[1], short_scale, ordinals)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # sette e quaranta  sette e zero zero due
            elif num1 is not None and num2 is not None \
                    and num1 >= 1 and num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # 2 punto 5
    for separator in decimal_marker:
        zeros = 0
        # count zeros in fraction part
        components = text.split(separator)

        if len(components) == 2:
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break

            number = extractnumber_it(components[0], short_scale, ordinals)
            decimal = extractnumber_it(components[1], short_scale, ordinals)
            # No number on either side and no spoken zeros: the marker is
            # an ordinary word here, so do not report 0.0 as a found number
            if not number and not decimal and not zeros:
                continue
            # a missing side counts as 0, as int(False) did before
            number = int(number or 0)
            decimal = int(decimal or 0)
            return number + decimal / pow(10, len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in STRING_NUM_ITA:
            val = STRING_NUM_ITA[word]

        # tre quarti  un quarto  trenta secondi
        if isFractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # mezza tazza
        if val is False:
            val = isFractional_it(word, short_scale=short_scale)

        # 2 quinti
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = extractnumber_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif extractnumber_long_it(word) > 100 and \
                extractnumber_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    # add the parked groups one by one so that a "not found" False is
    # returned untouched when nothing was parked
    if val is not None:
        for addend in to_sum:
            val = val + addend
    return val
def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    Extract a number from an Italian text string.

    Handles spoken numbers in both long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """

    # NOTE(review): the ordinal and scale names below are written into the
    # module-level STRING_NUM_ITA, so they stay registered for later calls
    # (including calls made with ordinals=False) — confirm this is intended.

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_IT:
                num_string = SHORT_ORDINAL_STRING_IT[num]
                STRING_NUM_ITA[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_IT:
                num_string = LONG_ORDINAL_STRING_IT[num]
                STRING_NUM_ITA[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ['meno']  # 'negativo' is not usual in Italian

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [' e ']

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [' punto ', ' virgola ']

    if short_scale:
        for num in SHORT_SCALE_IT:
            num_string = SHORT_SCALE_IT[num]
            STRING_NUM_ITA[num_string] = num
            multiplies.append(num_string)
    else:
        for num in LONG_SCALE_IT:
            num_string = LONG_SCALE_IT[num]
            STRING_NUM_ITA[num_string] = num
            multiplies.append(num_string)

    # 2 e 3/4 and other cases
    for separator in fraction_marker:
        components = text.split(separator)
        zeros = 0

        if len(components) == 2:
            # count zeros in fraction part
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break
            # ensure first is not a fraction and second is a fraction
            # (the caller's scale/ordinal settings apply to both halves)
            num1 = extractnumber_it(components[0], short_scale, ordinals)
            num2 = extractnumber_it(components[1], short_scale, ordinals)
            if num1 is not None and num2 is not None \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2
            # sette e quaranta  sette e zero zero due
            elif num1 is not None and num2 is not None \
                    and num1 >= 1 and num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # 2 punto 5
    for separator in decimal_marker:
        zeros = 0
        # count zeros in fraction part
        components = text.split(separator)

        if len(components) == 2:
            sub_components = components[1].split(' ')
            for element in sub_components:
                if element == 'zero' or element == '0':
                    zeros += 1
                else:
                    break

            number = extractnumber_it(components[0], short_scale, ordinals)
            decimal = extractnumber_it(components[1], short_scale, ordinals)
            # No number on either side and no spoken zeros: the marker is
            # an ordinary word here, so do not report 0.0 as a found number
            if not number and not decimal and not zeros:
                continue
            # a missing side counts as 0, as int(False) did before
            number = int(number or 0)
            decimal = int(decimal or 0)
            return number + decimal / pow(10, len(str(decimal)) + zeros)

    all_words = text.split()
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(all_words):

        if not word:
            continue
        prev_word = all_words[idx - 1] if idx > 0 else ''
        next_word = all_words[idx + 1] if idx + 1 < len(all_words) else ''

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in STRING_NUM_ITA:
            val = STRING_NUM_ITA[word]

        # tre quarti  un quarto  trenta secondi
        if isFractional_it(word) and prev_val:
            if word[:-1] == 'second' and not ordinals:
                val = prev_val * 2
            else:
                val = prev_val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # mezza tazza
        if val is False:
            val = isFractional_it(word, short_scale=short_scale)

        # 2 quinti
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        if not val:
            val = extractnumber_long_it(word)

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like '2/3'
            all_pieces = word.split('/')
            if look_for_fractions(all_pieces):
                val = float(all_pieces[0]) / float(all_pieces[1])
        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            elif extractnumber_long_it(word) > 100 and \
                extractnumber_long_it(next_word) and \
                    next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    # add the parked groups one by one so that a "not found" False is
    # returned untouched when nothing was parked
    if val is not None:
        for addend in to_sum:
            val = val + addend
    return val
def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
    """
    Handle numbers not handled by the decimal or fraction functions. This is
    generally whole numbers. Note that phrases such as "one half" will be
    handled by this function, while "one and a half" are handled by the
    fraction function.

    The tokens are scanned left to right.  Consecutive number-like words are
    folded into one running value (summing "twenty two", multiplying
    "six hundred", scaling "2 fifths"); the scan stops at the first word
    that cannot extend the number collected so far.

    Args:
        tokens [_Token]: tokens to scan; only their ``word`` attribute is
            read here.
        short_scale boolean: forwarded to _initialize_number_data() and
            isFractional_en().
        ordinals boolean: when True, words found in the ordinal table are
            read as numbers and the "<number> <fraction>" look-ahead is
            disabled.

    Returns:
        int or float, [_Tokens]
        The value parsed, and tokens that it corresponds to.  The value is
        initialised to ``False`` and is returned unchanged when no token
        produced a number.

    """
    multiplies, string_num_ordinal, string_num_scale = \
        _initialize_number_data(short_scale)

    number_words = []  # _Token objects that make up the number so far
    val = False        # running value of the number being built
    prev_val = None    # running value as of the previous number word
    next_val = None    # fraction value taken from the look-ahead token
    to_sum = []        # finished groups, e.g. the "two million" part
    for idx, token in enumerate(tokens):
        # Value contributed by this token alone (as opposed to the running
        # ``val``); stays None if the token sets no value of its own.
        current_val = None

        # The previous iteration already consumed this token as the
        # fraction in "<number> <fraction>", so skip it.
        if next_val:
            next_val = None
            continue

        word = token.word
        if word in _ARTICLES or word in _NEGATIVES:
            number_words.append(token)
            continue

        prev_word = tokens[idx - 1].word if idx > 0 else ""
        next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else ""

        if word not in string_num_scale and \
                word not in _STRING_NUM_EN and \
                word not in _SUMS and \
                word not in multiplies and \
                not (ordinals and word in string_num_ordinal) and \
                not is_numeric(word) and \
                not isFractional_en(word, short_scale=short_scale) and \
                not look_for_fractions(word.split('/')):
            # Not a number word.  If real number words were collected we
            # are done; if only articles/negatives were, discard them and
            # keep looking.
            words_only = [t.word for t in number_words]
            ignorable = _ARTICLES | _NEGATIVES
            if number_words and not all(w in ignorable for w in words_only):
                break
            number_words = []
            continue
        elif word not in multiplies \
                and prev_word not in multiplies \
                and prev_word not in _SUMS \
                and not (ordinals and prev_word in string_num_ordinal) \
                and prev_word not in _NEGATIVES \
                and prev_word not in _ARTICLES:
            # Nothing links this word to the previous one: start over.
            number_words = [token]
        elif prev_word in _SUMS and word in _SUMS:
            # Two tens words in a row ("twenty thirty") cannot be combined.
            number_words = [token]
        else:
            number_words.append(token)

        # is this word already a number ?
        if is_numeric(word):
            if word.isdigit():  # doesn't work with decimals
                val = int(word)
            else:
                val = float(word)
            current_val = val

        # is this word the name of a number ?
        if word in _STRING_NUM_EN:
            val = _STRING_NUM_EN[word]
            current_val = val
        elif word in string_num_scale:
            val = string_num_scale[word]
            current_val = val
        elif ordinals and word in string_num_ordinal:
            val = string_num_ordinal[word]
            current_val = val

        # is the prev word an ordinal number and current word is one?
        # second one, third one
        # NOTE: compare by value; ``is 1`` depended on int caching, emits a
        # SyntaxWarning on modern Python and never matched a float 1.0.
        if ordinals and prev_word in string_num_ordinal and val == 1:
            val = prev_val

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in _SUMS and val and val < 10:
            val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = isFractional_en(word, short_scale=short_scale)
            current_val = val

        # 2 fifths
        if not ordinals:
            next_val = isFractional_en(next_word, short_scale=short_scale)
            if next_val:
                if not val:
                    val = 1
                val = val * next_val
                number_words.append(tokens[idx + 1])

        # is this a negative number?
        if val and prev_word and prev_word in _NEGATIVES:
            val = -val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
                current_val = val
        else:
            # ``current_val`` is still None when this token set no value of
            # its own (e.g. "twenty 2/3", where ``val`` carries over from
            # "twenty").  Guard the comparison: ``None >= 10`` raises
            # TypeError on Python 3.
            if prev_word in _SUMS and word not in _SUMS and \
                    current_val is not None and current_val >= 10:
                # Backtrack - we've got numbers we can't sum.
                number_words.pop()
                val = prev_val
                break
            prev_val = val

            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0

    if val is not None and to_sum:
        val += sum(to_sum)

    return val, number_words