def extractnumber_de(text, short_scale=True, ordinals=False):
    """
    Extract the first number found in a German text.

    Definite articles are dropped, then the words are scanned left to
    right.  A word counts as a number when is_numeric(),
    isFractional_de() or isOrdinal_de() accepts it, when it is a key of
    de_numbers, or when it is a numeric fraction such as "2/3".  A
    de_numbers entry directly followed by a fractional word is multiplied
    by that fraction.  If "und" follows the number (directly or one word
    later), the next number found is added to it.

    Indefinite articles cannot be suppressed in German:
    'ein Pferd' means 'one horse' and 'a horse'.

    Args:
        text (str): the string to search
        short_scale (bool): accepted but not used by this function
        ordinals (bool): accepted but not used by this function
    Returns:
        (int) or (float) or False: the extracted value; False when no
        number was found or the value is falsy (e.g. 0)
    """
    definite_articles = ["der", "die", "das", "des", "den", "dem"]
    tokens = [tok for tok in text.split() if tok not in definite_articles]

    after_und = False    # True once "<number> ... und" has been seen
    before_und = False   # the number that preceded "und"
    val = False
    pos = 0
    while pos < len(tokens):
        word = tokens[pos]
        if is_numeric(word):
            # word.isdigit() would not work with decimals
            val = float(word)
        elif isFractional_de(word):
            val = isFractional_de(word)
        elif isOrdinal_de(word):
            val = isOrdinal_de(word)
        elif word in de_numbers:
            val = de_numbers[word]
            following = tokens[pos + 1] if pos < len(tokens) - 1 else ""
            factor = isFractional_de(following)
            if factor:
                # "<number> <fraction>": multiply and consume the fraction
                val = val * factor
                tokens[pos + 1] = ""

        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif after_und:
                # nothing to add after "und": keep the first number
                val = before_und
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if after_und:
            tokens[pos - 1] = ''  # remove the word before the addend
            val += before_und
        else:
            # is there an "und" one or two words ahead?
            skip = None
            if pos + 1 < len(tokens) and tokens[pos + 1] == 'und':
                skip = 2
            elif pos + 2 < len(tokens) and tokens[pos + 2] == 'und':
                skip = 3
            if skip is not None:
                after_und = True
                before_und = val
                val = False
                pos += skip
                continue

        break

    return val or False
def extractnumber_fr(text):
    """Takes in a string and extracts a number.

    The text is first passed through normalize_fr(text, False) (articles
    are kept so ordinals can be told apart from fractionals) and then
    scanned word by word.  Each number found replaces the previous result,
    unless it was preceded by "et", "plus" or "+", in which case it is
    added to it.  "<number> virgule [zéro ...] <digits>" is read as a
    decimal number.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (str): The number extracted, or the result of
        normalize_fr(<normalized text>, True) when no non-zero number was
        found.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = aWords[count + 1] if count < (len(aWords) - 1) else ""
        wordPrev = aWords[count - 1] if count > 0 else ""

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # Only step over the word here when none of the branches above
        # consumed it.  Testing "not val" instead would advance twice for a
        # literal "0" and skip the word that follows it.
        if val is None:
            count += 1

        if not val:
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            zeros = 0
            for decimalWord in newWords:
                if decimalWord == "zéro" or decimalWord == "0":
                    zeros += 1
                else:
                    break
            # extract the number after the zeros; there may be none, since
            # a comma is also a punctuation sign and can end the sentence
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotString = zeros * "0" + newWords[zeros]
                try:
                    decimalVal = float(str(val or 0) + "." + afterDotString)
                except ValueError:
                    # the part before the comma is not a plain integer
                    # (e.g. already fractional): not a decimal number
                    decimalVal = None
                if decimalVal is not None:
                    val = decimalVal
                    # continue after the digits that were just consumed
                    count = count + zeros + 2

        if val:
            if add and result is not None:
                result += val
            else:
                # first number, or "et"/"plus" with nothing to add to
                result = val
            add = False

    if not result:
        return normalize_fr(text, True)

    return result
def extract_datetime_de(string, currentDate, default_time):
    """
    Extract a date and/or time from a German sentence.

    The words are scanned twice: first for date expressions ("heute",
    "morgen", weekdays, month names, "5 tage", "nächste woche", ...), then
    for time expressions ("15:30", "5 uhr", "mittag", ...).  Every word
    that was understood is blanked so the remaining text can be returned.

    Args:
        string (str): the sentence to parse
        currentDate (datetime): the reference date for relative expressions
        default_time: object with ``hour`` and ``minute`` attributes that
            is applied when the sentence contains no time; may be falsy
    Returns:
        None if ``string`` is empty, ``currentDate`` is falsy or nothing
        date-like was found; otherwise the list
        ``[extractedDate, resultStr]`` where ``resultStr`` is the cleaned
        sentence without the consumed words.
    """

    def clean_string(s):
        """
        Lower-case the sentence, strip punctuation and a few prepositions /
        articles, and replace ordinals by their numeric string.

        'am' is a preposition, so cannot currently be used
        for 12 hour date format
        """
        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' der ', ' ').replace(' den ', ' ') \
            .replace(' an ', ' ').replace(' am ', ' ') \
            .replace(' auf ', ' ').replace(' um ', ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            if isOrdinal_de(word) is not False:
                wordList[idx] = str(isOrdinal_de(word))

        return wordList

    def date_found():
        """Tell whether any date or time information was extracted."""
        return found or \
            (
                datestr != "" or timeStr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is True or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    def qualifier_meridiem(first, second, hourWord):
        """
        Map a time-of-day qualifier to (words consumed, "am"/"pm"/None).

        ``first``/``second`` are the candidate qualifier words and
        ``hourWord`` is the hour, only read for "nachts".
        """
        if first[:10] == "nachmittag":
            return 1, "pm"
        if first == "am" and second == "nachmittag":
            return 2, "pm"
        if first[:5] == "abend":
            return 1, "pm"
        if first == "am" and second == "abend":
            return 2, "pm"
        if first[:7] == "morgens":
            return 1, "am"
        if first == "am" and second == "morgen":
            return 2, "am"
        if first == "nachts":
            return 1, ("pm" if 8 <= int(hourWord) <= 12 else "am")
        return 0, None

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags',
                          'nachmittag', 'nachmittags', 'abend', 'abends',
                          'nachts']
    markers = ['in', 'am', 'gegen', 'bis', 'für']
    days = ['montag', 'dienstag', 'mittwoch',
            'donnerstag', 'freitag', 'samstag', 'sonntag']
    # NOTE(review): 'october' / 'oct' are the English spellings, so German
    # 'oktober' / 'okt' are not matched - confirm whether this is intended.
    months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
              'juli', 'august', 'september', 'october', 'november',
              'dezember']
    monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug',
                   'sept', 'oct', 'nov', 'dez']

    validFollowups = days + months + monthsShort
    validFollowups.extend([
        "heute", "morgen",
        "nächste", "nächster", "nächstes", "nächsten", "nächstem",
        "letzte", "letzter", "letztes", "letzten", "letztem",
        "jetzt",
    ])

    words = clean_string(string)

    # ---- first pass: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # crude stemming; not done in clean_string because the stemmed
        # form must not be saved back to words
        if word != 'morgen' and word != 'übermorgen':
            if word[-2:] == "en":
                word = word[:-2]  # remove en
        if word != 'heute':
            if word[-1:] == "e":
                word = word[:-1]  # remove plural for most nouns

        # wordPrev may be "" (first word); slicing instead of indexing
        # keeps the digit test from raising IndexError
        prevIsNumber = wordPrev[:1].isdigit()

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "heute" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "morgen" and not fromFlag and wordPrev != "am" and \
                wordPrev not in days:
            # morgen means tomorrow if not "am Morgen" and not
            # [day of the week] morgen
            dayOffset = 1
            used += 1
        elif word == "übermorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "tag" or word == "tage":
            if prevIsNumber:
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "woch" and not fromFlag:
            if prevIsNumber:
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "monat" and not fromFlag:
            if prevIsNumber:
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "jahr" and not fromFlag:
            if prevIsNumber:
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev[:6] == "nächst":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev[:5] == "letzt":
                # "letztes jahr"; must test "letzt", a second "nächst"
                # test here could never be reached
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # morgen means morning if preceded by the day of the week
                words[idx + 1] = "früh"
            if wordPrev[:6] == "nächst":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev[:5] == "letzt":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # NOTE(review): "and" binds tighter than "or", so fromFlag only
        # guards the short month names; the parentheses just make the
        # existing evaluation order explicit - confirm the intent.
        elif word in months or (word in monthsShort and not fromFlag):
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            prevIsOf = wordPrev == "of" and wordPrevPrev[:1].isdigit()
            if prevIsNumber or prevIsOf:
                if prevIsOf:
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext[:1].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext[:1].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext[:1].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "von" or word == "nach" or word == "ab") and \
                wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and wordPrev != "am" and \
                    wordPrev not in days:
                # morgen means tomorrow if not "am Morgen" and not
                # [day of the week] morgen
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext[:6] == "nächst":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext[:5] == "letzt":
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 > 0 and words[start - 1].startswith("diese"):
                start -= 1
                used += 1

            # blank the consumed words; the range is clipped so that it
            # neither runs past the end nor wraps around to the last word
            for i in range(0, used):
                if 0 <= i + start < len(words):
                    words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ------------------------------------------
    timeStr = ""
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word[:6] == "mittag":
            hrAbs = 12
            used += 1
        elif word[:11] == "mitternacht":
            hrAbs = 0
            used += 1
        elif word == "morgens" or (
                wordPrev == "am" and word == "morgen") or word == "früh":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word[:10] == "nachmittag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word[:5] == "abend":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "stunde" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halb":
                minOffset = 30
            elif wordPrev == "viertel":
                minOffset = 15
            elif wordPrev == "dreiviertel":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            # -1 marks "relative time only, no absolute time of day"
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                # stage 0: hour digits, stage 1: minute digits,
                # stage 2: trailing text.  Stage 2 is only acted upon on
                # the character after the one that triggered it.
                stage = 0
                for i in range(len(word)):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "abends":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "am" and wordNextNext == "morgen":
                        remainder = "am"
                        used += 2
                    elif wordNext == "am" and wordNextNext == "nachmittag":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "am" and wordNextNext == "abend":
                        remainder = "pm"
                        used += 2
                    elif wordNext == "morgens":
                        remainder = "am"
                        used += 1
                    elif wordNext == "nachmittags":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "abends":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "heute" and wordNextNext == "morgen":
                        remainder = "am"
                        used = 2
                    elif wordNext == "heute" and wordNextNext == "nachmittag":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "heute" and wordNextNext == "abend":
                        remainder = "pm"
                        used = 2
                    elif wordNext == "nachts":
                        # strHH is a digit string; compare numerically
                        if int(strHH) > 4:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 1
                    elif timeQualifier in ("abends", "nachmittags") and \
                            int(strHH) <= 12:
                        # a qualifier seen earlier in the sentence moves
                        # the hour to the afternoon; the shared am/pm
                        # conversion below adds the 12 hours
                        remainder = "pm"
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char

                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                if remainder in ("pm", "p.m.") or wordNext in ("pm", "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif remainder in ("am", "a.m.") or \
                        wordNext in ("am", "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    # NOTE(review): wordNext is the unstemmed word here, so
                    # "stunden" / "minuten" / "sekunden" would not equal
                    # these stems - confirm against the callers' input.
                    if wordNext == "stund" and int(word) < 100:
                        # "in 3 hours"
                        hrOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "minut":
                        # "in 10 minutes"
                        minOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "sekund":
                        # in 5 seconds
                        secOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "uhr":
                        strHH = word
                        used += 1
                        isTime = True
                        if wordNextNext == timeQualifier:
                            strMM = ""
                            consumed, meridiem = qualifier_meridiem(
                                wordNextNext, wordNextNextNext, word)
                            used += consumed
                            if meridiem:
                                remainder = meridiem
                        elif is_numeric(wordNextNext):
                            strMM = wordNextNext
                            used += 1
                            if wordNextNextNext == timeQualifier:
                                consumed, meridiem = qualifier_meridiem(
                                    wordNextNextNext, wordNextNextNextNext,
                                    word)
                                used += consumed
                                if meridiem:
                                    remainder = meridiem
                    elif wordNext == timeQualifier:
                        strHH = word
                        strMM = 0
                        isTime = True
                        consumed, meridiem = qualifier_meridiem(
                            wordNext, wordNextNext, word)
                        used += consumed
                        if meridiem:
                            remainder = meridiem

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # remove parsed words from the sentence (clipped to the list,
            # the count may exceed the words that are actually left)
            for i in range(used):
                if idx + i < len(words):
                    words[idx + i] = ""

            if wordPrev == "Uhr":
                words[words.index(wordPrev)] = ""

            if wordPrev == "früh":
                hrOffset = -1
                words[idx - 1] = ""
                idx -= 1
            elif wordPrev == "spät":
                hrOffset = 1
                words[idx - 1] = ""
                idx -= 1
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date (the helper has to be called; testing the
    # function object itself is always true)
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # ---- perform date manipulation -------------------------------------
    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        # strptime is given English month names, so translate first
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july',
                          'aug', 'sept', 'oct', 'nov', 'dec']
        for idx, en_month in enumerate(en_months):
            datestr = datestr.replace(months[idx], en_month)
        for idx, en_month in enumerate(en_monthsShort):
            datestr = datestr.replace(monthsShort[idx], en_month)

        try:
            temp = datetime.strptime(datestr, "%B %d")
        except ValueError:
            # try again, allowing the year that was appended to datestr
            temp = datetime.strptime(datestr, "%B %d %Y")

        if not hasYear:
            # copy tzinfo too so the comparison below also works when
            # currentDate is timezone-aware
            temp = temp.replace(year=extractedDate.year,
                                tzinfo=extractedDate.tzinfo)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
            else:
                # the date has already passed this year: use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    # timeStr is never assigned a non-empty value in this function, so
    # this branch does not run; kept unchanged
    if timeStr != "":
        temp = datetime(timeStr)
        extractedDate = extractedDate.replace(hour=temp.strftime("%H"),
                                              minute=temp.strftime("%M"),
                                              second=temp.strftime("%S"))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time of day that has already passed means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "und" that only joined two consumed expressions; the first
    # and last word have no neighbour on one side and are left alone
    for idx in range(1, len(words) - 1):
        if words[idx] == "und" and words[idx - 1] == "" \
                and words[idx + 1] == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]
def extractnumber_it(text, short_scale=False, ordinals=False):
    """
    Extract a number from an Italian text, handling pronunciations in
    long scale and short scale.

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Note that the scale names (and, when ``ordinals`` is set, the ordinal
    names) are written into the module-level STRING_NUM_ITA table on every
    call.

    Args:
        text (str): the string to search
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no
        number was found
    """

    def leading_zeros(fragment):
        """Count the 'zero' / '0' words at the start of ``fragment``."""
        count = 0
        for element in fragment.split(' '):
            if element == 'zero' or element == '0':
                count += 1
            else:
                break
        return count

    # first, second... are registered as number names on request
    if ordinals:
        if short_scale:
            ordinal_names = SHORT_ORDINAL_STRING_IT
        else:
            ordinal_names = LONG_ORDINAL_STRING_IT
        for num in ordinal_names:
            STRING_NUM_ITA[ordinal_names[num]] = num

    # negate next number (-2 = 0 - 2); 'negativo' is unusual in Italian
    negatives = ['meno']

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ['decina', 'decine', 'dozzina', 'dozzine',
                  'centinaia', 'centinaio', 'migliaia', 'migliaio', 'mila']

    scale_names = SHORT_SCALE_IT if short_scale else LONG_SCALE_IT
    for num in scale_names:
        scale_name = scale_names[num]
        STRING_NUM_ITA[scale_name] = num
        multiplies.append(scale_name)

    # "2 e 3/4" and similar: split the sentence, parse both halves
    # separately and sum them ( 2 and a half = 2 + 0.5 )
    for separator in (' e ',):
        components = text.split(separator)
        if len(components) != 2:
            continue
        zeros = leading_zeros(components[1])
        # ensure first is not a fraction and second is a fraction
        num1 = extractnumber_it(components[0])
        num2 = extractnumber_it(components[1])
        if num1 is not None and num2 is not None and num1 >= 1:
            if 0 < num2 < 1:
                return num1 + num2
            # sette e quaranta, sette e zero zero due
            if num2 > 1:
                return num1 + num2 / pow(10, len(str(num2)) + zeros)

    # "2 punto 5": decimal marker ( 1 point 5 = 1 + 0.5 )
    for separator in (' punto ', ' virgola '):
        components = text.split(separator)
        if len(components) != 2:
            continue
        zeros = leading_zeros(components[1])
        number = int(extractnumber_it(components[0]))
        decimal = int(extractnumber_it(components[1]))
        if number is not None and decimal is not None \
                and '.' not in str(decimal):
            return number + decimal / pow(10, len(str(decimal)) + zeros)

    tokens = text.split()
    value = False
    previous_value = None
    partial_sums = []
    for position, word in enumerate(tokens):
        if not word:
            continue
        prev_word = tokens[position - 1] if position > 0 else ''
        if position + 1 < len(tokens):
            next_word = tokens[position + 1]
        else:
            next_word = ''

        # is this word already a number ?
        if is_numeric(word):
            value = float(word)

        # is this word the name of a number ?
        if word in STRING_NUM_ITA:
            value = STRING_NUM_ITA[word]

        # tre quarti, un quarto, trenta secondi
        if isFractional_it(word) and previous_value:
            if word[:-1] == 'second' and not ordinals:
                value = previous_value * 2
            else:
                value = previous_value

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not previous_value:
                previous_value = 1
            value = previous_value * value

        # is this a spoken fraction?
        # mezza tazza
        if value is False:
            value = isFractional_it(word, short_scale=short_scale)

        # 2 quinti
        if not ordinals:
            next_value = isFractional_it(next_word, short_scale=short_scale)
            if next_value:
                if not value:
                    value = 1
                value = value * next_value

        # is this a negative number?
        if value and prev_word in negatives:
            value = 0 - value

        if not value:
            value = extractnumber_long_it(word)

        if not value:
            # let's make sure it isn't a fraction like '2/3'
            pieces = word.split('/')
            if look_for_fractions(pieces):
                value = float(pieces[0]) / float(pieces[1])
        else:
            previous_value = value
            # handle long numbers: the finished group is put aside and
            # summed at the end
            # six hundred sixty six
            # two million five hundred thousand
            group_finished = (
                (word in multiplies and next_word not in multiplies) or
                (extractnumber_long_it(word) > 100 and
                 extractnumber_long_it(next_word) and
                 next_word not in multiplies)
            )
            if group_finished:
                partial_sums.append(value)
                value = 0
                previous_value = 0

    if value is not None:
        for addend in partial_sums:
            value = value + addend
    return value
def extractnumber_pt(text, short_scale=True, ordinals=False):
    """
    Extract a number from a Portuguese text.

    The lower-cased words are scanned left to right.  Number words
    (_NUMBERS_PT), digits, numeric strings, fractional words and "a/b"
    fractions are accumulated into the result; "<n> avos" divides,
    "<number> e <number>" adds the part after "e", and a decimal marker
    ("ponto", "vírgula", ...) reads the following words as decimals.

    Args:
        text (str): the string to search
        short_scale (bool): accepted but not used by this function
        ordinals (bool): accepted but not used by this function
    Returns:
        (int) or (float) or False: The value of the extracted number, or
        False when no non-zero number was found
    """

    def leading_zeros(wordList):
        """Count the "zero" / "0" words at the start of ``wordList``."""
        zeros = 0
        for item in wordList:
            if item == "zero" or item == "0":
                zeros += 1
            else:
                break
        return zeros

    ands = ["e"]
    decimals = ["ponto", "virgula", "vírgula", ".", ","]

    text = text.lower()
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = \
            aWords[count + 2] if count + 2 < len(aWords) else None

        # is current word a number?
        if word in _NUMBERS_PT:
            val = _NUMBERS_PT[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions ("<n> avos" divides what was found so far)
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                zeros = 0
                if result < afterAndVal or result < 20:
                    # the part after "e" is read as decimals: scale it
                    # down to at most 1 and honour leading zeros
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    zeros = leading_zeros(newWords)
                for _ in range(zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            if result is None:
                # no integer part yet, so this is not a decimal number;
                # keep scanning (same as the "e" case above)
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterDotVal = extractnumber_pt(" ".join(newWords))
            # the recursive call returns False when nothing (or only
            # zeros) follows the marker; use just the zeros in that case
            afterDotString = leading_zeros(newWords) * "0"
            if afterDotVal:
                afterDotString += str(afterDotVal)
            try:
                result = float(str(result) + "." + afterDotString)
            except ValueError:
                # one side is itself fractional, so the pieces do not
                # form a decimal literal: keep the integer part as is
                pass
            break
        count += 1

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result or False
def extractnumber_sv(text):
    """
    Extract the first number found in a Swedish text.

    The words are scanned left to right.  A word counts as a number when
    is_numeric() accepts it, when it is one of the ordinals "första" to
    "sjätte", when is_fractional_sv() accepts it, when it is one of the
    cardinals "en"/"ett" to "tio", or when it is a numeric fraction such
    as "2/3".  A cardinal directly followed by a fractional word is
    multiplied by that fraction.  If "och" follows the number (directly
    or one word later), the next number found is added to it.

    Args:
        text (str): the string to search
    Returns:
        (int) or (float) or False: The value of the extracted number;
        False when no number was found or the value is falsy (e.g. 0)
    """
    ordinal_values = {
        "första": 1, "andra": 2, "tredje": 3,
        "fjärde": 4, "femte": 5, "sjätte": 6,
    }
    cardinal_values = {
        "en": 1, "ett": 1, "två": 2, "tre": 3, "fyra": 4, "fem": 5,
        "sex": 6, "sju": 7, "åtta": 8, "nio": 9, "tio": 10,
    }

    tokens = text.split()
    after_och = False    # True once "<number> ... och" has been seen
    before_och = False   # the number that preceded "och"
    val = False
    pos = 0
    while pos < len(tokens):
        word = tokens[pos]
        if is_numeric(word):
            val = float(word)
        elif word in ordinal_values:
            val = ordinal_values[word]
        elif is_fractional_sv(word):
            val = is_fractional_sv(word)
        else:
            if word in cardinal_values:
                val = cardinal_values[word]
            if val:
                following = tokens[pos + 1] if pos < len(tokens) - 1 else ""
                factor = is_fractional_sv(following)
                if factor:
                    # "<number> <fraction>": multiply, consume the fraction
                    val = val * factor
                    tokens[pos + 1] = ""

        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif after_och:
                # nothing to add after "och": keep the first number
                val = before_och
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if after_och:
            tokens[pos - 1] = ''  # remove the word before the addend
            val += before_och
        else:
            # is there an "och" one or two words ahead?
            skip = None
            if pos + 1 < len(tokens) and tokens[pos + 1] == 'och':
                skip = 2
            elif pos + 2 < len(tokens) and tokens[pos + 2] == 'och':
                skip = 3
            if skip is not None:
                after_och = True
                before_och = val
                val = False
                pos += skip
                continue

        break

    return val or False
def extract_number_ca(text, short_scale=True, ordinals=False):
    """
    Extract a number from a Catalan text.

    The lower-cased words are scanned left to right.  Number words
    (_NUMBERS_CA), hyphenated compounds ("trenta-cinc", "vint-i-dues",
    "quatre-centes"), digits, numeric strings, fractional words and "a/b"
    fractions are accumulated into the result; "<n> ens" divides,
    "<number> i <number>" adds the part after "i", and a decimal marker
    ("coma", "amb", "punt", ...) reads the following words as decimals.

    Args:
        text (str): the string to search
        short_scale (bool): not used; present for API compatibility
        ordinals (bool): not used; present for API compatibility
    Returns:
        (int) or (float) or False: The value of the extracted number, or
        False when no non-zero number was found
    """

    def leading_zeros(wordList):
        """Count the "zero" / "0" words at the start of ``wordList``."""
        zeros = 0
        for item in wordList:
            if item == "zero" or item == "0":
                zeros += 1
            else:
                break
        return zeros

    ands = ["i"]
    decimals = ["coma", "amb", "punt", ".", ","]

    text = text.lower()
    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = \
            aWords[count + 2] if count + 2 < len(aWords) else None

        # is current word a number?
        if word in _NUMBERS_CA:
            val = _NUMBERS_CA[word]
        elif '-' in word:
            wordparts = word.split('-')
            # trenta-cinc > 35
            if len(wordparts) == 2 and (
                    wordparts[0] in _TENS_CA and
                    wordparts[1] in _AFTER_TENS_CA):
                val = _TENS_CA[wordparts[0]] + _AFTER_TENS_CA[wordparts[1]]
            # vint-i-dues > 22
            elif len(wordparts) == 3 and wordparts[1] == 'i' and (
                    wordparts[0] in _TENS_CA and
                    wordparts[2] in _AFTER_TENS_CA):
                val = _TENS_CA[wordparts[0]] + _AFTER_TENS_CA[wordparts[2]]
            # quatre-centes > 400
            elif len(wordparts) == 2 and (
                    wordparts[0] in _BEFORE_HUNDREDS_CA and
                    wordparts[1] in _HUNDREDS_CA):
                val = _BEFORE_HUNDREDS_CA[wordparts[0]] * 100
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif is_fractional_ca(word):
            if not result:
                result = 1
            result = result * is_fractional_ca(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions ("<n> ens" divides what was found so far)
            # TODO: caution, review use of "ens" word
            if next_word != "ens":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extract_number_ca(" ".join(newWords))
            if afterAndVal:
                zeros = 0
                if result < afterAndVal or result < 20:
                    # the part after "i" is read as decimals: scale it
                    # down to at most 1 and honour leading zeros
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    zeros = leading_zeros(newWords)
                for _ in range(zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extract_number_ca(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            if result is None:
                # no integer part yet ("cafè amb llet"), so this is not a
                # decimal number; keep scanning (same as the "i" case)
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterDotVal = extract_number_ca(" ".join(newWords))
            # the recursive call returns False when nothing (or only
            # zeros) follows the marker; use just the zeros in that case
            afterDotString = leading_zeros(newWords) * "0"
            if afterDotVal:
                afterDotString += str(afterDotVal)
            try:
                result = float(str(result) + "." + afterDotString)
            except ValueError:
                # one side is itself fractional, so the pieces do not
                # form a decimal literal: keep the integer part as is
                pass
            break
        count += 1

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result or False