def _normalize(self):
    """
    Rebuild ``self._dictionary`` with every key run through
    ``normalize_unicode``.

    A key is considered conflicting when its normalized form differs from
    the key itself and that normalized form is already a key of the current
    dictionary. Conflicting keys are left out of the first pass and are only
    copied over (under their normalized form) when they are listed in
    ``self.info['skip']`` or ``self.info['pertain']``.

    :return: None; ``self._dictionary`` is replaced by the rebuilt dict.
    """
    source = self._dictionary
    rebuilt = {}
    clashes = []

    # First pass: carry over every entry whose normalized key does not
    # collide with a key that already exists in the source dictionary.
    for original_key, entry in source.items():
        plain_key = normalize_unicode(original_key)
        collides = original_key != plain_key and plain_key in source
        if collides:
            clashes.append(original_key)
            continue
        rebuilt[plain_key] = entry

    # Second pass: a colliding key overrides the normalized entry only when
    # it is a "skip" or "pertain" word.
    for original_key in clashes:
        plain_key = normalize_unicode(original_key)
        protected = self.info.get('skip', []) + self.info.get('pertain', [])
        if original_key in protected:
            rebuilt[plain_key] = source[original_key]

    self._dictionary = rebuilt
def _generate_simplifications(self, normalize=False):
    """
    Build the list of simplification rules from ``self.info``.

    Each item of ``self.info['simplifications']`` is a mapping of which only
    the first ``(key, value)`` pair is used. Integer values are converted to
    their string form; other values are kept as they are, or normalized when
    ``normalize`` is true.

    :param normalize: If True, keys and non-integer values are passed
        through ``normalize_unicode``.
    :type normalize: bool
    :return: a list of single-item dicts, one per simplification.
    """
    def _convert(text):
        # Normalization is only applied on request.
        return normalize_unicode(text) if normalize else text

    result = []
    for entry in self.info.get('simplifications', []):
        pattern, replacement = list(entry.items())[0]
        pattern = _convert(pattern)
        if isinstance(replacement, int):
            replacement = str(replacement)
        else:
            replacement = _convert(replacement)
        result.append({pattern: replacement})
    return result
def get_date_data(self, date_string, date_formats=None):
    """
    Parse a string holding a date and/or time written in one of the
    recognizable localized formats. Multiple languages and timezones are
    supported.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str|unicode
    :param date_formats:
        A list of format strings using directives as given
        `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
        The parser applies formats one by one, taking into account the detected languages.
    :type date_formats: list

    :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
        {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}.
        When none of the applicable languages yields a result,
        ``{'date_obj': None, 'period': 'day'}`` is returned.

    :raises: ValueError - Unknown Language
    :raises: TypeError - ``date_string`` does not provide ``strip()``

    .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

    *Period* represents the granularity of date parsed from the given string.

    In the example below, since no day information is present, the day is assumed to be current
    day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
    Hence, the level of precision is ``month``:

        >>> DateDataParser().get_date_data(u'March 2015')
        {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

    Similarly, for date strings with no day and month information present, level of precision
    is ``year`` and day ``16`` and month ``6`` are from *current_date*.

        >>> DateDataParser().get_date_data(u'2014')
        {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

    Dates with time zone indications or UTC offsets are returned in UTC time unless
    specified using `Settings`_.

        >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
        {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

    """
    # Anything without ``strip`` is rejected up front with a TypeError.
    try:
        cleaned = date_string.strip()
    except AttributeError:
        raise TypeError('Input type must be str or unicode')

    active_settings = self._settings
    if active_settings.NORMALIZE:
        cleaned = normalize_unicode(cleaned)
    cleaned = sanitize_date(cleaned)

    candidate_languages = self.language_detector.iterate_applicable_languages(
        cleaned, modify=True, settings=active_settings)
    for language in candidate_languages:
        result = _DateLanguageParser.parse(
            language, cleaned, date_formats, settings=active_settings)
        # The first language producing a truthy result wins.
        if result:
            return result

    # No language produced a result.
    return {'date_obj': None, 'period': 'day'}
def test_dates_parsing_with_normalization(self, date_string, expected):
    # Given: no local timezone offset and a parser that normalizes its
    # input, with relative dates anchored to 2012-11-13.
    self.given_local_tz_offset(0)
    parser_settings = {"NORMALIZE": True, "RELATIVE_BASE": datetime(2012, 11, 13)}
    self.given_parser(settings=parser_settings)

    # When: the already-normalized date string is parsed.
    normalized_input = normalize_unicode(date_string)
    self.when_date_is_parsed(normalized_input)

    # Then: the date parser handled it with day precision and the exact
    # expected datetime.
    self.then_date_was_parsed_by_date_parser()
    self.then_period_is("day")
    self.then_date_obj_exactly_is(expected)
def is_applicable(self, date_string, strip_timezone=False, settings=None):
    """
    Tell whether this locale can be used to translate the date string.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param strip_timezone:
        If True, timezone is stripped from date string.
    :type strip_timezone: bool

    :param settings:
        Object whose ``NORMALIZE`` attribute is read. Despite the ``None``
        default it is dereferenced unconditionally, so callers must pass one.

    :return: the outcome of the locale dictionary's token validity check
        for the tokens of the prepared date string.
    """
    candidate = date_string
    if strip_timezone:
        candidate, _ = pop_tz_offset_from_string(candidate, as_offset=False)

    # Preparation order matters: numerals first, then optional unicode
    # normalization, then simplification.
    candidate = self._translate_numerals(candidate)
    if settings.NORMALIZE:
        candidate = normalize_unicode(candidate)
    candidate = self._simplify(candidate, settings=settings)

    vocabulary = self._get_dictionary(settings)
    return vocabulary.are_tokens_valid(vocabulary.split(candidate))
def when_all_languages_are_detected(self, date_strings, modify=False):
    """
    Run language detection for every date string and remember the outcome.

    The parser is queried once per date string; the result of each query
    replaces the previous one, so ``self.detected_languages`` ends up
    holding the languages reported for the last date string.

    :param date_strings: An iterable of date strings (not a single string).
    :param modify: Forwarded to ``iterate_applicable_languages``.
    :type modify: bool
    """
    assert not isinstance(date_strings, six.string_types)

    # Start from an empty result so that an empty ``date_strings`` stores
    # ``[]`` instead of failing with UnboundLocalError on the assignment
    # after the loop.
    detected_languages = []
    for date_string in date_strings:
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        detected_languages = list(self.parser.iterate_applicable_languages(
            date_string, modify=modify, settings=settings))

    self.detected_languages = detected_languages
def test_dates_parsing_with_normalization(self, date_string, expected):
    # Arrange: zero local offset, normalizing parser with a fixed base date
    # for relative expressions.
    self.given_local_tz_offset(0)
    relative_base = datetime(2012, 11, 13)
    self.given_parser(settings={'NORMALIZE': True, 'RELATIVE_BASE': relative_base})

    # Act: feed the parser the normalized form of the input.
    self.when_date_is_parsed(normalize_unicode(date_string))

    # Assert: parsed by the date parser, day granularity, exact match.
    self.then_date_was_parsed_by_date_parser()
    self.then_period_is('day')
    self.then_date_obj_exactly_is(expected)
def test_dates_parsing_with_normalization(self, date_string, expected):
    # Given: "now" pinned to 2012-11-13, no local timezone offset and a
    # parser with unicode normalization switched on.
    frozen_now = datetime(2012, 11, 13)  # Tuesday
    self.given_utcnow(frozen_now)
    self.given_local_tz_offset(0)
    self.given_parser(settings={'NORMALIZE': True})

    # When: the normalized date string is parsed.
    normalized_input = normalize_unicode(date_string)
    self.when_date_is_parsed(normalized_input)

    # Then: the date parser produced exactly the expected day-level result.
    self.then_date_was_parsed_by_date_parser()
    self.then_period_is('day')
    self.then_date_obj_exactly_is(expected)
def _simplify_split_align(self, original, settings):
    """
    Split ``original`` and its simplified form into word tokens and pad the
    shorter token list with empty strings so both lists line up.

    :param original: The date string to split.
    :param settings: Forwarded to ``_word_split`` and ``_simplify``.
    :return: a tuple ``(original_tokens, simplified_tokens)`` of two lists
        of equal length.
    """
    # TODO: Switch to new split method.
    original_tokens = self._word_split(original, settings=settings)
    # The simplified tokens come from the normalized, simplified string.
    simplified_tokens = self._word_split(self._simplify(
        normalize_unicode(original), settings=settings), settings=settings)

    # Same number of tokens: nothing to align.
    if len(original_tokens) == len(simplified_tokens):
        return original_tokens, simplified_tokens

    elif len(original_tokens) < len(simplified_tokens):
        # Simplification produced more tokens: pad ``original_tokens``.
        # ``add_empty`` remembers that the previous position mismatched; the
        # first mismatch is tolerated, a directly following one inserts ''.
        add_empty = False
        for i, token in enumerate(simplified_tokens):
            if i < len(original_tokens):
                if token == normalize_unicode(original_tokens[i].lower()):
                    add_empty = False
                else:
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        original_tokens.insert(i, '')
            else:
                # Past the end of the original tokens: always pad.
                original_tokens.insert(i, '')
    else:
        # Mirror image of the branch above: the original has more tokens,
        # so ``simplified_tokens`` receives the padding.
        add_empty = False
        for i, token in enumerate(original_tokens):
            if i < len(simplified_tokens):
                if normalize_unicode(
                        token.lower()) == simplified_tokens[i]:
                    add_empty = False
                else:
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        simplified_tokens.insert(i, '')
            else:
                # Past the end of the simplified tokens: always pad.
                simplified_tokens.insert(i, '')

    # Padding may overshoot; drop empty strings (first occurrence each time)
    # from the longer list until the lengths agree.
    # NOTE(review): ``list.remove('')`` raises ValueError if the longer list
    # holds no '' at that point -- confirm this cannot happen in practice.
    while len(original_tokens) != len(simplified_tokens):
        if len(original_tokens) > len(simplified_tokens):
            original_tokens.remove('')
        else:
            simplified_tokens.remove('')

    return original_tokens, simplified_tokens
def test_normalized_relative_dates(self, date_string, ago, period):
    # Given: a normalizing parser and the normalized form of the input.
    normalized_input = normalize_unicode(date_string)
    self.given_parser(settings={'NORMALIZE': True})
    self.given_date_string(normalized_input)

    # When: the stored date string is parsed.
    self.when_date_is_parsed()

    # Then: no error, handled by the freshness parser, and the result lies
    # exactly ``ago`` in the past with the expected period.
    self.then_error_was_not_raised()
    self.then_date_was_parsed_by_freshness_parser()
    self.then_date_obj_is_exactly_this_time_ago(ago)
    self.then_period_is(period)
def is_applicable(self, date_string, strip_timezone=False, settings=None):
    """
    Tell whether this language can be used for the given date string.

    :param date_string: The date string to check.
    :param strip_timezone: If True, the timezone offset is popped from the
        date string before it is examined.
    :type strip_timezone: bool
    :param settings: Object whose ``NORMALIZE`` attribute is read. Despite
        the ``None`` default it is dereferenced unconditionally, so callers
        must pass one.
    :return: True when the tokens consist of digits only; otherwise the
        result of the dictionary check on the tokens.
    """
    text = date_string
    if settings.NORMALIZE:
        text = normalize_unicode(text)
    if strip_timezone:
        text, _ = pop_tz_offset_from_string(text, as_offset=False)

    simplified = self._simplify(text, settings=settings)
    tokens = self._split(simplified, keep_formatting=False, settings=settings)

    # Only consult the dictionary when the tokens are not purely numeric.
    if not self._is_date_consists_of_digits_only(tokens):
        return self._are_all_words_in_the_dictionary(tokens, settings)
    return True
def translate(self, date_string, keep_formatting=False, settings=None):
    """
    Translate the words of the date string using this language's dictionary.

    :param date_string: The date string to translate.
    :param keep_formatting: If True, the formatting is kept when splitting
        and the translated tokens are joined without a separator; otherwise
        they are joined with a single space.
    :type keep_formatting: bool
    :param settings: Object whose ``NORMALIZE`` attribute is read. Despite
        the ``None`` default it is dereferenced unconditionally, so callers
        must pass one.
    :return: the joined translated tokens.
    """
    text = date_string
    if settings.NORMALIZE:
        text = normalize_unicode(text)
    text = self._simplify(text, settings=settings)

    tokens = self._split(text, keep_formatting, settings=settings)
    vocabulary = self._get_dictionary(settings)

    for position, raw_token in enumerate(tokens):
        lowered = raw_token.lower()
        if lowered in vocabulary:
            # A falsy dictionary value means "drop this word".
            tokens[position] = vocabulary[lowered] or ''

    joiner = "" if keep_formatting else " "
    kept_tokens = [token for token in tokens if token]
    return self._join(kept_tokens, separator=joiner, settings=settings)
def translate(self, date_string, keep_formatting=False, settings=None):
    """
    Replace every known word of the date string by its dictionary value.

    :param date_string: The date string to translate.
    :param keep_formatting: If True, formatting is preserved by the split
        and the result is joined with an empty separator; otherwise a single
        space is used.
    :type keep_formatting: bool
    :param settings: Object whose ``NORMALIZE`` attribute is read; it is
        dereferenced unconditionally although it defaults to ``None``.
    :return: the joined translated tokens.
    """
    if settings.NORMALIZE:
        date_string = normalize_unicode(date_string)
    simplified = self._simplify(date_string, settings=settings)

    words = self._split(simplified, keep_formatting, settings=settings)
    lookup = self._get_dictionary(settings)

    for index, original_word in enumerate(words):
        key = original_word.lower()
        if key not in lookup:
            continue
        # Words mapped to a falsy value are blanked and filtered out below.
        words[index] = lookup[key] or ''

    if keep_formatting:
        separator = ""
    else:
        separator = " "
    non_empty = [word for word in words if word]
    return self._join(non_empty, separator=separator, settings=settings)
def _best_language(self, date_string, settings=None):
    """
    Pick the short name of the language that fits the date string best.

    ``self.character_check`` is run first (presumably narrowing
    ``self.languages`` -- verify against its definition). With exactly one
    language left, that language is returned directly. Otherwise each
    language is scored with ``count_applicability``, first with the
    timezone kept and, only if that scores nothing, with the timezone
    stripped.

    :param date_string: The date string to examine.
    :param settings: Forwarded to ``character_check`` and
        ``count_applicability``.
    :return: the ``shortname`` of the best scoring language, or None when
        no language scores above zero.
    """
    self.character_check(date_string, settings)
    date_string = normalize_unicode(date_string.lower())

    if len(self.languages) == 1:
        return self.languages[0].shortname

    scored = []
    for language in self.languages:
        # Try with the timezone in place, then with it stripped; the first
        # attempt that counts anything is recorded.
        for strip_tz in (False, True):
            counts = language.count_applicability(
                date_string, strip_timezone=strip_tz, settings=settings)
            if counts[0] > 0 or counts[1] > 0:
                scored.append((language.shortname, counts))
                break

    if not scored:
        return None

    def _rank(entry):
        # Order by the first count, ties broken by the second.
        return entry[1][0], entry[1][1]

    best_name, _best_counts = max(scored, key=_rank)
    return best_name
def _best_language(self, date_string, settings=None):
    """
    Return the short name of the most applicable language, if any.

    After ``self.character_check`` has run, a single remaining language is
    returned as is. Otherwise every language reports a pair of counts via
    ``count_applicability``; a language is a candidate when either count is
    positive, with the timezone-stripped variant tried only when the
    unstripped one finds nothing.

    :param date_string: The date string to examine.
    :param settings: Forwarded to ``character_check`` and
        ``count_applicability``.
    :return: ``shortname`` of the candidate with the highest
        ``(first count, second count)`` pair, or None without candidates.
    """
    self.character_check(date_string, settings)
    lowered = normalize_unicode(date_string.lower())

    languages = self.languages
    if len(languages) == 1:
        return languages[0].shortname

    def _has_hits(counts):
        return counts[0] > 0 or counts[1] > 0

    candidates = []
    for language in languages:
        counts = language.count_applicability(
            lowered, strip_timezone=False, settings=settings)
        if not _has_hits(counts):
            # Second chance: the timezone may have hidden the words.
            counts = language.count_applicability(
                lowered, strip_timezone=True, settings=settings)
            if not _has_hits(counts):
                continue
        candidates.append((language.shortname, counts))

    if not candidates:
        return None

    winner = max(candidates, key=lambda item: (item[1][0], item[1][1]))
    return winner[0]
def translate(self, date_string, keep_formatting=False, settings=None):
    """
    Translate the date string to its English equivalent.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param keep_formatting:
        If True, retain formatting of the date string after translation.
    :type keep_formatting: bool

    :param settings:
        Object whose ``NORMALIZE`` attribute is read. Despite the ``None``
        default it is dereferenced unconditionally, so callers must pass one.

    :return: translated date string.
    """
    text = self._translate_numerals(date_string)
    if settings.NORMALIZE:
        text = normalize_unicode(text)
    text = self._simplify(text, settings=settings)

    vocabulary = self._get_dictionary(settings)
    tokens = vocabulary.split(text, keep_formatting)
    relative_rules = self._get_relative_translations(settings=settings)

    for position, raw_token in enumerate(tokens):
        lowered = raw_token.lower()

        # Apply every relative-date pattern that matches the token.
        for regex, substitute in relative_rules.items():
            if regex.match(lowered):
                tokens[position] = regex.sub(substitute, lowered)

        # NOTE(review): the pattern scan never breaks out early, so this
        # dictionary lookup runs for every token and replaces a relative
        # translation whenever the token is also a dictionary word.
        if lowered in vocabulary:
            tokens[position] = vocabulary[lowered] or ''

    if "in" in tokens:
        tokens = self._clear_future_words(tokens)

    joiner = "" if keep_formatting else " "
    kept_tokens = [token for token in tokens if token]
    return self._join(kept_tokens, separator=joiner, settings=settings)
def get_date_data(self, date_string, date_formats=None):
    """
    Parse string representing date and/or time in recognizable localized formats.
    Supports parsing multiple languages and timezones.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str|unicode
    :param date_formats:
        A list of format strings using directives as given
        `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
        The parser applies formats one by one, taking into account the detected languages.
    :type date_formats: list

    :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
        {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}.
        If no applicable language produces a result, the dict is
        ``{'date_obj': None, 'period': 'day'}``.

    :raises: ValueError - Unknown Language
    :raises: TypeError - ``date_string`` has no ``strip()`` method

    .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

    *Period* represents the granularity of date parsed from the given string.

    In the example below, since no day information is present, the day is assumed to be current
    day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
    Hence, the level of precision is ``month``:

        >>> DateDataParser().get_date_data(u'March 2015')
        {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

    Similarly, for date strings with no day and month information present, level of precision
    is ``year`` and day ``16`` and month ``6`` are from *current_date*.

        >>> DateDataParser().get_date_data(u'2014')
        {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

    Dates with time zone indications or UTC offsets are returned in UTC time unless
    specified using `Settings`_.

        >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
        {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

    """
    # Duck-typed input check: whatever cannot be stripped is not a string.
    try:
        text = date_string.strip()
    except AttributeError:
        raise TypeError('Input type must be str or unicode')

    if self._settings.NORMALIZE:
        text = normalize_unicode(text)
    text = sanitize_date(text)

    languages = self.language_detector.iterate_applicable_languages(
        text, modify=True, settings=self._settings)

    outcome = None
    for language in languages:
        outcome = _DateLanguageParser.parse(
            language, text, date_formats, settings=self._settings)
        if outcome:
            # Stop at the first language that yields a truthy result.
            break
    else:
        # Every language was tried without success (or none applied).
        outcome = {'date_obj': None, 'period': 'day'}
    return outcome
def given_string(self, datetime_string):
    """
    Store the date/time string under test on ``self.datetime_string``.

    The string is passed through ``normalize_unicode`` first when the
    module-level ``settings.NORMALIZE`` flag is truthy.

    :param datetime_string: The date/time string to store.
    """
    self.datetime_string = (
        normalize_unicode(datetime_string)
        if settings.NORMALIZE
        else datetime_string
    )