def test_should_accept_dates_in_different_languages(self):
        # A single detector created with allow_redetection=True is fed two
        # Spanish dates followed by a Portuguese one; each parsed result must
        # fall on the expected calendar day.
        expectations = (
            (u'13 Ago, 2014', datetime(2014, 8, 13)),
            (u'13 Septiembre, 2014', datetime(2014, 9, 13)),
            (u'13 Setembro, 2014', datetime(2014, 9, 13)),
        )
        detector = AutoDetectLanguage(None, allow_redetection=True)

        for raw_text, expected in expectations:
            actual = detector.parse(raw_text, None)
            self.assertEqual(expected.date(), actual.date())
Beispiel #2
0
    def __init__(self, languages=None, allow_redetect_language=False):
        """Validate ``languages`` and choose the language detector.

        :param languages:
            Language codes; every code must be a key of
            ``default_language_loader.get_language_map()``. ``None`` (the
            default) means no explicit languages.
        :type languages: list|tuple|set

        :param allow_redetect_language:
            When true, an ``AutoDetectLanguage`` detector is created with
            ``allow_redetection=True``.
        :type allow_redetect_language: bool

        :raises:
            ValueError - Unknown Language, TypeError - Languages argument must be a list
        """
        # ``collections.Set`` was only a deprecated alias and is gone from
        # Python 3.10 on; take the ABC from wherever this interpreter has it.
        try:
            from collections.abc import Set as AbstractSet
        except ImportError:  # Python 2
            from collections import Set as AbstractSet

        if isinstance(languages, (list, tuple, AbstractSet)):
            available_language_map = default_language_loader.get_language_map()

            unsupported_languages = set(languages) - set(available_language_map)
            if unsupported_languages:
                # repr() each entry so non-string values cannot break the join;
                # sorting keeps the message deterministic.
                raise ValueError(
                    "Unknown language(s): %s"
                    % ', '.join(sorted(map(repr, unsupported_languages))))
            languages = [available_language_map[language] for language in languages]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" % type(languages))

        if allow_redetect_language:
            # An empty collection is treated like "no languages given".
            self.language_detector = AutoDetectLanguage(languages=languages or None,
                                                        allow_redetection=True)
        elif languages:
            self.language_detector = ExactLanguages(languages=languages)
        else:
            self.language_detector = AutoDetectLanguage(languages=None, allow_redetection=False)
class AutoDetectLanguageTest(unittest.TestCase):
    """Tests for ``AutoDetectLanguage``: one active detection test and two
    skipped parsing scenarios."""

    # Shared reason for the two skipped parsing tests below.
    _SKIP_REASON = (
        'This test should only be testing detecting languages, not parsing them. Although tests '
        'for parsing this dates should be created separately to not reduce the coverage'
    )

    def setUp(self):
        """Give every test a fresh detector built with default arguments."""
        detector = AutoDetectLanguage()
        self.parser = detector

    def test_detect_language(self):
        # Each date string is paired with the language shortnames the detector
        # is expected to yield for it (order is not significant).
        shortname_of = attrgetter('shortname')
        cases = (
            (['es', 'pt'], '11 abril 2010'),
            (['es'], '11 junio 2010'),
        )
        for expected_codes, date_string in cases:
            applicable = self.parser.iterate_applicable_languages(date_string)
            self.assertItemsEqual(expected_codes, map(shortname_of, applicable))

    @unittest.skip(_SKIP_REASON)
    def test_should_reduce_possible_languages_and_reject_different(self):
        # Two Spanish dates are parsed first; the Portuguese date that follows
        # is then expected to be rejected with "Invalid date".
        spanish_cases = (
            (u'13 Ago, 2014', datetime(2014, 8, 13)),
            (u'13 Septiembre, 2014', datetime(2014, 9, 13)),
        )
        for raw_text, expected in spanish_cases:
            actual = self.parser.parse(raw_text, None)
            self.assertEqual(expected.date(), actual.date())

        portuguese_date = u'13 Setembro, 2014'
        with self.assertRaisesRegexp(ValueError, 'Invalid date'):
            self.parser.parse(portuguese_date, None)

    @unittest.skip(_SKIP_REASON)
    def test_should_accept_dates_in_different_languages(self):
        # Same inputs as above, but with a detector created with
        # allow_redetection=True every date is expected to parse.
        expectations = (
            (u'13 Ago, 2014', datetime(2014, 8, 13)),
            (u'13 Septiembre, 2014', datetime(2014, 9, 13)),
            (u'13 Setembro, 2014', datetime(2014, 9, 13)),
        )
        detector = AutoDetectLanguage(None, allow_redetection=True)

        for raw_text, expected in expectations:
            actual = detector.parse(raw_text, None)
            self.assertEqual(expected.date(), actual.date())
Beispiel #4
0
class DateDataParser(object):
    """Pick a language detector and use it to parse date strings.

    :param languages:
        Language codes; every code must be a key of
        ``default_language_loader.get_language_map()``. ``None`` (the
        default) means no explicit languages.
    :type languages: list|tuple|set

    :param allow_redetect_language:
        When true, an ``AutoDetectLanguage`` detector is created with
        ``allow_redetection=True``.
    :type allow_redetect_language: bool

    :raises:
        ValueError - Unknown Language, TypeError - Languages argument must be a list
    """

    def __init__(self, languages=None, allow_redetect_language=False):
        # ``collections.Set`` was only a deprecated alias and is gone from
        # Python 3.10 on; take the ABC from wherever this interpreter has it.
        try:
            from collections.abc import Set as AbstractSet
        except ImportError:  # Python 2
            from collections import Set as AbstractSet

        if isinstance(languages, (list, tuple, AbstractSet)):
            available_language_map = default_language_loader.get_language_map()

            unsupported_languages = set(languages) - set(available_language_map)
            if unsupported_languages:
                # repr() each entry so non-string values cannot break the join;
                # sorting keeps the message deterministic.
                raise ValueError(
                    "Unknown language(s): %s"
                    % ', '.join(sorted(map(repr, unsupported_languages))))
            languages = [available_language_map[language] for language in languages]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" % type(languages))

        if allow_redetect_language:
            # An empty collection is treated like "no languages given".
            self.language_detector = AutoDetectLanguage(languages=languages or None,
                                                        allow_redetection=True)
        elif languages:
            self.language_detector = ExactLanguages(languages=languages)
        else:
            self.language_detector = AutoDetectLanguage(languages=None, allow_redetection=False)

    def get_date_data(self, date_string, date_formats=None):
        """Return a dictionary with a date object and a period.

        Period values can be 'day' (default), 'week', 'month' or 'year'.

        The period aims to solve the following issue: a forum may display
        "2 weeks ago" in its thread list (the thread itself shows the exact
        date), so the engine translates "2 weeks ago" to a specific date.
        The next thread summary displays "3 weeks ago", which is translated
        to another date seven days before the first one. A valid date string
        falling between those two dates would not be scraped because it is
        not an exact date match. The period field helps to build better
        date range detection.

        :param date_string: the text to parse; it is stripped and passed
            through ``sanitize_date`` before parsing.
        :param date_formats: forwarded unchanged to
            ``_DateLanguageParser.parse``.
        :return: the first truthy result of ``_DateLanguageParser.parse``
            over the applicable languages, or
            ``{'date_obj': None, 'period': 'day'}`` when there is none.

        TODO: Timezone issues

        """
        date_string = sanitize_date(date_string.strip())

        applicable_languages = self.language_detector.iterate_applicable_languages(
            date_string, modify=True)
        for language in applicable_languages:
            parsed_date = _DateLanguageParser.parse(language, date_string, date_formats)
            if parsed_date:
                return parsed_date

        # No applicable language produced a result.
        return {'date_obj': None, 'period': 'day'}
Beispiel #5
0
class DateDataParser(object):
    """
    Class which handles language detection, translation and subsequent generic parsing of
    string representing date and/or time.

    :param languages:
            A list of two letters language codes, e.g. ['en', 'es'].
            If languages are given, it will not attempt to detect the language.
    :type languages: list

    :param allow_redetect_language:
            Enables/disables language re-detection.
    :type allow_redetect_language: bool

    :param settings:
           Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :return: A parser instance

    :raises:
            ValueError - Unknown Language, TypeError - Languages argument must be a list
    """
    # Loader shared through the class; created lazily by _get_language_loader().
    language_loader = None

    # Result type of get_date_tuple(); built once so that every call returns
    # instances of the same class instead of a freshly generated one.
    _DateData = collections.namedtuple('DateData', 'date_obj period language')

    @apply_settings
    def __init__(self,
                 languages=None,
                 allow_redetect_language=False,
                 settings=None):
        self._settings = settings
        available_language_map = self._get_language_loader().get_language_map()

        # ``collections.Set`` was only a deprecated alias and is gone from
        # Python 3.10 on; take the ABC from wherever this interpreter has it.
        try:
            from collections.abc import Set as AbstractSet
        except ImportError:  # Python 2
            from collections import Set as AbstractSet

        if isinstance(languages, (list, tuple, AbstractSet)):
            unsupported_languages = set(languages) - set(available_language_map)
            if unsupported_languages:
                # Sorted so the message does not depend on set ordering.
                raise ValueError(
                    "Unknown language(s): %s"
                    % ', '.join(sorted(map(repr, unsupported_languages))))
            languages = [
                available_language_map[language] for language in languages
            ]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" %
                            type(languages))

        if allow_redetect_language:
            # Without explicit languages, every loaded language is a candidate.
            self.language_detector = AutoDetectLanguage(
                languages
                if languages else list(available_language_map.values()),
                allow_redetection=True)
        elif languages:
            self.language_detector = ExactLanguages(languages=languages)
        else:
            self.language_detector = AutoDetectLanguage(
                list(available_language_map.values()), allow_redetection=False)

    def get_date_data(self, date_string, date_formats=None):
        """
        Parse string representing date and/or time in recognizable localized formats.
        Supports parsing multiple languages and timezones.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
            {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}

        :raises: ValueError - Unknown Language, TypeError - Input type must be str or unicode

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data(u'March 2015')
            {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data(u'2014')
            {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings`_.

            >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
            {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

        """
        # NOTE(review): the CET example above shows 14:21, although 1:21 PM CET
        # corresponds to 12:21 UTC -- confirm the documented output.
        if not isinstance(date_string, (six.text_type, six.string_types)):
            raise TypeError('Input type must be str or unicode')

        # Explicit formats are tried first, before any language detection.
        res = parse_with_formats(date_string, date_formats or [],
                                 self._settings)
        if res['date_obj']:
            return res

        if self._settings.NORMALIZE:
            date_string = normalize_unicode(date_string)

        date_string = sanitize_date(date_string)

        applicable_languages = self.language_detector.iterate_applicable_languages(
            date_string, modify=True, settings=self._settings)
        for language in applicable_languages:
            parsed_date = _DateLanguageParser.parse(language,
                                                    date_string,
                                                    date_formats,
                                                    settings=self._settings)
            if parsed_date:
                parsed_date['language'] = language.shortname
                return parsed_date

        # No applicable language produced a result.
        return {'date_obj': None, 'period': 'day', 'language': None}

    def get_date_tuple(self, *args, **kwargs):
        """Return the result of :meth:`get_date_data` as a ``DateData`` named
        tuple with the fields ``date_obj``, ``period`` and ``language``.

        All arguments are forwarded to :meth:`get_date_data`.
        """
        date_data = self.get_date_data(*args, **kwargs)
        # The format-based early return of get_date_data() hands back the
        # parse_with_formats() result untouched; presumably it carries no
        # 'language' key (NOTE(review): confirm), which would make the tuple
        # construction fail. Default the field to None in that case.
        if 'language' not in date_data:
            date_data = dict(date_data, language=None)
        return self._DateData(**date_data)

    @classmethod
    def _get_language_loader(cls):
        """Return the class-level ``LanguageDataLoader``, creating it on first use."""
        if not cls.language_loader:
            cls.language_loader = LanguageDataLoader()
        return cls.language_loader
class AutoDetectLanguageTest(BaseTestCase):
    """Given/when/then style tests for
    ``AutoDetectLanguage.iterate_applicable_languages``."""

    def setUp(self):
        """Reset the per-test state; the given_/when_ steps fill it in."""
        super(AutoDetectLanguageTest, self).setUp()

        # Placeholders until a given_parser() / when_*() step has run.
        self.parser = NotImplemented
        self.detected_languages = NotImplemented

        # Just a known subset so we can rely on test outcomes. Feel free to add, but not exclude or change order.
        self.known_languages = ['en', 'fr', 'es', 'pt', 'ru', 'tr', 'cs']

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_languages=['es', 'pt'],
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_languages=['es'],
        ),
    ])
    def test_detect_languages(self, date_strings, expected_languages):
        # Languages yielded for the last date string must match the expectation.
        self.given_parser(languages=self.known_languages)
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are(expected_languages)

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_language='es',
        ),
    ])
    def test_exclude_ineligible_languages_with_modify(self, date_strings,
                                                      expected_language):
        # After taking one language with modify=True, the parser is expected
        # to keep only the known languages from the detected one onwards.
        self.given_parser(languages=self.known_languages)
        self.when_one_language_is_detected(date_strings, modify=True)
        self.then_detected_languages_are([expected_language])
        first_kept = self.known_languages.index(expected_language)
        self.then_parser_languages_are(self.known_languages[first_kept:])

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_language='es',
        ),
    ])
    def test_do_not_exclude_ineligible_languages_without_modify(
            self, date_strings, expected_language):
        # With modify=False the parser's language list must stay untouched.
        self.given_parser(languages=self.known_languages)
        self.when_one_language_is_detected(date_strings, modify=False)
        self.then_detected_languages_are([expected_language])
        self.then_parser_languages_are(self.known_languages)

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_languages=['es', 'pt'],
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Srpen, 2014"],
            expected_languages=['cs'],
        ),
    ])
    def test_do_not_exclude_ineligible_languages_when_all_ineligible(
            self, date_strings, expected_languages):
        # Exhausting the iterator with modify=True is expected to leave the
        # parser's language list complete.
        self.given_parser(languages=self.known_languages)
        self.when_all_languages_are_detected(date_strings, modify=True)
        self.then_detected_languages_are(expected_languages)
        self.then_parser_languages_are(self.known_languages)

    @parameterized.expand([
        param(
            language='es',
            date_strings=["13 Setembro, 2014"],
        ),
        param(
            language='cs',
            date_strings=["'11 Ağustos, 2014'"],
        ),
    ])
    def test_reject_dates_in_other_languages_without_redetection(
            self, language, date_strings):
        # The parser is pinned to a single language; nothing may be detected.
        self.given_parser(languages=self.known_languages)
        self.given_parser_languages_are([language])
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are([])

    @parameterized.expand([
        param(
            detected_languages=['es'],
            date_strings=['13 Juillet, 2014'],
            expected_languages=['fr'],
        ),
        param(
            detected_languages=['es'],
            date_strings=['11 Ağustos, 2014'],
            expected_languages=['tr'],
        ),
    ])
    def test_accept_dates_in_other_languages_with_redetection_enabled(
            self, detected_languages, date_strings, expected_languages):
        # Same pinning as above, but the parser is created with
        # allow_redetection=True and is expected to find the other language.
        self.given_parser(languages=self.known_languages,
                          allow_redetection=True)
        self.given_parser_languages_are(detected_languages)
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are(expected_languages)

    def test_accept_numeric_dates_without_redetection(self):
        # A purely numeric date is expected to stay applicable to the pinned language.
        self.given_parser(languages=self.known_languages)
        self.given_parser_languages_are(['es'])
        self.when_all_languages_are_detected(['13/08/2014'])
        self.then_detected_languages_are(['es'])

    def given_parser(self, languages=None, allow_redetection=False):
        """Create ``self.parser``; shortnames in ``languages`` are replaced by
        the objects from the default loader's language map."""
        resolved = languages
        if resolved is not None:
            by_shortname = default_language_loader.get_language_map()
            resolved = [by_shortname[code] for code in resolved]
        self.parser = AutoDetectLanguage(resolved,
                                         allow_redetection=allow_redetection)

    def given_parser_languages_are(self, languages):
        """Overwrite ``self.parser.languages`` with the objects for the given shortnames."""
        by_shortname = default_language_loader.get_language_map()
        self.parser.languages = [by_shortname[code] for code in languages]

    def when_all_languages_are_detected(self, date_strings, modify=False):
        """Exhaust the detector for each date string in turn and remember the
        languages yielded for the last one."""
        assert not isinstance(date_strings, six.string_types)
        for raw_text in date_strings:
            text = normalize_unicode(raw_text) if settings.NORMALIZE else raw_text
            candidates = self.parser.iterate_applicable_languages(
                text, modify=modify, settings=settings)
            matched = list(candidates)
        self.detected_languages = matched

    def when_one_language_is_detected(self, date_strings, modify=False):
        """Take only the first language yielded for each date string and
        remember the one taken for the last string."""
        for raw_text in date_strings:
            candidates = self.parser.iterate_applicable_languages(
                raw_text, modify=modify, settings=settings)
            first_match = next(candidates)
        self.detected_languages = [first_match]

    def then_detected_languages_are(self, expected_languages):
        """Assert the remembered languages have the expected shortnames, in any order."""
        shortname_of = attrgetter('shortname')
        actual = map(shortname_of, self.detected_languages)
        six.assertCountEqual(self, expected_languages, actual)

    def then_parser_languages_are(self, expected_languages):
        """Assert ``self.parser.languages`` has the expected shortnames, in any order."""
        shortname_of = attrgetter('shortname')
        actual = map(shortname_of, self.parser.languages)
        six.assertCountEqual(self, expected_languages, actual)
 def given_parser(self, languages=None, allow_redetection=False):
     """Create ``self.parser``; shortnames in ``languages`` are replaced by
     the objects from the default loader's language map."""
     resolved = languages
     if resolved is not None:
         by_shortname = default_language_loader.get_language_map()
         resolved = [by_shortname[code] for code in resolved]
     self.parser = AutoDetectLanguage(resolved, allow_redetection=allow_redetection)
class AutoDetectLanguageTest(BaseTestCase):
    """Given/when/then style tests for
    ``AutoDetectLanguage.iterate_applicable_languages``."""

    def setUp(self):
        """Reset the per-test state; the given_/when_ steps fill it in."""
        super(AutoDetectLanguageTest, self).setUp()

        # Placeholders until a given_parser() / when_*() step has run.
        self.parser = NotImplemented
        self.detected_languages = NotImplemented

        # Just a known subset so we can rely on test outcomes. Feel free to add, but not exclude or change order.
        self.known_languages = ['en', 'fr', 'es', 'pt', 'ru', 'tr', 'cs']

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_languages=['es', 'pt'],
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_languages=['es'],
        ),
    ])
    def test_detect_languages(self, date_strings, expected_languages):
        # Languages yielded for the last date string must match the expectation.
        self.given_parser(languages=self.known_languages)
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are(expected_languages)

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_language='es',
        ),
    ])
    def test_exclude_ineligible_languages_with_modify(self, date_strings, expected_language):
        # After taking one language with modify=True, the parser is expected
        # to keep only the known languages from the detected one onwards.
        self.given_parser(languages=self.known_languages)
        self.when_one_language_is_detected(date_strings, modify=True)
        self.then_detected_languages_are([expected_language])
        first_kept = self.known_languages.index(expected_language)
        self.then_parser_languages_are(self.known_languages[first_kept:])

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_language='es',
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_language='es',
        ),
    ])
    def test_do_not_exclude_ineligible_languages_without_modify(self, date_strings, expected_language):
        # With modify=False the parser's language list must stay untouched.
        self.given_parser(languages=self.known_languages)
        self.when_one_language_is_detected(date_strings, modify=False)
        self.then_detected_languages_are([expected_language])
        self.then_parser_languages_are(self.known_languages)

    @parameterized.expand([
        param(
            date_strings=["11 abril 2010"],
            expected_languages=['es', 'pt'],
        ),
        param(
            date_strings=["11 junio 2010"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Ago, 2014", "13 Septiembre, 2014"],
            expected_languages=['es'],
        ),
        param(
            date_strings=["13 Srpen, 2014"],
            expected_languages=['cs'],
        ),
    ])
    def test_do_not_exclude_ineligible_languages_when_all_ineligible(self, date_strings, expected_languages):
        # Exhausting the iterator with modify=True is expected to leave the
        # parser's language list complete.
        self.given_parser(languages=self.known_languages)
        self.when_all_languages_are_detected(date_strings, modify=True)
        self.then_detected_languages_are(expected_languages)
        self.then_parser_languages_are(self.known_languages)

    @parameterized.expand([
        param(
            language='es',
            date_strings=["13 Setembro, 2014"],
        ),
        param(
            language='cs',
            date_strings=["'11 Ağustos, 2014'"],
        ),
    ])
    def test_reject_dates_in_other_languages_without_redetection(self, language, date_strings):
        # The parser is pinned to a single language; nothing may be detected.
        self.given_parser(languages=self.known_languages)
        self.given_parser_languages_are([language])
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are([])

    @parameterized.expand([
        param(
            detected_languages=['es'],
            date_strings=['13 Juillet, 2014'],
            expected_languages=['fr'],
        ),
        param(
            detected_languages=['es'],
            date_strings=['11 Ağustos, 2014'],
            expected_languages=['tr'],
        ),
    ])
    def test_accept_dates_in_other_languages_with_redetection_enabled(
        self, detected_languages, date_strings, expected_languages
    ):
        # Same pinning as above, but the parser is created with
        # allow_redetection=True and is expected to find the other language.
        self.given_parser(languages=self.known_languages, allow_redetection=True)
        self.given_parser_languages_are(detected_languages)
        self.when_all_languages_are_detected(date_strings)
        self.then_detected_languages_are(expected_languages)

    def test_accept_numeric_dates_without_redetection(self):
        # A purely numeric date is expected to stay applicable to the pinned language.
        self.given_parser(languages=self.known_languages)
        self.given_parser_languages_are(['es'])
        self.when_all_languages_are_detected(['13/08/2014'])
        self.then_detected_languages_are(['es'])

    def given_parser(self, languages=None, allow_redetection=False):
        """Create ``self.parser``; shortnames in ``languages`` are replaced by
        the objects from the default loader's language map."""
        resolved = languages
        if resolved is not None:
            by_shortname = default_language_loader.get_language_map()
            resolved = [by_shortname[code] for code in resolved]
        self.parser = AutoDetectLanguage(resolved, allow_redetection=allow_redetection)

    def given_parser_languages_are(self, languages):
        """Overwrite ``self.parser.languages`` with the objects for the given shortnames."""
        by_shortname = default_language_loader.get_language_map()
        self.parser.languages = [by_shortname[code] for code in languages]

    def when_all_languages_are_detected(self, date_strings, modify=False):
        """Exhaust the detector for each date string in turn and remember the
        languages yielded for the last one."""
        assert not isinstance(date_strings, six.string_types)
        for raw_text in date_strings:
            text = normalize_unicode(raw_text) if settings.NORMALIZE else raw_text
            candidates = self.parser.iterate_applicable_languages(
                text, modify=modify, settings=settings)
            matched = list(candidates)
        self.detected_languages = matched

    def when_one_language_is_detected(self, date_strings, modify=False):
        """Take only the first language yielded for each date string and
        remember the one taken for the last string."""
        for raw_text in date_strings:
            candidates = self.parser.iterate_applicable_languages(
                raw_text, modify=modify, settings=settings)
            first_match = next(candidates)
        self.detected_languages = [first_match]

    def then_detected_languages_are(self, expected_languages):
        """Assert the remembered languages have the expected shortnames, in any order."""
        shortname_of = attrgetter('shortname')
        actual = map(shortname_of, self.detected_languages)
        six.assertCountEqual(self, expected_languages, actual)

    def then_parser_languages_are(self, expected_languages):
        """Assert ``self.parser.languages`` has the expected shortnames, in any order."""
        shortname_of = attrgetter('shortname')
        actual = map(shortname_of, self.parser.languages)
        six.assertCountEqual(self, expected_languages, actual)
 def given_parser(self, languages=None, allow_redetection=False):
     """Create ``self.parser``; shortnames in ``languages`` are replaced by
     the objects from the default loader's language map."""
     resolved = languages
     if resolved is not None:
         by_shortname = default_language_loader.get_language_map()
         resolved = [by_shortname[code] for code in resolved]
     self.parser = AutoDetectLanguage(resolved, allow_redetection=allow_redetection)
Beispiel #10
0
class DateDataParser(object):
    """
    Class which handles language detection, translation and subsequent generic parsing of
    string representing date and/or time.

    :param languages:
            A list of two letters language codes, e.g. ['en', 'es'].
            If languages are given, it will not attempt to detect the language.
    :type languages: list

    :param allow_redetect_language:
            Enables/disables language re-detection.
    :type allow_redetect_language: bool

    :param settings:
           Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :return: A parser instance

    :raises:
            ValueError - Unknown Language, TypeError - Languages argument must be a list
    """
    # Loader shared through the class; created lazily by _get_language_loader().
    language_loader = None

    # Result type of get_date_tuple(); built once so that every call returns
    # instances of the same class instead of a freshly generated one.
    _DateData = collections.namedtuple('DateData', 'date_obj period')

    @apply_settings
    def __init__(self, languages=None, allow_redetect_language=False, settings=None):
        self._settings = settings
        available_language_map = self._get_language_loader().get_language_map()

        # ``collections.Set`` was only a deprecated alias and is gone from
        # Python 3.10 on; take the ABC from wherever this interpreter has it.
        try:
            from collections.abc import Set as AbstractSet
        except ImportError:  # Python 2
            from collections import Set as AbstractSet

        if isinstance(languages, (list, tuple, AbstractSet)):
            unsupported_languages = set(languages) - set(available_language_map)
            if unsupported_languages:
                # Sorted so the message does not depend on set ordering.
                raise ValueError(
                    "Unknown language(s): %s"
                    % ', '.join(sorted(map(repr, unsupported_languages))))
            languages = [available_language_map[language] for language in languages]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" % type(languages))

        if allow_redetect_language:
            # Without explicit languages, every loaded language is a candidate.
            self.language_detector = AutoDetectLanguage(
                languages if languages else list(available_language_map.values()),
                allow_redetection=True)
        elif languages:
            self.language_detector = ExactLanguages(languages=languages)
        else:
            self.language_detector = AutoDetectLanguage(
                list(available_language_map.values()), allow_redetection=False)

    def get_date_data(self, date_string, date_formats=None):
        """
        Parse string representing date and/or time in recognizable localized formats.
        Supports parsing multiple languages and timezones.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
            {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}

        :raises: ValueError - Unknown Language, TypeError - Input type must be str or unicode

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data(u'March 2015')
            {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data(u'2014')
            {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings`_.

            >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
            {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

        """
        # NOTE(review): the CET example above shows 14:21, although 1:21 PM CET
        # corresponds to 12:21 UTC -- confirm the documented output.
        try:
            date_string = date_string.strip()
        except AttributeError:
            # Anything without .strip() is reported as a wrong input type.
            raise TypeError('Input type must be str or unicode')
        if self._settings.NORMALIZE:
            date_string = normalize_unicode(date_string)

        date_string = sanitize_date(date_string)

        applicable_languages = self.language_detector.iterate_applicable_languages(
            date_string, modify=True, settings=self._settings)
        for language in applicable_languages:
            parsed_date = _DateLanguageParser.parse(
                language, date_string, date_formats, settings=self._settings)
            if parsed_date:
                return parsed_date

        # No applicable language produced a result.
        return {'date_obj': None, 'period': 'day'}

    def get_date_tuple(self, *args, **kwargs):
        """Return the result of :meth:`get_date_data` as a ``DateData`` named
        tuple with the fields ``date_obj`` and ``period``.

        All arguments are forwarded to :meth:`get_date_data`.
        """
        date_data = self.get_date_data(*args, **kwargs)
        return self._DateData(**date_data)

    @classmethod
    def _get_language_loader(cls):
        """Return the class-level ``LanguageDataLoader``, creating it on first use."""
        if not cls.language_loader:
            cls.language_loader = LanguageDataLoader()
        return cls.language_loader
Beispiel #11
0
 def given_detector(self):
     """Create ``self.detector`` from the fixture's ``known_languages`` and
     ``allow_redetection`` attributes."""
     known, redetect = self.known_languages, self.allow_redetection
     self.detector = AutoDetectLanguage(languages=known, allow_redetection=redetect)
Beispiel #12
0
class DateDataParser(object):
    """
    Class which handles language detection, translation and subsequent generic parsing of
    string representing date and/or time.

    :param languages:
            A list of two letters language codes, e.g. ['en', 'es'].
            If languages are given, it will not attempt to detect the language.
    :type languages: list

    :param allow_redetect_language:
            Enables/disables language re-detection.
    :type allow_redetect_language: bool

    :return: A parser instance

    :raises:
            ValueError - Unknown Language, TypeError - Languages argument must be a list
    """

    def __init__(self, languages=None, allow_redetect_language=False):
        # ``collections.Set`` was only a deprecated alias and is gone from
        # Python 3.10 on; take the ABC from wherever this interpreter has it.
        try:
            from collections.abc import Set as AbstractSet
        except ImportError:  # Python 2
            from collections import Set as AbstractSet

        if isinstance(languages, (list, tuple, AbstractSet)):
            available_language_map = default_language_loader.get_language_map()

            unsupported_languages = set(languages) - set(available_language_map)
            if unsupported_languages:
                # Sorted so the message does not depend on set ordering.
                raise ValueError(
                    "Unknown language(s): %s"
                    % ', '.join(sorted(map(repr, unsupported_languages))))
            languages = [available_language_map[language] for language in languages]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" % type(languages))

        if allow_redetect_language:
            # An empty collection is treated like "no languages given".
            self.language_detector = AutoDetectLanguage(languages=languages or None,
                                                        allow_redetection=True)
        elif languages:
            self.language_detector = ExactLanguages(languages=languages)
        else:
            self.language_detector = AutoDetectLanguage(languages=None, allow_redetection=False)

    def get_date_data(self, date_string, date_formats=None):
        """
        Parse string representing date and/or time in recognizable localized formats.
        Supports parsing multiple languages.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
            {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}

        :raises: ValueError - Unknown Language

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``.

            >>> DateDataParser().get_date_data(u'March 2015')
            {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data(u'2014')
            {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

        TODO: Timezone issues

        """
        date_string = sanitize_date(date_string.strip())

        applicable_languages = self.language_detector.iterate_applicable_languages(
            date_string, modify=True)
        for language in applicable_languages:
            parsed_date = _DateLanguageParser.parse(language, date_string, date_formats)
            if parsed_date:
                return parsed_date

        # No applicable language produced a result.
        return {'date_obj': None, 'period': 'day'}
Beispiel #13
0
 def setUp(self):
     """Give every test a fresh detector built with default arguments."""
     detector = AutoDetectLanguage()
     self.parser = detector
Beispiel #14
0
 def setUp(self):
     """Give every test a fresh detector built with default arguments."""
     detector = AutoDetectLanguage()
     self.parser = detector