def test_check_settings_extra_check_require_parts(self):
    """Check that invalid or repeated "REQUIRE_PARTS" values are rejected.

    Each case pairs the error message expected from ``DateDataParser`` with
    the ``REQUIRE_PARTS`` list that must trigger it.
    """
    cases = (
        (
            r'"REQUIRE_PARTS" setting contains invalid values: time',
            ['time', 'day'],
        ),
        (
            r'There are repeated values in the "REQUIRE_PARTS" setting',
            ['month', 'day', 'month'],
        ),
    )
    for expected_message, required_parts in cases:
        with self.assertRaisesRegex(SettingValidationError, expected_message):
            DateDataParser(settings={'REQUIRE_PARTS': required_parts})
def get_mount_number(name: str) -> int:
    """
    Return the month number for a month name, parsed as Russian.

    `dateparser` is used as a cross-platform solution: the code is developed
    on macOS and used on Windows, where the native `calendar` module exposes
    different month names (for example `Январь` on Windows versus `января`
    on macOS).
    """
    # Parse the name as "the 1st of <name>" and read the month back from the
    # resulting date object.
    return (
        DateDataParser(languages=['ru'])
        .get_date_data(f'1 {name}')
        .date_obj
        .month
    )
def test_check_settings_extra_check_parsers(self):
    """Check that unknown or repeated "PARSERS" values are rejected.

    Each case pairs the error message expected from ``DateDataParser`` with
    the ``PARSERS`` list that must trigger it.
    """
    cases = (
        (
            r'Found unknown parsers in the "PARSERS" setting: no-spaces',
            ['absolute-time', 'no-spaces'],
        ),
        (
            r'There are repeated values in the "PARSERS" setting',
            ['absolute-time', 'timestamp', 'absolute-time'],
        ),
    )
    for expected_message, parser_names in cases:
        with self.assertRaisesRegex(SettingValidationError, expected_message):
            DateDataParser(settings={'PARSERS': parser_names})
def _parse_date(date_string: str) -> datetime:
    """Parse ``date_string`` with a freshly built ``DateDataParser``.

    Returns the ``'date_obj'`` entry of the parse result, or ``None`` when
    the parser hands back a falsy result.
    """
    # NOTE: a new DateDataParser() is created on every call so that the
    # conditions evaluated during a previous parse cannot influence this one;
    # this keeps the function stateless.
    from dateparser import DateDataParser

    fresh_parser = DateDataParser(try_previous_locales=False)
    parsed = fresh_parser.get_date_data(date_string, None)
    if not parsed:
        return None
    return parsed['date_obj']
def __init__(self, column, form):
    """Build the dataframe operation that reformats the dates in one column.

    The operation parses every non-``None`` value of ``column`` with an
    English-only ``DateDataParser`` and replaces it with the parsed date
    rendered through ``strftime(form)``. Values that cannot be parsed or
    written back are left untouched (best effort).

    Args:
        column: label of the dataframe column to rewrite.
        form: ``strftime`` format string applied to each parsed date.
    """
    parser = DateDataParser(languages=['en'], allow_redetect_language=False)

    def fn(df, column=column, format=form, parser=parser):
        # ``format`` stays in the signature for backward compatibility; as in
        # the original code, the enclosing ``form`` is the format applied.
        n_rows = df.shape[0]
        if n_rows == 0:
            return df

        # Address cells positionally on the frame itself. The previous
        # ``df[column].iloc[i] = ...`` was a chained assignment, which pandas
        # does not guarantee to write back to ``df`` (and which never does
        # under Copy-on-Write).
        col_pos = df.columns.get_loc(column)
        for row in range(n_rows):
            value = df.iloc[row, col_pos]
            if value is None:
                continue
            try:
                parsed = parser.get_date_data(str(value))['date_obj']
                if parsed is None:
                    # Not recognised as a date: keep the original value.
                    continue
                df.iloc[row, col_pos] = parsed.strftime(form)
            except Exception:
                # Deliberate best effort: a value that fails to parse, format
                # or store must not abort the whole column. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                continue
        return df

    self.name = 'df = dateparse(df,' + formatString(
        column) + ',' + formatString(form) + ')'
    self.provenance = [self]

    super(DatetimeCast, self).__init__(fn, ['column', 'form'])
def test_check_settings(self, setting, wrong_type, wrong_value, valid_value):
    """Check type and value validation of a single setting.

    A value of the wrong type must always be rejected. When ``wrong_value``
    is truthy it must be rejected as an invalid value, and ``valid_value``
    must be accepted.
    """
    wrong_type_pattern = r'"{}" must be .*, not "{}".'.format(
        setting, type(wrong_type).__name__
    )
    with self.assertRaisesRegex(SettingValidationError, wrong_type_pattern):
        DateDataParser(settings={setting: wrong_type})

    if wrong_value:
        # Square brackets are escaped so that list values match literally in
        # the regular expression.
        escaped_value = str(wrong_value).replace('[', '\\[').replace(']', '\\]')
        wrong_value_pattern = (
            r'"{}" is not a valid value for "{}", it should be: .*'.format(
                escaped_value, setting
            )
        )
        with self.assertRaisesRegex(SettingValidationError, wrong_value_pattern):
            DateDataParser(settings={setting: wrong_value})

    # A valid value must be accepted without raising an error.
    assert DateDataParser(settings={setting: valid_value})
def test_check_settings_extra_check_confidence_threshold(self):
    """Check that a confidence threshold of 1.1 is rejected."""
    expected_message = (
        r'1.1 is not a valid value for '
        r'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD. It can take values '
        r'between 0 and 1'
    )
    invalid_settings = {'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 1.1}
    with self.assertRaisesRegex(SettingValidationError, expected_message):
        DateDataParser(settings=invalid_settings)
def test_no_spaces_strict_parsing(date_string, expected_result):
    """Check the 'no-spaces-time' parser with and without strict parsing.

    With ``STRICT_PARSING`` disabled the date must parse to
    ``expected_result``; with it enabled no date object may be returned.
    """

    def parse_with(strict):
        # Build a parser limited to 'no-spaces-time' and return the parsed
        # date object for the string under test.
        settings = {
            'PARSERS': ['no-spaces-time'],
            'STRICT_PARSING': strict
        }
        return DateDataParser(settings=settings).get_date_data(date_string)['date_obj']

    assert parse_with(False) == expected_result
    assert parse_with(True) is None
class InputParser:
    """Split a ``"<work> @ <date/time>"`` description into its two parts."""

    # Languages handed to the underlying DateDataParser.
    LANGUAGES = ["en"]
    # Settings handed to the underlying DateDataParser.
    DATE_PARSER_SETTINGS = {
        "STRICT_PARSING": False,
        "NORMALIZE": True,
        "RETURN_AS_TIMEZONE_AWARE": True,
        "PREFER_DATES_FROM": "past",
    }

    def __init__(self):
        """Create the date parser from LANGUAGES and DATE_PARSER_SETTINGS."""
        self._date_parser = DateDataParser(
            languages=self.LANGUAGES,
            settings=self.DATE_PARSER_SETTINGS,
        )

    def _as_datetime(self, date_time):
        """Return the parsed date object for ``date_time``.

        Returns ``None`` when the parser hands back a falsy result.
        """
        parse_result = self._date_parser.get_date_data(date_time)
        if not parse_result:
            return None
        return parse_result["date_obj"]

    def parse_datetime(self, date_time):
        """Parse a date/time string, rejecting moments later than "now".

        Returns ``None`` for a blank string. Raises ``InvalidDateTimeError``
        when the text yields no date and ``DateTimeInFutureError`` when the
        parsed moment is later than the parsed value of ``"now"``.
        """
        text = date_time.strip()
        if not text:
            return None

        moment = self._as_datetime(text)
        if not moment:
            raise InvalidDateTimeError

        current_moment = self._as_datetime("now")
        if moment > current_moment:
            raise DateTimeInFutureError
        return moment

    def parse(self, work_desc):
        """Return ``(work, date_time)`` extracted from ``work_desc``.

        Everything after the last ``"@"`` is treated as the date/time; when
        there is no ``"@"`` the date/time is empty. Raises
        ``InvalidWorkError`` when the work part is blank.
        """
        head, separator, tail = work_desc.rpartition("@")
        if not separator:
            # No "@" present: the whole description is the work text.
            head, tail = work_desc, ""

        work = head.strip()
        if not work:
            raise InvalidWorkError

        return work, self.parse_datetime(tail)
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re from dateparser import DateDataParser from functools import lru_cache parser = DateDataParser(languages=['en']) def rank(l, user): # hack hack return sorted(l, key=lambda x: x.user) def uprank(l, users): # hack hack # score is a sorting order; lower comes first. def score(n): score = 0 if n.user in users: # the earlier in the list a user comes, the more highly ranked it is. score = users.index(n.user) - len(users) - 1
def test_check_settings_wrong_setting_name(self):
    """Check that an unknown setting name is rejected."""
    expected_message = r'.* is not a valid setting'
    unknown_settings = {'AAAAA': 'foo'}
    with self.assertRaisesRegex(SettingValidationError, expected_message):
        DateDataParser(settings=unknown_settings)
import datetime
import logging
import re

from functools import lru_cache

# coverage for date parsing
from dateparser import DateDataParser  # third-party, slow
from dateparser_data.settings import default_parsers

# Shared strict parser that prefers past dates. It runs every default
# dateparser parser except 'no-spaces-time', 'relative-time' and 'timestamp'.
EXTERNAL_PARSER = DateDataParser(
    settings={
        # 'DATE_ORDER': 'DMY',
        'PREFER_DATES_FROM': 'past',
        # 'PREFER_DAY_OF_MONTH': 'first',
        'STRICT_PARSING': True,
        'PARSERS': [
            parser_name
            for parser_name in default_parsers
            if parser_name not in ('no-spaces-time', 'relative-time', 'timestamp')
        ],
    }
)

from dateutil.parser import parse as dateutil_parse

# own
from .settings import CACHE_SIZE
from .validators import convert_date, date_validator

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
from datetime import datetime import re from dateparser import DateDataParser import pandas as pd from gestion_erreurs import ajout_erreur # on crée un analyseur de dates pour le français DDP = DateDataParser(languages=["fr"]) # on stocke la date du jour d'exécution pour filtrer les dates mal reconnues # (eg. si la date de signature extraite est postérieure à la date du jour) _TODAY = datetime.now() RE_DOC_ID = re.compile( r"N°[ ]*(?P<doc_id>\d{4}[ ]?[-_]?[ ]?\d{4,5}[B]?[ ]?[-_.]?[ ]?VDM[A]?)") def extract_doc_id(doc_txt): """Extrait l'identifiant de l'arrêté: année_num_VDM année sur 4 chiffres, num sur 5 chiffres, VDM pour Ville De Marseille ? Parameters ---------- doc_txt : string Texte du document Returns ------- doc_id : string or None
def __init__(self):
    """Create the date parser from this object's LANGUAGES and DATE_PARSER_SETTINGS."""
    parser_options = {
        "languages": self.LANGUAGES,
        "settings": self.DATE_PARSER_SETTINGS,
    }
    self._date_parser = DateDataParser(**parser_options)
def test_confidence_threshold_setting_is_applied():
    """Check that the confidence threshold changes the detected locale.

    The same ambiguous date string must resolve to 'en' with a threshold of
    0.6 and to 'fr' with a threshold of 0.4.
    """

    def detected_locale(threshold):
        # Build a parser using the language-detection hook and the given
        # threshold, then report the locale chosen for the sample date.
        ddp = DateDataParser(
            detect_languages_function=detect_languages,
            settings={'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': threshold},
        )
        return ddp.get_date_data('21/06/2020').locale

    assert detected_locale(0.6) == 'en'
    assert detected_locale(0.4) == 'fr'
def test_check_settings_extra_check_default_languages(self):
    """Check that an unknown language in "DEFAULT_LANGUAGES" is rejected."""
    expected_message = (
        "Found invalid languages in the 'DEFAULT_LANGUAGES' setting: 'abcd'"
    )
    invalid_settings = {'DEFAULT_LANGUAGES': ["abcd"]}
    with self.assertRaisesRegex(SettingValidationError, expected_message):
        DateDataParser(settings=invalid_settings)