def parse_date_string(self, date_string: str, captures: Dict[str, List], locale: Locale):
    """
    Turn a matched date substring into a parsed date value.

    Parsing is attempted in stages:

    1. Without a locale: ``dateparser`` and ``dateutil`` both parse the raw
       string and the ``dateutil`` result is used whenever the two differ.
    2. With a locale: ``dateparser`` is called with the full locale and,
       if that raises, once more with only the locale's language.
    3. If the previous stage raised: the string is cleaned with
       ``self._find_and_replace`` and handed to ``dateutil``; a time zone
       token found during cleaning is applied via ``self._add_tzinfo``.

    :param date_string: candidate date text
    :param captures: capture groups forwarded to ``self._find_and_replace``
    :param locale: locale object; a falsy value selects the locale-agnostic path
    :return: the parsed value, or None when the cleaned string is shorter
        than 3 characters or the final ``dateutil`` attempt fails
    """
    # For well formatted string, we can already let dateparser parse them
    # otherwise self._find_and_replace method might corrupt them
    was_raised_error = False
    as_dt = None
    if not locale:
        try:
            as_dt = dateparser.parse(
                date_string,
                settings={'RELATIVE_BASE': self.base_date})
            # Dateparser has issues with time when parsing something like `29MAY19 1350`
            as_dateutil = parser.parse(date_string, default=self.base_date)
            if as_dt != as_dateutil:
                as_dt = as_dateutil
        except (ValueError, OverflowError):
            # dateutil documents OverflowError for out-of-range numbers;
            # treat it like any other unparseable input and fall back.
            was_raised_error = True
    else:
        try:
            # Diagnostic output goes through the logger instead of stdout.
            logger.debug(
                'Parsing %s with dateparser (base date: %s, locale type: %s)',
                date_string, self.base_date, type(locale))
            as_dt = dateparser.parse(
                date_string,
                settings={'RELATIVE_BASE': self.base_date},
                locales=[locale.get_locale()])
        except ValueError:
            was_raised_error = True
        # Try to parse date using only language
        if was_raised_error:
            try:
                as_dt = dateparser.parse(
                    date_string,
                    settings={'RELATIVE_BASE': self.base_date},
                    languages=[locale.language])
                was_raised_error = False
            except ValueError:
                pass

    if was_raised_error:
        # replace tokens that are problematic for dateutil
        date_string, tz_string = self._find_and_replace(date_string, captures)

        # One last sweep after removing
        date_string = date_string.strip(self.STRIP_CHARS)
        # Match strings must be at least 3 characters long
        # < 3 tends to be garbage
        if len(date_string) < 3:
            return None

        try:
            logger.debug('Parsing %s with dateutil', date_string)
            as_dt = parser.parse(date_string, default=self.base_date)
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: an unparseable fragment yields no date.
            logger.debug(e)
            as_dt = None
        # tz_string only exists on this fallback path.
        if tz_string:
            as_dt = self._add_tzinfo(as_dt, tz_string)
    return as_dt
def test_locales_convert(self):
    """
    Check that differently formatted English locale strings all yield
    language ``'en'`` and the expected upper-case locale code.
    """
    expected_language = 'en'
    cases = [
        {'input': 'en', 'output_locale_code': 'EN'},
        {'input': 'en-US', 'output_locale_code': 'US'},
        {'input': 'en/Gb', 'output_locale_code': 'GB'},
        {'input': 'En_us', 'output_locale_code': 'US'},
    ]
    for case in cases:
        parsed = Locale(case['input'])
        self.assertEqual(parsed.language, expected_language)
        self.assertEqual(parsed.locale_code, case['output_locale_code'])
def test_dates(self):
    # Runs get_date_list with the 'de' locale over a German legal citation
    # and checks the number of dates found, their character spans, their
    # parsed values and their source substrings.
    #
    # NOTE(review): the spans asserted below are character offsets into
    # `text`, so they depend on the exact whitespace of this literal. The
    # literal appears whitespace-collapsed in this view - confirm the
    # original line breaks and indentation before reformatting it.
    text = """ Ausfertigungsdatum: 23.05.1975 Vollzitat: \ "Gesetz über vermögenswirksame Leistungen für Beamte, Richter, Berufssoldaten und \ Soldaten auf Zeit in der Fassung der Bekanntmachung vom 16. Mai 2002 (BGBl. I S. 1778), \ das zuletzt durch Artikel 39 des Gesetzes vom 29. März 2017 (BGBl. I S. 626) geändert worden ist" \ Stand: Neugefasst durch Bek. v. 16.5.2002 I 1778; \ zuletzt geändert durch Art. 39 G v. 29.3.2017 I 626""".strip()
    ds = get_date_list(text=text, locale=Locale('de'))
    self.assertEqual(5, len(ds))
    # Sort by start offset so results can be compared index-by-index.
    ds.sort(key=lambda d: d['location_start'])
    # Character spans of each detected date.
    self.assertEqual((20, 30), (ds[0]['location_start'], ds[0]['location_end']))
    self.assertEqual((196, 208), (ds[1]['location_start'], ds[1]['location_end']))
    self.assertEqual((282, 295), (ds[2]['location_start'], ds[2]['location_end']))
    self.assertEqual((381, 390), (ds[3]['location_start'], ds[3]['location_end']))
    self.assertEqual((443, 452), (ds[4]['location_start'], ds[4]['location_end']))
    # Parsed values: the long and the numeric spelling of a date must agree.
    self.assertEqual(datetime.datetime(1975, 5, 23, 0, 0), ds[0]['value'])
    self.assertEqual(datetime.datetime(2002, 5, 16, 0, 0), ds[1]['value'])
    self.assertEqual(datetime.datetime(2017, 3, 29, 0, 0), ds[2]['value'])
    self.assertEqual(datetime.datetime(2002, 5, 16, 0, 0), ds[3]['value'])
    self.assertEqual(datetime.datetime(2017, 3, 29, 0, 0), ds[4]['value'])
    # Source substrings exactly as they occur in the text.
    self.assertEqual('23.05.1975', ds[0]['source'])
    self.assertEqual('16. Mai 2002', ds[1]['source'])
    self.assertEqual('29. März 2017', ds[2]['source'])
    self.assertEqual('16.5.2002', ds[3]['source'])
    self.assertEqual('29.3.2017', ds[4]['source'])
def get_copyright_annotations(
        locale: str,
        text: str,
        return_sources: bool = False) -> \
        Generator[CopyrightAnnotation, None, None]:
    """
    Yield copyright annotations from the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param return_sources: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(text, return_sources)
def get_definition_annotations(
        locale: str,
        text: str,
        **kwargs) \
        -> Generator[DefinitionAnnotation, None, None]:
    """
    Yield definition annotations from the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param kwargs: extra keyword arguments passed straight to the routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(text, **kwargs)
def get_duration_annotations(
        locale: str,
        text: str,
        float_digits: int = 4) \
        -> Generator[DurationAnnotation, None, None]:
    """
    Yield duration annotations from the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param float_digits: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(text, float_digits)
def get_date_annotations(
        text: str,
        strict: Optional[bool] = None,
        locale: Optional[str] = '',
        _base_date: Optional[datetime.datetime] = None,
        _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations produced by the module-level ``parser``.

    :param text: text to search
    :param strict: strictness flag; ``None`` is treated as ``False``
    :param locale: locale string, wrapped in ``Locale`` before the call
    :param _base_date: accepted for signature compatibility; not used here
    :param _threshold: accepted for signature compatibility; not used here
    :return: generator over the parser's date annotations
    """
    is_strict = False if strict is None else strict
    yield from parser.get_date_annotations(text, Locale(locale), is_strict)
def get_date_annotations(
        locale: str,
        text: str,
        strict: Optional[bool] = None,
        base_date: Optional[datetime] = None,
        threshold: float = 0.50) -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations from the routine registered for the language
    of ``locale``.

    :param locale: locale string; its language selects the routine and the
        original string is also passed on to that routine
    :param text: text to search
    :param strict: forwarded unchanged to the selected routine
    :param base_date: forwarded unchanged to the selected routine
    :param threshold: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(text, strict, locale, base_date, threshold)
def get_amount_annotations(
        locale: str,
        text: str,
        extended_sources: bool = True,
        float_digits: int = 4,
) -> Generator[AmountAnnotation, None, None]:
    """
    Yield amount annotations from the routine registered for the language
    of ``locale``.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param extended_sources: forwarded unchanged to the selected routine
    :param float_digits: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(text, extended_sources, float_digits)
def __init__(
        self,
        text: Optional[str] = None,
        locale: Locale = Locale('en-US'),
        dateparser_settings: Optional[Dict[str, Any]] = None,
        enable_classifier_check: bool = False,
        classifier_model: Optional[Any] = None,
        classifier_threshold: float = 0.5):
    """
    Initialise the parser with ``DATE_MODEL_CHARS`` as the first argument
    of the parent initialiser; every other argument is passed through
    positionally and unchanged.

    :param text: optional text handed to the parent initialiser
    :param locale: locale object handed to the parent initialiser
    :param dateparser_settings: optional settings dict for the parent
    :param enable_classifier_check: classifier switch, off by default here
    :param classifier_model: optional classifier object for the parent
    :param classifier_threshold: threshold value for the parent
    """
    super().__init__(
        DATE_MODEL_CHARS,
        text,
        locale,
        dateparser_settings,
        enable_classifier_check,
        classifier_model,
        classifier_threshold,
    )
def get_court_annotations(
        locale: str,
        text: str,
        court_config_list: List[DictionaryEntry],
        priority: bool = False,
        text_locales: List[str] = (),
        simplified_normalization: bool = False
) -> Generator[CourtAnnotation, None, None]:
    """
    Yield one court annotation per dictionary match found in ``text``.

    :param locale: locale string; its language is the default language of
        the search and the fallback locale of each annotation
    :param text: text to search
    :param court_config_list: dictionary entries to look for
    :param priority: when true, conflicts are resolved with
        ``conflicts_take_first_by_id``; otherwise no resolver is passed
    :param text_locales: locale strings whose languages are passed as
        ``text_languages`` to the search
    :param simplified_normalization: forwarded to ``find_dict_entities``
    :return: generator of ``CourtAnnotation`` objects
    """
    base_locale = Locale(locale)
    resolver = conflicts_take_first_by_id if priority else None
    languages = [Locale(item).language for item in text_locales]
    matches = find_dict_entities(
        text,
        court_config_list,
        default_language=base_locale.language,
        conflict_resolving_func=resolver,
        text_languages=languages,
        simplified_normalization=simplified_normalization)

    for match in matches:
        annotation = CourtAnnotation(coords=match.coords)

        # First element of the match: the dictionary entry itself.
        entry = match.entity[0]  # type: DictionaryEntry
        if entry:
            annotation.entity_id = entry.id
            annotation.entity_category = entry.category
            annotation.entity_priority = entry.priority
            annotation.name_en = entry.entity_name
            annotation.name = entry.name
            extras = entry.extra_columns
            if extras:
                # Copy every extra column onto the annotation as an attribute.
                for column in extras:
                    setattr(annotation, column, extras[column])

        # Second element of the match: the alias that was hit.
        alias = match.entity[1]
        if alias:
            annotation.alias = alias.alias
            annotation.locale = alias.language

        # Without a language from the alias, use the requested locale's one.
        if not annotation.locale:
            annotation.locale = base_locale.language
        yield annotation
def get_geoentity_annotations(
        locale: str,
        text: str,
        geo_config_list: List[DictionaryEntry],
        conflict_resolving_field: str = 'none',
        priority_direction: str = 'asc',
        text_languages: List[str] = None,
        min_alias_len: Optional[int] = None,
        prepared_alias_ban_list: Optional[Dict[str, Tuple[List[str], List[str]]]] = None,
        simplified_normalization: bool = False
) -> Generator[GeoAnnotation, None, None]:
    """
    Yield geo-entity annotations from the routine registered for the
    language of ``locale``.

    Every argument except ``locale`` is forwarded positionally, in
    declaration order, to the selected routine.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param geo_config_list: dictionary entries passed to the routine
    :param conflict_resolving_field: forwarded unchanged
    :param priority_direction: forwarded unchanged
    :param text_languages: forwarded unchanged
    :param min_alias_len: forwarded unchanged
    :param prepared_alias_ban_list: forwarded unchanged
    :param simplified_normalization: forwarded unchanged
    :return: generator over whatever the selected routine yields
    """
    language = Locale(locale).language
    # Languages without a registered routine use the default-language one.
    fallback = ROUTINE_BY_LOCALE[DEFAULT_LANGUAGE.code]
    extract = ROUTINE_BY_LOCALE.get(language, fallback)
    yield from extract(
        text,
        geo_config_list,
        conflict_resolving_field,
        priority_direction,
        text_languages,
        min_alias_len,
        prepared_alias_ban_list,
        simplified_normalization,
    )
def get_date_annotations(text: str,
                         strict: Optional[bool] = None,
                         locale: Optional[str] = '',
                         base_date: Optional[datetime.datetime] = None,
                         threshold: float = 0.50) \
        -> Generator[DateAnnotation, None, None]:
    """
    Yield date annotations whose classifier score reaches ``threshold``.

    Raw candidates come from ``get_raw_date_list``; each is scored by
    ``MODEL_DATE`` on features computed from its span in the text.

    :param text: raw text to search
    :param strict: whether to return only complete or strict matches;
        ``None`` is treated as ``False``
    :param locale: locale string
    :param base_date: base date to use for implied or partial matches
    :param threshold: probability threshold to use for false positive classifier
    :return: generator of ``DateAnnotation`` objects
    """
    if strict is None:
        strict = False

    candidates = get_raw_date_list(
        text,
        strict=strict,
        base_date=base_date,
        return_source=True,
        locale=Locale(locale))

    for candidate in candidates:
        # candidate[1] holds the start and end offsets of the match.
        span = candidate[1]
        features = get_date_features(
            text, span[0], span[1], characters=DATE_MODEL_CHARS)
        feature_row = DateFeaturesDataframeBuilder.build_feature_df(features)
        probabilities = MODEL_DATE.predict_proba(
            feature_row.loc[:, MODEL_DATE.columns])
        # Column 1 of the first row is the score compared to the threshold.
        score = probabilities[0, 1]
        if score >= threshold:
            yield DateAnnotation(coords=span, date=candidate[0], score=score)
def __init__(
        self,
        characters: List[str],
        text: Optional[str] = None,
        locale: Locale = Locale('en-US'),
        dateparser_settings: Optional[Dict[str, Any]] = None,
        enable_classifier_check: bool = True,
        classifier_model: Optional[Any] = None,
        classifier_threshold: float = 0.5):
    """
    Store the parser configuration and start with an empty list of dates.

    :param characters: character list kept on the instance
    :param text: optional text kept on the instance
    :param locale: locale object with language code and locale code
    :param dateparser_settings: dict - settings for dateparser; a falsy
        value selects ``self.DEFAULT_DATEPARSER_SETTINGS``
    :param enable_classifier_check: bool - enable date check using classifier model
    :param classifier_model: obj - classifier itself
    :param classifier_threshold: float 0<x<1 - min value to predict date
    """
    # Input and locale
    self.text = text
    self.locale = locale
    self.characters = characters

    # Result container, filled later
    self.dates = []

    # Classifier configuration
    self.classifier_model = classifier_model
    self.classifier_threshold = classifier_threshold
    self.enable_classifier_check = enable_classifier_check

    # dateparser configuration
    if dateparser_settings:
        self.dateparser_settings = dateparser_settings
    else:
        self.dateparser_settings = self.DEFAULT_DATEPARSER_SETTINGS
("{0} through {1}".format(d.isoformat(), d2.isoformat()), [d, d2])) examples.append( ("{0} through {1}".format(d.strftime("%b d, %Y"), d2.strftime("%b d, %Y")), [d, d2])) except ValueError: continue # Output output_path = 'test_date_model.pickle' if save: output_path = os.path.join(MODULE_PATH, 'date_model.pickle') build_date_model(examples, output_path, lambda date_str: get_raw_date_list( date_str, strict=False, return_source=True), characters=DATE_MODEL_CHARS) if not save: os.unlink("test_date_model.pickle") parser = DateParser(DATE_MODEL_CHARS, enable_classifier_check=True, locale=Locale('en-US'), classifier_model=MODEL_DATE) _get_dates = parser.get_dates _get_date_list = parser.get_date_list
dates = list(dateparser_dates_dict.values()) for w_date_re, w_date_norm in self.WEIRD_DATES_NORM: w_dates = w_date_re.findall(self.text) for w_date_str in w_dates: date_str = w_date_norm(w_date_str) date_res = self.get_dateparser_dates(date_str, strict) if date_res: dates.append((w_date_str, date_res[0][1])) self.dates = dates parser = ESDateParser(enable_classifier_check=False, locale=Locale('es-ES'), dateparser_settings={ 'PREFER_DAY_OF_MONTH': 'first', 'STRICT_PARSING': False, 'DATE_ORDER': 'DMY' }) def get_date_annotations( text: str, strict: Optional[bool] = None, locale: Optional[str] = '', _base_date: Optional[datetime.datetime] = None, _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]: strict = strict if strict is not None else False yield from parser.get_date_annotations(text, Locale(locale), strict)
def get_court_citation_annotations(
        locale: str,
        text: str,
        language: str = None) -> \
        Generator[CourtCitationAnnotation, None, None]:
    """
    Yield court citation annotations from the routine registered for the
    language of ``locale``.

    :param locale: locale string; only its language selects the routine
    :param text: text to search
    :param language: forwarded unchanged to the selected routine
    :return: generator over whatever the selected routine yields
    """
    requested_language = Locale(locale).language
    # Languages without a registered routine use the ``LANG_DE`` one.
    fallback = ROUTINE_BY_LOCALE[LANG_DE.code]
    extract = ROUTINE_BY_LOCALE.get(requested_language, fallback)
    yield from extract(text, language)
import joblib from lexnlp.extract.all_locales.languages import Locale from lexnlp.extract.common.annotations.date_annotation import DateAnnotation from lexnlp.extract.common.dates import DateParser from lexnlp.extract.de.date_model import DATE_MODEL_CHARS # Setup path MODULE_PATH = os.path.dirname(os.path.abspath(__file__)) # Load model MODEL_DATE = joblib.load(os.path.join(MODULE_PATH, "./date_model.pickle")) parser = DateParser(DATE_MODEL_CHARS, enable_classifier_check=True, locale=Locale('de-DE'), dateparser_settings={ 'PREFER_DAY_OF_MONTH': 'first', 'STRICT_PARSING': False, 'DATE_ORDER': 'DMY' }, classifier_model=MODEL_DATE) def get_date_annotations( text: str, strict: Optional[bool] = None, locale: Optional[str] = '', _base_date: Optional[datetime] = None, _threshold: float = 0.50) -> Generator[DateAnnotation, None, None]: strict = strict if strict is not None else False