Example #1
0
    def _process_child(self, child):
        """
        Build a child record from one child text fragment.

        :param child: text searched with NAME_PATTERN and YEAR_PATTERN.
        :return: dict with "name", "gender", "birthYear" and "kairaId", or
            None when a RegexNoneMatchException escapes the inner handlers
            (e.g. the name search raises it).
        """
        try:
            name_match = regex_utils.safe_search(self.NAME_PATTERN, child,
                                                 self.CHILD_OPTIONS)
            # Trim whitespace, then surrounding dashes, then any spaces that
            # the dash removal exposed.
            name = name_match.group("name").strip().strip("-").strip(" ")

            try:
                gender = Sex.find_sex(name)
            except SexException:
                self.metadata_collector.add_error_record('genderNotFound', 2)
                gender = None

            try:
                year_text = regex_utils.safe_search(
                    self.YEAR_PATTERN, child, self.CHILD_OPTIONS).group("year")
                # Values below 70 get the "19" prefix, all others "18".
                century = "19" if float(year_text) < 70 else "18"
                birth_year = text_utils.int_or_none(century + year_text)
            except regex_utils.RegexNoneMatchException:
                birth_year = None

            return {
                "name": name,
                "gender": gender,
                "birthYear": birth_year,
                "kairaId": self._kaira_id_provider.get_new_id('C')
            }
        except regex_utils.RegexNoneMatchException:
            return None
    def _find_profession(self, text, start_position):
        """
        Search a window of text for a profession.

        The window is taken from text with start_position and SEARCH_SPACE;
        if the keyword "Puol" is found, the window is cut at that keyword.

        :param text: full text to take the search window from.
        :param start_position: position passed to the sub string helper.
        :return: tuple (cleaned profession, cursor). The cursor is the end
            offset of the profession match inside the searched window, or 0
            when no profession matched.
        """
        search_window = text_utils.take_sub_str_based_on_range(
            text, start_position, self.SEARCH_SPACE)
        match_end = 0
        raw_profession = None

        try:
            # A spouse keyword, when present, caps the searched window.
            try:
                spouse_keyword = regexUtils.safe_search(
                    r"Puol", search_window, self.PROFESSION_OPTIONS)
                search_window = text_utils.take_sub_str_based_on_range(
                    search_window, 0, spouse_keyword.start())
            except regexUtils.RegexNoneMatchException:
                pass

            profession_match = regexUtils.safe_search(
                self.PROFESSION_PATTERN, search_window,
                self.PROFESSION_OPTIONS)
            match_end = profession_match.end()
            raw_profession = profession_match.group("profession")
        except regexUtils.RegexNoneMatchException:
            pass

        cleaned_profession = self._clean_professions(raw_profession)
        if cleaned_profession is None:
            self.metadata_collector.add_error_record('professionNotFound', 4)

        return cleaned_profession, match_end
    def _find_omakotitalo(self, text):
        """
        Tell whether OMAKOTITALO_PATTERN is found in text.

        :param text: text to search.
        :return: True when the search succeeds, False when it raises
            RegexNoneMatchException.
        """
        try:
            regexUtils.safe_search(self.OMAKOTITALO_PATTERN, text,
                                   self.OMAKOTITALO_OPTIONS)
        except regexUtils.RegexNoneMatchException:
            return False
        return True
 def _find_patterns(self, text):
     """
     Check which of the configured patterns occur in text.

     :param text: text to search.
     :return: dict mapping every key of patterns_to_find to True when its
         pattern was found and False when the search raised
         RegexNoneMatchException.
     """
     found_flags = {}
     for name, regex in self.patterns_to_find.items():
         try:
             regexUtils.safe_search(regex, text, self.OPTIONS)
         except regexUtils.RegexNoneMatchException:
             was_found = False
         else:
             was_found = True
         found_flags[name] = was_found
     return found_flags
    def _process_child(self, child, child_list):
        """
        Build a child record from one child text fragment.

        A fragment matching the "syntyneet <place>" pattern is not a child:
        its place is written to every entry of child_list whose location is
        still empty, and extraction is stopped with an exception.

        A RegexNoneMatchException raised by the name search is not handled
        here and propagates to the caller.

        :param child: text fragment describing one child.
        :param child_list: child records built so far; updated in place when
            a "syntyneet <place>" fragment is found.
        :return: dict keyed through KEYS with the child name, gender, birth
            year, location name and a new kaira id.
        :raises StopChildExtractionException: when child is a
            "syntyneet <place>" fragment instead of a child.
        """
        # Raw string: "\s" and "\w" are regex escapes, not Python string
        # escapes. The "{s<=1}" part is presumably the fuzzy-matching syntax
        # of the `regex` module - confirm against regex_utils.
        birth_loc = regex_utils.search(r"syntyneet{s<=1}\s(?P<location>\w*)",
                                       child, self.CHILD_OPTIONS)
        if birth_loc is not None:
            # found a "Syntyneet <place>" string. Set it to the previous children.
            for c in child_list:
                if c[KEYS["childLocationName"]] == "":
                    c[KEYS["childLocationName"]] = birth_loc.group("location")
            raise StopChildExtractionException(
                'Child extraction should be stopped here. Current child is not valid child.'
            )

        name = regex_utils.safe_search(self.NAME_PATTERN, child,
                                       self.CHILD_OPTIONS).group("name")
        # Trim whitespace, then surrounding dashes, then any spaces that the
        # dash removal exposed.
        name = name.strip().strip("-").strip(" ")

        try:
            gender = Sex.find_sex(name)
        except SexException:
            self.metadata_collector.add_error_record('genderNotFound', 2)
            gender = None

        try:
            year_match = regex_utils.safe_search(self.YEAR_PATTERN, child,
                                                 self.CHILD_OPTIONS)
            year = year_match.group("year")
            # Values below 70 get the "19" prefix, all others "18".
            if float(year) < 70:
                year = "19" + year
            else:
                year = "18" + year
        except regex_utils.RegexNoneMatchException:
            year = ""

        try:
            loc_match = regex_utils.safe_search(self.LOCATION_PATTERN, child,
                                                self.CHILD_OPTIONS)
            location = loc_match.group("location").strip().strip("-")
        except regex_utils.RegexNoneMatchException:
            location = ""

        return {
            KEYS["childName"]: name,
            KEYS["gender"]: gender,
            KEYS["birthYear"]: text_utils.int_or_none(year),
            KEYS["childLocationName"]: location,
            KEYS["kairaId"]: self._kaira_id_provider.get_new_id('C')
        }
Example #6
0
    def _find_patterns(self, text):
        """
        Look up a value for every entry of patterns_to_find.

        Each pattern is tried first with QUANTITY_PATTERN in front of it and,
        if that attempt raises RegexNoneMatchException, with QUANTITY_PATTERN
        behind it.

        :param text: text to search.
        :return: dict mapping each key to the processed match, or to None
            when neither combination succeeded.
        """
        results = {}
        for key, pattern in self.patterns_to_find.items():
            candidates = (self.QUANTITY_PATTERN + pattern,
                          pattern + self.QUANTITY_PATTERN)
            for candidate in candidates:
                try:
                    match = regexUtils.safe_search(candidate, text,
                                                   self.OPTIONS)
                    results[key] = self._process_value(match)
                    break
                except regexUtils.RegexNoneMatchException:
                    continue
            else:
                # Neither ordering produced a value.
                results[key] = None

        return results
    def _find_owner_name_details(self, text, start_position):
        """
        Extract the owner's name parts from text.

        :param text: text searched with OWNER_NAME_PATTERN.
        :param start_position: offset added to the match end to form the
            returned cursor.
        :return: tuple (name data, cursor). Without a match the name data is
            ('', '', ''), the cursor is start_position and an
            'ownerNameNotFound' error record is added.
        """
        name_parts = ('', '', '')
        cursor = start_position
        try:
            name_match = regexUtils.safe_search(self.OWNER_NAME_PATTERN,
                                                text, self.OWNER_OPTIONS)
            cursor = start_position + name_match.end()
            name_parts = self._split_names(name_match.group("name"))
        except regexUtils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('ownerNameNotFound', 7)

        return name_parts, cursor
    def _find_owner_year(self, text, start_position):
        """
        Extract the owner's year from text.

        :param text: text searched with OWNER_YEAR_PATTERN.
        :param start_position: offset added to the match end to form the
            returned cursor.
        :return: tuple (year, cursor). The year is the result of
            text_utils.int_or_none on the "year" group, or None when the
            search raised RegexNoneMatchException, in which case an
            'ownerYearNotFound' error record is added.
        """
        cursor_location = start_position
        owner_year = None
        try:
            # The match object gets its own name so that owner_year only
            # ever holds None or the parsed year, never a match object.
            year_match = regexUtils.safe_search(self.OWNER_YEAR_PATTERN, text,
                                                self.OWNER_OPTIONS)
            cursor_location = start_position + year_match.end()
            owner_year = text_utils.int_or_none(year_match.group("year"))
        except regexUtils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('ownerYearNotFound', 2)

        return owner_year, cursor_location
    def _find_date(self, text, start_position):
        """
        Extract a wedding year from text.

        :param text: text searched with PATTERN.
        :param start_position: offset used when computing the cursor.
        :return: tuple (year, cursor). The year is built by prefixing the
            matched "year" group with "19"; it is None when the search
            raised RegexNoneMatchException, and the cursor then stays at
            start_position.
        """
        position = start_position
        year = None
        try:
            match = regex_utils.safe_search(self.PATTERN, text, self.OPTIONS)

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            position = match.end() + start_position - 4
            year = text_utils.int_or_none("19" + match.group("year"))
        except regex_utils.RegexNoneMatchException:
            year = None

        return year, position
Example #10
0
    def _find_family(self, text, start_position):
        """
        Search a window of text for the "family" group of FAMILY_PATTERN.

        :param text: full text to take the search window from.
        :param start_position: position passed to the sub string helper.
        :return: tuple (family, cursor). The cursor is the end offset of the
            match inside the searched window; without a match the result is
            (None, 0).
        """
        search_window = text_utils.take_sub_str_based_on_pos(
            text, start_position, self.SEARCH_SPACE)
        family = None
        match_end = 0

        try:
            family_match = regexUtils.safe_search(
                self.FAMILY_PATTERN, search_window, self.FAMILY_OPTIONS)
            match_end = family_match.end()
            family = family_match.group("family")
        except regexUtils.RegexNoneMatchException:
            pass

        return family, match_end
 def find_location(self, text):
     """
     Search text with PATTERN and hand back the match itself.

     Note: Returns match-object for caller instead of string.

     :param text: text to search.
     :return: tuple (match object, end offset of the match).
     :raises LocationException: when the search raises
         RegexNoneMatchException; the original exception is attached as
         the cause.
     """
     try:
         found_location_match = regex_utils.safe_search(
             self.PATTERN, text, self.OPTIONS)
         cursor_location = found_location_match.end()
         return found_location_match, cursor_location
     except regex_utils.RegexNoneMatchException as err:
         # Chain explicitly so the traceback shows the failed search as the
         # direct cause rather than as an error during handling.
         raise LocationException(text) from err
Example #12
0
    def _get_area(self, text, pattern):
        """
        Read an area value from text.

        :param text: text to search.
        :param pattern: pattern providing the groups "area1" and "area2".
        :return: the "area1" group when it is not None, otherwise the "area2"
            group when it is not None; None when both are None or the search
            raised RegexNoneMatchException.
        """
        try:
            area_match = regexUtils.safe_search(pattern, text,
                                                self.AREA_OPTIONS)
            # "area1" takes precedence; "area2" is only read as a fallback.
            for group_name in ("area1", "area2"):
                value = area_match.group(group_name)
                if value is not None:
                    return value
        except regexUtils.RegexNoneMatchException:
            pass

        return None
 def _find_location_match(self, text):
     """
     Search text for a location name and hand back the match itself.

     Note: Returns match-object for caller instead of string.

     :param text: text to search.
     :return: tuple (match object, end offset of the match); the name is
         available in the "location" group.
     :raises LocationException: when the search raises
         RegexNoneMatchException; the original exception is attached as
         the cause.
     """
     pattern = r'(?:\d+| s)(?:\s|,|\.)(?P<location>[A-ZÄ-Ö]{1,1}[A-ZÄ-Öa-zä-ö-]{1,}(?: mlk)?)'
     try:
         found_location_match = regex_utils.safe_search(
             pattern, text, re.UNICODE)
         cursor_location = found_location_match.end()
         return found_location_match, cursor_location
     except regex_utils.RegexNoneMatchException as err:
         # Chain explicitly so the traceback shows the failed search as the
         # direct cause rather than as an error during handling.
         raise LocationException(text) from err
Example #14
0
    def _find_date(self, text):
        """
        Extract a date from text.

        :param text: text searched with PATTERN.
        :return: tuple (date dict, end offset of the match). The dict has
            the keys 'day', 'month' and 'year'. When written month names are
            extracted, 'day' is '' and month/year come from that result;
            otherwise they come from the year and month/day helpers.
        :raises DateException: when a RegexNoneMatchException is raised
            during extraction; the original exception is attached as the
            cause.
        """
        try:
            found_date_matches = regex_utils.safe_search(
                self.PATTERN, text, self.OPTIONS)
            months_and_years_from_words = \
                self._if_written_month_names_extract_them(found_date_matches)
            cursor_location = found_date_matches.end()

            if months_and_years_from_words is not None:
                return {
                    'day': '',
                    'month': months_and_years_from_words[0],
                    'year': months_and_years_from_words[1],
                }, cursor_location

            year = self._get_year_from_match(found_date_matches)
            day_and_month = self._get_month_and_day_from_match(
                found_date_matches)
            return {
                'day': day_and_month['day'],
                'month': day_and_month['month'],
                'year': year,
            }, cursor_location
        except regex_utils.RegexNoneMatchException as err:
            # Chain explicitly so the traceback shows the failed search as
            # the direct cause rather than as an error during handling.
            raise DateException(text) from err
    def _find_children(self, text):
        """
        Extract the children listed in text.

        :param text: text searched with CHILD_PATTERN.
        :return: tuple (children, end offset of the match). Without a match
            the result is ([], 0) and a 'childrenNotFound' error record is
            added.
        """
        child_entries = []
        match_end = 0
        try:
            children_match = regex_utils.safe_search(
                self.CHILD_PATTERN, text, self.CHILD_OPTIONS)
            match_end = children_match.end()
            cleaned = self._clean_children(children_match.group("children"))
            child_entries = self._split_children(cleaned)
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)

        return child_entries, match_end
Example #16
0
    def _get_area(self, text, pattern):
        """
        Read an area value and the match end offset from text.

        :param text: text to search.
        :param pattern: pattern providing the groups "area1" and "area2".
        :return: tuple (area, cursor). The area is the "area1" group when it
            is not None, otherwise the "area2" group when it is not None,
            otherwise None. The cursor is the end offset of the match, or 0
            when the search raised RegexNoneMatchException.
        """
        area_value = None
        match_end = 0
        try:
            area_match = regexUtils.safe_search(pattern, text,
                                                self.AREA_OPTIONS)
            match_end = area_match.end()
            # "area1" takes precedence; "area2" is only read as a fallback.
            for group_name in ("area1", "area2"):
                candidate = area_match.group(group_name)
                if candidate is not None:
                    area_value = candidate
                    break
        except regexUtils.RegexNoneMatchException:
            pass

        return area_value, match_end
    def _find_spouse(self, text, start_position):
        """
        Extract spouse data from text.

        :param text: text searched with PATTERN.
        :param start_position: offset used when computing the cursor.
        :return: tuple (spouse data, cursor). When a RegexNoneMatchException
            is raised before the data is assigned, the spouse data is None
            and the cursor stays at start_position.
        """
        spouse = None
        position = start_position

        try:
            spouse_match = regex_utils.safe_search(
                self.PATTERN, text, self.OPTIONS)
            spouse = self._find_spouse_data(spouse_match.group("spousedata"))

            # Dirty fix for inaccuracy in positions which would screw the Location extraction
            position = spouse_match.end() + start_position - 4
        except regex_utils.RegexNoneMatchException:
            pass

        return spouse, position
Example #18
0
    def _find_children(self, text, start_position):
        """
        Extract the children listed in text.

        Every "sekä" in text is replaced with a comma before searching.

        :param text: text searched with CHILD_PATTERN.
        :param start_position: cursor value returned when nothing matched.
        :return: tuple (children entries, cursor). Without a match the
            entries are [], the cursor is start_position and a
            'childrenNotFound' error record is added.
        """
        normalized_text = re.sub(r"sekä", ",", text)
        entries = []
        cursor = start_position

        try:
            children_match = regex_utils.safe_search(
                self.CHILD_PATTERN, normalized_text, self.CHILD_OPTIONS)
            # NOTE(review): on success the cursor is the bare match end,
            # without start_position added - confirm this is intended.
            cursor = children_match.end()
            cleaned = self._clean_children(children_match.group("children"))
            entries = self._split_children(cleaned)
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('childrenNotFound', 5)

        return entries, cursor
    def _find_spouse_data(self, text):
        """
        Build a spouse record from text.

        :param text: text searched with NAMEPATTERN; the part starting two
            characters before the end of the name match is passed on to
            _find_spouse_details.
        :return: dict keyed through KEYS with birth data, former surname,
            spouse name and a new kaira id. When a RegexNoneMatchException
            is raised, a 'spouseNotFound' error record is added and None is
            returned implicitly.
        """
        try:
            name_match = regex_utils.safe_search(self.NAMEPATTERN, text,
                                                 self.OPTIONS)
            # Strip the name and drop a trailing " o" (any whitespace
            # character followed by "o" at the very end).
            spouse_name = re.sub(r"\so$", "", name_match.group("name").strip())

            details_text = text[name_match.end() - 2:]
            # The second element of the result is not used here.
            spouse_details, _ = self._find_spouse_details(details_text)

            # Map data to spouse object
            return {
                KEYS["spouseBirthData"]: {
                    **spouse_details['birthday']
                },
                KEYS["formerSurname"]: spouse_details[KEYS['formerSurname']],
                KEYS["spouseName"]: spouse_name,
                KEYS["kairaId"]: self.kaira_id_provider.get_new_id('S')
            }

        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 7)
    def _find_spouse_data(self, sub_text, entry):
        """
        Build a spouse record from sub_text.

        :param sub_text: text searched with NAMEPATTERN; the part starting
            two characters before the end of the name match is passed on to
            _find_spouse_details.
        :param entry: mapping whose 'full_text' value is passed on to
            _find_spouse_details.
        :return: on success a dict made of the 'spouse' details plus spouse
            name, the hasSpouse flag and a new kaira id. When a
            RegexNoneMatchException is raised, a 'spouseNotFound' error
            record is added and the tuple (spouse_name, spouse_details) is
            returned with whatever was collected before the failure.
        """
        # NOTE(review): success returns a dict, failure returns a tuple -
        # confirm callers handle both shapes.
        spouse_name = ''
        spouse_details = None

        try:
            name_match = regex_utils.safe_search(
                self.NAMEPATTERN, sub_text, self.OPTIONS)
            # Strip the name and drop a trailing " o" (any whitespace
            # character followed by "o" at the very end).
            spouse_name = re.sub(r'\so$', '', name_match.group('name').strip())

            details_text = sub_text[name_match.end() - 2:]
            # The second element of the result is not used here.
            all_details, _ = self._find_spouse_details(
                details_text, entry['full_text'])
            spouse_details = all_details
            spouse_details = spouse_details['spouse']

            return {
                **spouse_details,
                KEYS['spouseName']: spouse_name,
                KEYS['hasSpouse']: True,
                KEYS['kairaId']: self.kaira_id_provider.get_new_id('S'),
            }
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('spouseNotFound', 6)

        return spouse_name, spouse_details
    def _find_locations(self, text):
        """
        Extract location records from text.

        The 'asuinpaikat' group of LOCATION_PATTERN is cleaned, parsed with
        migration_parser and every parsed location is turned into one or
        more cleaned location entries.

        :param text: text to search; every whitespace character in it is
            replaced with a regular space first.
        :return: tuple (location entries, end offset of the match). Without
            a match the result is ([], 0) and an 'otherLocationNotFound'
            error record is added.
        """
        # Replace all weird invisible white space characters with regular space
        text = re.sub(r'\s', r' ', text)

        def _get_location_entries(parsed_location):
            """Turn one parsed location into a list of cleaned entries."""
            # Parsed result set may contain municipality and village
            # information. Without a municipality, the 'place' value is used
            # and no village is looked up.
            has_municipality = 'municipality' in parsed_location
            raw_name = parsed_location[
                'municipality' if has_municipality else 'place']

            # Try to normalize place names first so that the coordinate fetch from DB might work better
            entry_name, entry_region = place_name_cleaner.try_to_normalize_place_name_with_known_aliases(
                raw_name, return_region=True)
            village_information = (self._get_village(parsed_location)
                                   if has_municipality else None)

            geocoordinates = get_coordinates_by_name(entry_name)
            entry_name = validate_location_name(entry_name, geocoordinates)

            # If region was in db associated to coordinates, override previously set region with it
            if 'region' in geocoordinates:
                entry_region = geocoordinates['region']

            if 'year_information' not in parsed_location:
                # FIXME: Refactor this to the _postprocess method?
                return [
                    place_name_cleaner.clean_place_name(
                        self._get_location_entry(entry_name, entry_region,
                                                 geocoordinates,
                                                 village_information))
                ]

            location_records = []
            for migration in parsed_location['year_information']:
                moved_in = (text_utils.int_or_none(migration['moved_in'])
                            if 'moved_in' in migration else None)
                moved_out = (text_utils.int_or_none(migration['moved_out'])
                             if 'moved_out' in migration else None)

                # FIXME: Refactor this to the _postprocess method?
                location_records.append(
                    place_name_cleaner.clean_place_name(
                        self._get_location_entry(entry_name, entry_region,
                                                 geocoordinates,
                                                 village_information,
                                                 moved_in, moved_out)))
            return location_records

        location_entries = []
        cursor_location = 0

        try:
            locations_match = regex_utils.safe_search(self.LOCATION_PATTERN,
                                                      text,
                                                      self.LOCATION_OPTIONS)
            cursor_location = locations_match.end()
            cleaned_locations = clean_locations(
                locations_match.group('asuinpaikat'))

            # Parse location string with BNF parser
            parsed_locations = migration_parser.parse_locations(
                cleaned_locations)

            try:
                for parsed in parsed_locations:
                    location_entries.extend(_get_location_entries(parsed))
            except InvalidLocationException:
                # An invalid location ends the loop; entries gathered so far
                # are kept.
                pass
        except regex_utils.RegexNoneMatchException:
            self.metadata_collector.add_error_record('otherLocationNotFound',
                                                     5)

        return location_entries, cursor_location
    def _find_hostess_name(self, text):
        """
        Extract the hostess name from text.

        No exception is handled here, so anything raised by the search
        propagates to the caller.

        :param text: text searched with HOSTESS_NAME_PATTERN.
        :return: tuple (split name, end offset of the match).
        """
        name_match = regexUtils.safe_search(
            self.HOSTESS_NAME_PATTERN, text, self.HOSTESS_OPTIONS)
        match_end = name_match.end()
        return self._split_names(name_match.group("name")), match_end