def get_office_strings(self, sha256):
        """Collect the title, roles and departments of a parsed document.

        The parsed JSON is looked up by ``sha256``.  When nothing is found
        the marker dict ``{'smart_parser_data_not_found': 1}`` is returned.
        Otherwise the result has the keys ``'title'`` (whitespace-normalized
        sheet title passed through ``TPredictionCase.truncate_title``),
        ``'roles'`` and ``'departments'`` (lists of distinct normalized
        strings; a new value is only added while fewer than ten are held).
        """
        max_values = 10

        parsed = self.retrieve_json_by_sha256(sha256)
        if parsed is None:
            return {'smart_parser_data_not_found': 1}

        title = ""
        sheet_props = parsed.get('document_sheet_props', [])
        if len(sheet_props) > 0:
            sheet_title = sheet_props[0].get('sheet_title')
            if sheet_title is not None:
                title = normalize_whitespace(sheet_title)
        _, title = TPredictionCase.truncate_title(title)

        seen_roles = set()
        seen_departments = set()
        for entry in parsed.get('persons', []):
            person = entry.get('person', {})

            role = person.get('role')
            if (role is not None and len(role) > 0
                    and len(seen_roles) < max_values):
                seen_roles.add(normalize_whitespace(role))

            department = person.get('department')
            if (department is not None and len(department) > 0
                    and len(seen_departments) < max_values):
                seen_departments.add(normalize_whitespace(department))

        return {
            'title': title,
            'roles': list(seen_roles),
            'departments': list(seen_departments),
        }
Exemple #2
0
def get_regions(inp, start_after_rf_line=True):
    """Yield ``(region, other_columns)`` for every region line of ``inp``.

    ``inp`` is an iterable of tab-separated lines.  When
    ``start_after_rf_line`` is true, every line up to and including the
    first one containing "Российская Федерация" is skipped.  Lines that
    contain one of the stop words (case-insensitively) are skipped as well.

    Raises ``Exception`` if the first column cannot be resolved to a region,
    or, once the input is exhausted, if some region returned by
    ``iterate_inner_regions_without_joined()`` never appeared in the input.
    """
    stop_words = (
        'среднедушевые', 'рублей', '2016 год', 'i квартал',
        'российская федерация', 'федеральный округ', 'наименование',
        'в том числе', ' без ', '(кроме ',
        'ямало-ненецкий автономный округ', 'предварительные', 'начиная',
    )
    regions = TRussianRegions()
    header_passed = not start_after_rf_line
    seen_region_ids = set()

    for line in inp:
        if not header_passed:
            # The header line itself is consumed, data starts after it.
            header_passed = "Российская Федерация" in line
            continue

        lowered = line.lower()
        if any(word in lowered for word in stop_words):
            continue

        name_cell, *values = line.split("\t")
        # A Latin 'H' in the region name is replaced with the Cyrillic 'Н'.
        region_str = normalize_whitespace(name_cell.strip().replace('H', 'Н'))
        region = regions.get_region_in_nominative(region_str)
        if region is None:
            raise Exception("cannot find region {}".format(region_str))
        seen_region_ids.add(region.id)
        yield region, values

    for known in regions.iterate_inner_regions_without_joined():
        if known.id not in seen_region_ids:
            raise Exception(
                "region {}, id={} is not found in the input file".format(
                    known.name, known.id))
def convert_vehicle(name):
    """Return a compact key for a vehicle name.

    The name is passed through ``unidecode`` and ``normalize_whitespace``,
    then spaces and hyphens are removed and the result is lower-cased.
    """
    transliterated = normalize_whitespace(unidecode(name))
    return transliterated.replace(" ", "").replace("-", "").lower()
 def get_region_in_nominative_and_dative(self, russian_name):
     """Resolve a region by a name given in dative or nominative form.

     The name is stripped, lower-cased and whitespace-normalized.  The first
     region having a dative form that the name ends with is returned;
     otherwise the lookup falls back to ``get_region_in_nominative``.
     """
     query = normalize_whitespace(russian_name.strip().lower())
     for candidate in self.regions:
         if any(query.endswith(form) for form in candidate.dative_forms):
             return self._region_id_to_region[candidate.id]
     return self.get_region_in_nominative(query)
 def get_region_all_forms(self, text, unknown_region=None):
     """Return the region id whose matched form in ``text`` is the longest.

     ``text`` is stripped, lower-cased and whitespace-normalized before it
     is scanned with ``self.all_forms``.  On equal lengths the earliest
     reported match wins.  ``unknown_region`` is returned when nothing
     matches.
     """
     query = normalize_whitespace(text.strip().lower())
     winner = unknown_region
     longest = 0
     for _end, (region_id, form) in self.all_forms.iter(query):
         form_len = len(form)
         if form_len <= longest:
             continue
         winner, longest = region_id, form_len
     return winner
Exemple #6
0
 def get_abridged_normalized_person_name(self):
     """Return the name as "Family I. P." (family name plus initials).

     An unresolved name is returned as the whitespace-normalized,
     lower-cased input string instead.  Empty first name or patronymic
     contribute no initial.
     """
     if not self.is_resolved:
         return normalize_whitespace(self.input_person_name).lower()

     pieces = [self.family_name.title()]
     for name_part in (self.first_name, self.patronymic):
         if len(name_part) >= 1:
             pieces.append(name_part[0].upper() + ".")
     return " ".join(pieces)
Exemple #7
0
def normalize_fio_before_db_insert(fio):
    """Clean up a person name and return it title-cased.

    Steps: double quotes become spaces, whitespace is normalized, leading
    and trailing spaces/hyphens are removed, and if the result starts with
    a digit, the leading run of digits, spaces and dots is dropped.

    Compared to the previous version, quotes are replaced *before*
    whitespace normalization and spaces are stripped together with
    hyphens, so inputs like '- Иванов' or 'Иванов "Иван"' no longer keep
    stray leading or doubled spaces.
    """
    # NOTE(review): assumes normalize_whitespace collapses runs of
    # whitespace into single spaces - confirm against its definition.
    fio = normalize_whitespace(fio.replace('"', ' '))
    # Strip spaces and hyphens in one pass; stripping '-' alone after a
    # plain strip() would expose an inner space ("- X" -> " X").
    fio = fio.strip(' -')
    if len(fio) > 0 and fio[0].isdigit():
        start = 0
        while start < len(fio) and (fio[start].isdigit()
                                    or fio[start] == ' '
                                    or fio[start] == '.'):
            start += 1
        fio = fio[start:]
    return fio.title()
 def get_title_from_smart_parser_json(js):
     """Return the whitespace-normalized title of the first document sheet.

     The string ``"null"`` is returned when ``js`` is ``None``, when it has
     no (or empty, or null) ``'document_sheet_props'``, or when the first
     sheet has no ``'sheet_title'`` or an explicit null one.
     """
     default_value = "null"
     if js is None:
         return default_value

     # `or []` also covers an explicit null value for the key.
     props = js.get('document_sheet_props') or []
     if len(props) < 1:
         return default_value

     # A present-but-null title must not reach normalize_whitespace.
     title = props[0].get('sheet_title')
     if title is None:
         return default_value
     return normalize_whitespace(title)
Exemple #9
0
 def __init__(self, title):
     """Store the title (raw and whitespace-normalized) and reset all
     fields that are filled in later to their empty values."""
     # Normalized and original forms of the incoming title.
     self.title = normalize_whitespace(title)
     self.input_title = title

     # Fields with no value yet.
     self.type = None
     self.decl_objects = []
     self.decl_time = None
     self.declarant_positions = []
     self.fio = None

     # Region lookup helper and region/organisation fields.
     self.regions = TRussianRegions()
     self.region_name = None
     self.org_name = None
 def get_region_using_automat(self, automat, text, unknown_region=None):
     """Find the first stand-alone region form in ``text``.

     ``text`` is stripped, lower-cased, has commas replaced by spaces and
     is whitespace-normalized.  A match reported by ``automat.iter`` is
     accepted only when the characters directly before and after it (if
     any) are not alphanumeric.

     Returns ``(region_id, start, end)`` for the first accepted match,
     where ``end`` is the reported position plus one, or
     ``(unknown_region, None, None)`` when no match is accepted.
     """
     prepared = normalize_whitespace(text.strip().lower().replace(',', ' '))
     text_len = len(prepared)
     # presumably automat.iter yields the index of the last matched
     # character together with the stored (region_id, form) value
     for end_index, (region_id, form) in automat.iter(prepared):
         begin = end_index + 1 - len(form)
         after = end_index + 1
         glued_left = begin > 0 and prepared[begin - 1].isalnum()
         glued_right = after < text_len and prepared[after].isalnum()
         if not (glued_left or glued_right):
             return region_id, begin, after
     return unknown_region, None, None
Exemple #11
0
 def get_normalized_person_name(self):
     """Return the name as "Family First[.] [Patronymic[.]]" in title case.

     A dot follows the first name or the patronymic when the corresponding
     ``*_is_abridged`` flag is set; an empty patronymic is omitted.  An
     unresolved name is returned as the whitespace-normalized, lower-cased
     input string instead.
     """
     if not self.is_resolved:
         return normalize_whitespace(self.input_person_name).lower()

     first = self.first_name.title()
     if self.first_name_is_abridged:
         first += '.'
     pieces = [self.family_name.title(), first]

     if len(self.patronymic) > 0:
         patronymic = self.patronymic.title()
         if self.patronymic_is_abridged:
             patronymic += '.'
         pieces.append(patronymic)

     return " ".join(pieces)
 def _build_automatons_without_russia(self):
     """Build the ``nominative_forms`` and ``all_capitals`` automatons.

     Every region except the one with id
     ``TRussianRegions.Russia_as_s_whole_region_id`` contributes its
     lower-cased name and short name to ``nominative_forms`` and its
     normalized lower-cased capital to ``all_capitals`` and
     ``capitals_to_regions``.  A capital containing 'ё' is additionally
     registered with 'ё' replaced by 'е'.
     """
     self.nominative_forms = ahocorasick.Automaton()
     self.all_capitals = ahocorasick.Automaton()

     for region in self.regions:
         if region.id == TRussianRegions.Russia_as_s_whole_region_id:
             continue

         for form in {region.name, region.short_name}:
             key = form.lower()
             self.nominative_forms.add_word(key, (region.id, key))

         capital = normalize_whitespace(region.capital.lower())
         spellings = [capital]
         if 'ё' in capital:
             spellings.append(capital.replace("ё", "е"))
         for spelling in spellings:
             self.capitals_to_regions[spelling] = region
             self.all_capitals.add_word(spelling, (region.id, spelling))

     self.all_capitals.make_automaton()
     self.nominative_forms.make_automaton()
 def clean_title(title):
     """Strip boilerplate phrases from a web page title.

     Returns ``""`` for ``None`` and for titles that, after cleaning, equal
     one of a fixed set of generic names (compared in lower case).
     Otherwise returns the whitespace-normalized title with boilerplate
     phrases removed case-insensitively and surrounding separator
     characters stripped.

     The previous version passed ``re.IGNORECASE`` as the positional
     ``count`` argument of ``re.sub``, so matching was case-sensitive and
     limited to two replacements; the flag is now passed via ``flags=``.
     """
     if title is None:
         return ""
     title = normalize_whitespace(title)
     # Alternatives are tried left to right, so a longer phrase must come
     # before any phrase that is its prefix.  Case variants of the same
     # phrase are merged because matching is case-insensitive.
     boilerplate = (
         r"Главная страница сайта"
         r"|Главная страница"
         r"|Добро пожаловать на сайт"
         r"|Добро пожаловать!"
         r"|Добро пожаловать"
         r"|Срок регистрации домена закончился"
         r"|Срок действия тарифа истек"
         r"|403 Forbidden"
         r"|Общая информация \|"
         r"|на сайт —"
         r"|Новости"
         r"|None"
         r"|title"
         r"|Portal"
         r"|main"
     )
     title = re.sub(boilerplate, "", title,
                    flags=re.IGNORECASE).strip(" |:-—")
     if title.lower() in {
             "главная", "новости", "основные сведения", "управление",
             "официальный сайт", "сайт школы", "администрация",
             "идет проверка..."
     }:
         return ""
     title = re.sub(r"Главная ([|-])", "", title,
                    flags=re.IGNORECASE).strip(" |:-")
     return title
Exemple #14
0
 def __init__(self, source_file, input_text):
     """Set up the recognizer state for one document and run the finders.

     The smart-parser person count and title come from
     ``get_smart_parser_result(source_file)`` or default to ``0`` and ``""``
     when ``source_file`` is ``None``.  ``input_text`` is stored
     whitespace-normalized, with its first 500 characters kept separately
     in ``start_text``.
     """
     self.source_file = source_file
     self.verdict = DL_RECOGNIZER_ENUM.UNKNOWN

     if self.source_file is None:
         person_count, parser_title = 0, ""
     else:
         person_count, parser_title = get_smart_parser_result(
             self.source_file)
     self.smart_parser_person_count = person_count
     self.smart_parser_title = parser_title

     cleaned_text = normalize_whitespace(input_text)
     self.start_text = cleaned_text[:500]
     self.input_text = cleaned_text

     self.text_features = defaultdict(TTextFeature)
     self.description = ""
     self.normal_russian_text_coef = get_russian_normal_text_ratio(
         self.input_text)

     # The finders run last, after all fields above have been assigned.
     self.find_other_document_types()
     self.find_header()
     self.find_other_document_types_in_smart_parser_title()
def all_positions_words(section):
    """Return the set of lower-cased words of ``section.position``.

    The position is whitespace-normalized and split on single spaces; an
    empty set is returned when the position is ``None``.
    """
    position = section.position
    if position is None:
        return set()
    words = normalize_whitespace(position).lower().split(' ')
    return set(words)
Exemple #16
0
 def normalize_string(s):
     """Return ``s`` lower-cased and passed through ``normalize_whitespace``."""
     lowered = s.lower()
     return normalize_whitespace(lowered)