def __parse_address(self, city, address):
    """Compose the full address for ``city`` and split it into address/place.

    ``u'п/о'`` inside ``address`` is first replaced with ``u'пос.'``.
    Cities listed in ``self.__full_settlement_names`` are used without the
    town prefix and are prepended only when ``address`` does not already
    contain them; any other city is prefixed with ``u'г. '``.

    :returns: the result of ``split_address_place`` for the composed string.
    """
    address = address.replace(u'п/о', u'пос.')
    if city not in self.__full_settlement_names:
        full_address = u'г. %s, %s' % (city, address)
    elif city in address:
        # The settlement name is already part of the address text.
        full_address = address
    else:
        full_address = u'%s, %s' % (city, address)
    return split_address_place(full_address)
def __parse_base_office_exchange(self, item, map_points, point_type, start_names):
    """Parse one office/exchange entry and look up its map coordinates.

    :param item: selector callable scoped to the entry markup.
    :param map_points: iterable of 5-tuples, unpacked here as
        ``(lng, lat, name, address, place)``.
    :param point_type: value stored in ``point.type``.
    :param start_names: prefix (or tuple of prefixes) passed to
        ``str.startswith`` to accept the entry name.
    :returns: the filled ``Point``, or ``None`` when the entry name does not
        start with ``start_names``.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('.name').text())
    if not point.name.startswith(start_names):
        return None

    point.address, point.place = split_address_place(item('.addres strong').text())

    # The last table row carries phones in cell 0 and opening hours in cells 2-5.
    schedule_row = item('.item_block tr:last')
    point.phones = normalize_phones(schedule_row('td:eq(0)').text().split(','))
    day_cells = (
        (u'пн-чт: ', 'td:eq(2)'),
        (u'пт: ', 'td:eq(3)'),
        (u'сб: ', 'td:eq(4)'),
        (u'вс: ', 'td:eq(5)'),
    )
    schedule = [label + schedule_row(selector).text() for label, selector in day_cells]
    point.time = normalize_time(', '.join(schedule))
    point.check_information = CHECK_OFFICIAL

    for lng, lat, name, address, place in map_points:
        # Name and place only have to match when both sides are non-empty;
        # the address must be present on both sides and be contained.
        if point.name and name and point.name not in name:
            continue
        if not (point.address and address and point.address in address):
            continue
        if point.place and place and point.place not in place:
            continue
        point.lat = lat
        point.lng = lng
        point.check_coordinates = CHECK_OFFICIAL
        break
    else:
        warning_not_official_coordinates(point)
    return point
def __parse_terminal(self, item):
    """Build a terminal ``Point`` from a table row and look up coordinates.

    Cell 0 is the city, cell 1 the place, cell 2 the street address and
    cell 3 the working hours.  Coordinates are taken from the first record
    of ``self.__get_coordinates()`` with ``type_id == '2'`` whose description
    tokens are all contained in the point address; only addresses containing
    ``u'Минск'`` are matched.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL

    city = normalize_text(item('td:eq(0)').text())
    street = normalize_text(item('td:eq(2)').text())
    full_address = u'г. %s, %s' % (city.title(), street)
    # The place returned by the splitter is replaced by the dedicated cell.
    point.address, _ = split_address_place(full_address)
    point.place = normalize_text(item('td:eq(1)').text())
    point.time = normalize_time(item('td:eq(3)').text())
    point.check_information = CHECK_OFFICIAL

    for lat, lng, type_id, description in self.__get_coordinates():
        if not (u'Минск' in point.address and type_id == '2'):
            continue
        if all(token in point.address for token in description.split()):
            point.lat = lat
            point.lng = lng
            point.check_coordinates = CHECK_OFFICIAL
            break
    else:
        warning_not_official_coordinates(point)
    return point
def __parse_base_office_exchange(self, item, point_type, name_keywords):
    """Parse an office/exchange row and look up its coordinates.

    :param item: selector callable scoped to the table row.
    :param point_type: value stored in ``point.type``.
    :param name_keywords: prefix (or tuple of prefixes) the name must start
        with.
    :returns: the filled ``Point``, or ``None`` when the name does not start
        with ``name_keywords``.

    Coordinates come from the first ``self.__get_coordinates()`` record with
    ``type_id == '1'`` whose description tokens each occur in the point
    address or name; only addresses containing ``u'Минск'`` are matched.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('th:eq(0) a:eq(0)').text())
    if not point.name.startswith(name_keywords):
        return None

    city = normalize_text(item('td:eq(1)').text())
    street = normalize_text(item('td:eq(2)').text())
    full_address = u'г. %s, %s' % (city, street)
    point.address, point.place = split_address_place(full_address)
    point.check_information = CHECK_OFFICIAL

    for lat, lng, type_id, description in self.__get_coordinates():
        if not (u'Минск' in point.address and type_id == '1'):
            continue
        tokens = description.split()
        if all(token in point.address or token in point.name for token in tokens):
            point.lat = lat
            point.lng = lng
            point.check_coordinates = CHECK_OFFICIAL
            break
    else:
        warning_not_official_coordinates(point)
    return point
def __parse_base(self, item, city, point_type):
    """Parse a map side-panel entry into a ``Point``.

    :param item: selector callable scoped to the entry markup.
    :param city: city name; the address is composed as ``u'г. <city>, ...'``.
    :param point_type: value stored in ``point.type``.
    :returns: the filled ``Point``.

    The details block is split on ``<br>`` boundaries: the line starting
    with ``u'Телефон:'`` supplies the phones, every other non-empty line is
    treated as part of the working hours.
    """
    phone_prefix = u'Телефон:'

    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('.b-map-side>h5').text())
    point.address, point.place = split_address_place(
        u'г. %s, %s' % (city, item('.b-map-side>p span:eq(0)').text()))

    coordinates = item('.b-map-side>p span:eq(1)').text()
    if coordinates:
        point.lat, point.lng = map(strip, coordinates.split(','))

    text_html = replace_br(item('.b-map-side-more').html(), ';;;')
    time_items = []
    for sub_item in map(normalize_text, PQ(text_html).text().split(';;;')):
        if not sub_item:
            continue
        if sub_item.startswith(phone_prefix):
            # Slice the prefix off.  The previous code indexed a single
            # character (``sub_item[len(prefix)]``), which lost the phone
            # numbers and raised IndexError on a bare prefix line.
            point.phones = normalize_phones(sub_item[len(phone_prefix):].split(','))
            continue
        time_items.append(sub_item)
    point.time = normalize_time(', '.join(time_items))
    point.check_information = CHECK_OFFICIAL

    if point.lat and point.lng:
        point.check_coordinates = CHECK_OFFICIAL
    else:
        warning_not_official_coordinates(point)
    return point
def get_offices(self):
    """Collect office points from the XML feed and the regional offices page.

    Every ``item`` element of the XML document is parsed by
    ``self.__parse_office``; falsy results are skipped.  On the regional
    page each ``h2`` element opens a new office and the next accepted
    element supplies its address and phones.

    :returns: list of ``Point`` objects.
    """
    points = []

    items_tree = ET.fromstring(get_url(self.__offices_xml_url))
    for item in items_tree.iter('item'):
        point = self.__parse_office(item)
        if point:
            points.append(point)

    phone_prefix = u'т.ф.:'
    page = PQ(get_url(self.__regional_offices_page_url))
    point = None  # office opened by the latest <h2>, still waiting for details
    for item in map(PQ, page('#content_internal span:eq(0)').children()):
        tag = item[0].tag
        if tag not in self.__regional_offices_tags:
            continue
        if tag == 'h2':
            point = Point()
            point.prov = self.uid
            point.type = TYPE_OFFICE
            point.name = trim_spaces_and_commas(normalize_text(item.text()))
            point.check_information = CHECK_OFFICIAL
            continue
        if not point:
            # Details element without a preceding heading.
            continue

        item_html = replace_br(item.html(), ';;;')
        sub_items = PQ(item_html).text().split(';;;')
        point.address, point.place = split_address_place(sub_items[0])
        for sub_item in map(normalize_text, sub_items[1:]):
            if sub_item.startswith(phone_prefix):
                # Store under ``phones`` like every other parser does; the
                # value used to be written to a stray ``phone`` attribute.
                point.phones = normalize_phones(sub_item[len(phone_prefix):].split(','))
        warning_not_official_coordinates(point)
        points.append(point)
        point = None
    return points
def __parse_base_office_exchange(self, item):
    """Fill the fields shared by office and exchange rows.

    Cells 1-4 of the row hold the name, address, working hours and a
    comma-separated phone list.  ``point.type`` and coordinates are not set
    here.

    :returns: the partially filled ``Point``.
    """
    name_text = item('td:eq(1)').text()
    address_text = item('td:eq(2)').text()
    hours_text = item('td:eq(3)').text()
    phones_text = item('td:eq(4)').text()

    point = Point()
    point.prov = self.uid
    point.name = normalize_text(name_text)
    point.address, point.place = split_address_place(address_text)
    point.time = normalize_time(hours_text)
    point.phones = normalize_phones(phones_text.split(','))
    point.check_information = CHECK_OFFICIAL
    return point
def __parse_terminal(self, item):
    """Build a terminal ``Point`` from a table row.

    Cell 1 holds the address and cell 2 the working hours.
    ``point.deposit`` is True when cell 3 contains the cash top-up marker
    text.  The row carries no coordinates, so the point is always reported
    through ``warning_not_official_coordinates``.

    :returns: the filled ``Point``.
    """
    deposit_marker = u'Пополнение карточки наличными'

    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL

    address_text = item('td:eq(1)').text()
    point.address, point.place = split_address_place(address_text)
    hours_text = item('td:eq(2)').text()
    point.time = normalize_time(hours_text)
    services_text = item('td:eq(3)').text()
    point.deposit = deposit_marker in services_text

    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_atm(self, item):
    """Build an ATM ``Point`` from a table row.

    Cell 1 holds the address, cell 2 the working hours and cell 3 a
    comma-separated currency list.  The row carries no coordinates, so the
    point is always reported through ``warning_not_official_coordinates``.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_ATM

    address_text = item('td:eq(1)').text()
    point.address, point.place = split_address_place(address_text)
    hours_text = item('td:eq(2)').text()
    point.time = normalize_time(hours_text)
    currency_tokens = item('td:eq(3)').text().split(',')
    point.currency = map(strip, currency_tokens)

    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_exchange(self, item):
    """Build an exchange ``Point`` from an element of the form ``name — address``.

    The element text is split on the em dash; the first part becomes the
    name and the second part the address.  An IndexError propagates when the
    text contains no dash.

    :returns: the filled ``Point``.
    """
    parts = item.text().split(u'—')

    point = Point()
    point.prov = self.uid
    point.type = TYPE_EXCHANGE
    point.name = normalize_text(parts[0])
    raw_address = parts[1]
    point.address, point.place = split_address_place(raw_address)
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_terminal(self, item):
    """Build a terminal ``Point`` from a table row.

    Cell 0 is the name (also used as the place), cell 1 the address and
    cell 2 the working hours.  ``point.deposit`` is True when the
    lower-cased text of cell 3 equals ``u'есть'``.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL
    point.name = normalize_text(item('td:eq(0)').text())

    # The place returned by the splitter is discarded in favour of the name.
    address, _ = split_address_place(item('td:eq(1)').text())
    point.address = address
    point.place = point.name

    point.time = normalize_time(item('td:eq(2)').text())
    deposit_flag = normalize_text(item('td:eq(3)').text()).lower()
    point.deposit = (deposit_flag == u'есть')
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_exchange(self, item, city):
    """Build an exchange ``Point`` from a table row of the given city.

    Cell 0 is the name, cell 1 the street address and cell 2 the working
    hours.  Unless the row has exactly four cells, only the part of cell 2
    before ``u'Операции:'`` is used as the working hours.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_EXCHANGE
    point.name = normalize_text(item('td:eq(0)').text())

    full_address = u'г. %s, %s' % (city, item('td:eq(1)').text())
    point.address, point.place = split_address_place(full_address)

    has_four_cells = len(item('td')) == 4
    schedule = item('td:eq(2)').text()
    if not has_four_cells:
        schedule = schedule.split(u'Операции:')[0]
    point.time = normalize_time(schedule)

    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __get_map_points(self, template, id, is_office):
    """Return the placemarks embedded in a map page, cached per URL.

    The page at ``template.format(id)`` is decoded from cp1251 and its
    script text is scanned line by line for ``YMaps.Placemark`` blocks.

    :param template: URL template with one positional placeholder.
    :param id: value substituted into ``template``.
    :param is_office: selects how the placemark description is parsed: for
        offices the ``h1`` is the name and the first ``p`` the address;
        otherwise the first word of ``h1`` is the name, the rest of ``h1``
        the address and the first ``p`` the place.
    :returns: list of ``(lat, lng, name, address, place)`` tuples, where
        ``lat``/``lng`` are the two ``GeoPoint`` arguments in source order.
    """
    url = template.format(id)
    if url in self.__map_points:
        return self.__map_points[url]

    start_place_token = 'var placemark = new YMaps.Placemark(new YMaps.GeoPoint('
    end_place_token = 'map.addOverlay(placemark);'
    start_description_token = 'placemark.description = "'
    end_description_token = '";'

    page = PQ(get_url(url).decode('cp1251'))
    map_points = []
    inside_placemark = False
    lat = lng = name = address = place = None

    for raw_line in page('script').text().splitlines():
        line = strip(raw_line)

        if not inside_placemark:
            if line.startswith(start_place_token):
                inside_placemark = True
                args_begin = len(start_place_token)
                args_end = line.find(')', args_begin)
                lat, lng = [strip(part) for part in line[args_begin:args_end].split(',')]
            continue

        if line.startswith(end_place_token):
            # The placemark is complete: store it and reset the state.
            map_points.append((lat, lng, name, address, place))
            inside_placemark = False
            lat = lng = name = address = place = None
            continue

        if line.startswith(start_description_token):
            description = line[len(start_description_token):-len(end_description_token)]
            info_page = PQ(description)
            if is_office:
                name = normalize_text(info_page('h1').text())
                address, place = split_address_place(info_page('p:eq(0)').text())
            else:
                title_tokens = info_page('h1').text().strip().split()
                name = title_tokens[0]
                address, _ = split_address_place(' '.join(title_tokens[1:]))
                place = normalize_text(info_page('p:eq(0)').text())

    self.__map_points[url] = map_points
    return self.__map_points[url]
def __parse_terminal(self, item):
    """Build a terminal ``Point`` from a table row.

    Cell 1 is the place, cell 2 the address, cell 3 the working hours and
    cell 4 a comma-separated currency list.  ``point.deposit`` is True only
    when at least one non-empty currency is listed.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL
    point.address, point.place = split_address_place(item('td:eq(2)').text())
    point.place = normalize_text(item('td:eq(1)').text())

    # ``str.split`` always returns at least one (possibly empty) item, so
    # without dropping blanks the list is never empty and ``deposit`` would
    # be True for every terminal, even with an empty currency cell.
    stripped = map(strip, item('td:eq(4)').text().split(','))
    point.currency = [currency for currency in stripped if currency]
    point.deposit = bool(point.currency)

    point.time = normalize_time(item('td:eq(3)').text())
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_office(self, item):
    """Build an office ``Point`` from a table row.

    Cell 0 is the name.  Cell 1 holds the address, optionally followed by
    ``u'тел.'`` and the phones; anything after ``u'доб'`` in the phone part
    is ignored.  Working hours come from ``self.__parse_time(item)``.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_OFFICE
    point.name = normalize_text(item('td:eq(0)').text())

    parts = item('td:eq(1)').text().split(u'тел.')
    point.address, point.place = split_address_place(parts[0])
    if len(parts) > 1:
        phones_part = parts[1].split(u'доб')[0]
        point.phones = normalize_phones(phones_part.split(','))

    point.time = self.__parse_time(item)
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_office(self, item):
    """Build an office ``Point`` from a details page.

    The name is the ``h1`` text; the address is in row 2, the working hours
    in row 8.  Phone candidates are collected from rows 5 and 6 (split on
    ``<br>``) and only entries starting with ``u'+'`` or ``u'тел'`` are
    passed to ``normalize_phones``.

    :returns: the filled ``Point``.
    """
    def looks_like_phone(text):
        return text.startswith((u'+', u'тел'))

    point = Point()
    point.prov = self.uid
    point.type = TYPE_OFFICE
    point.name = normalize_text(item('h1').text())
    point.address, point.place = split_address_place(item('tr:eq(2) td:eq(1)').text())

    candidates = []
    for selector in ('tr:eq(5) td:eq(1)', 'tr:eq(6) td:eq(1)'):
        cell_html = replace_br(item(selector).html(), ';;;')
        if cell_html:
            candidates.extend(map(strip, PQ(cell_html).text().split(';;;')))
    point.phones = normalize_phones(filter(looks_like_phone, candidates))

    point.time = normalize_time(item('tr:eq(8) td:eq(1)').text())
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_office(self, item, city):
    """Build an office ``Point`` from a table row of the given city.

    The header cell provides the name, street address and coordinates; the
    first data cell holds the working hours and the second the phones, both
    with ``<br>`` turned into ``', '``.  The coordinates are marked official
    when both are non-empty, otherwise a warning is issued.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_OFFICE
    point.name = normalize_text(item('th .pointShowMaps span:eq(0)').text())

    street = item('th .pointShowMaps span:eq(1)').text()
    full_address = u'г. %s, %s' % (city, street)
    point.address, point.place = split_address_place(full_address)

    hours_html = replace_br(item('td:eq(0)').html(), ', ')
    point.time = normalize_time(PQ(hours_html).text())
    contacts_html = replace_br(item('td:eq(1)').html(), ', ')
    point.phones = normalize_phones(PQ(contacts_html).text().split(','))

    point.lat = normalize_text(item('th .item_coords .coord1').text())
    point.lng = normalize_text(item('th .item_coords .coord2').text())
    point.check_information = CHECK_OFFICIAL

    has_coordinates = point.lat and point.lng
    if not has_coordinates:
        warning_not_official_coordinates(point)
    else:
        point.check_coordinates = CHECK_OFFICIAL
    return point
def __parse_base_atm_terminals(self, item, map_points, point_type, start_names):
    """Parse one ATM/terminal entry and look up its map coordinates.

    :param item: selector callable scoped to the entry markup.
    :param map_points: iterable of 5-tuples whose first two items are the
        ``YMaps.GeoPoint`` arguments in source order, followed by
        ``name``, ``address`` and ``place``.
    :param point_type: value stored in ``point.type``.
    :param start_names: prefix (or tuple of prefixes) the first word of the
        entry title must start with; map point names are also tested with
        ``name in start_names``.
    :returns: the filled ``Point``, or ``None`` when the title is empty or
        its first word does not start with ``start_names``.
    """
    title_tokens = item('.name').text().split()
    # An empty title cannot match any prefix; previously it raised IndexError.
    if not title_tokens or not title_tokens[0].startswith(start_names):
        return None

    point = Point()
    point.prov = self.uid
    point.type = point_type

    # The first word of the title is the kind of device, the rest the address.
    point.address, _ = split_address_place(' '.join(title_tokens[1:]))
    point.place = trim_spaces_and_commas(normalize_text(item('.addres strong').text()))
    point.check_information = CHECK_OFFICIAL

    # NOTE(review): the tuples are unpacked as (lng, lat, ...) to agree with
    # the office/exchange parser that consumes the same ``map_points``;
    # YMaps 1.x ``GeoPoint`` takes longitude first.  This block used to
    # unpack (lat, lng, ...), which swapped the two coordinates.
    for lng, lat, name, address, place in map_points:
        if name not in start_names:
            continue
        if not (point.address and address and point.address in address):
            continue
        if point.place and place and point.place not in place:
            continue
        point.lat = lat
        point.lng = lng
        point.check_coordinates = CHECK_OFFICIAL
        break
    else:
        warning_not_official_coordinates(point)
    return point
def __parse_base_offices_exchanges(self, item, point_type, keywords_names):
    """Parse one office/exchange entry into a ``Point``.

    :param item: selector callable scoped to the entry markup.
    :param point_type: value stored in ``point.type``.
    :param keywords_names: prefix (or tuple of prefixes) the name must start
        with.
    :returns: the filled ``Point``, or ``None`` when the name does not start
        with ``keywords_names``.

    The address is prefixed with ``u'г. <city>'`` when the city field is
    non-empty, otherwise with the text of the index field.
    """
    phone_prefix = u'тел.:'

    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('.first').text())
    if not point.name.startswith(keywords_names):
        return None

    city = item('.field-field-city').text()
    if city:
        city = u'г. ' + city
    else:
        city = item('.field-field-index').text()
    address = item('.field-field-adress').text()
    point.address, point.place = split_address_place(u'%s, %s' % (city, address))

    phone = item('.field-field-phone').text()
    if phone:
        # Slice the label off.  The previous code indexed a single character
        # (``phone[len(prefix)]``), so only one char reached
        # ``normalize_phone`` and a label-only value raised IndexError.
        point.phones = [normalize_phone(phone[len(phone_prefix):])]

    point.time = normalize_time(item('.field-field-work-time .field-item').text())
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_atm(self, item, city):
    """Build an ATM ``Point`` from a table row of the given city.

    The ``span`` elements inside the header link are removed and their text
    becomes the street address; the text left in the link is the place.
    The currency cell is rewritten with ``self.__currency_replaces`` before
    being split on commas.  Coordinates are marked official when both are
    non-empty, otherwise a warning is issued.

    :returns: the filled ``Point``.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_ATM

    # Order matters: the spans must be removed before the place is read.
    street = item('th .pointShowMaps span').remove().text()
    place_text = normalize_text(item('th .pointShowMaps').text())
    full_address = u'г. %s, %s' % (city, street)
    point.address, _ = split_address_place(full_address)
    point.place = place_text

    currency_text = item('td:eq(0)').text()
    for old_token, new_token in self.__currency_replaces:
        currency_text = currency_text.replace(old_token, new_token)
    point.currency = map(strip, currency_text.split(','))

    hours_html = replace_br(item('td:eq(1)').html(), ', ')
    point.time = normalize_time(PQ(hours_html).text())

    point.lat = normalize_text(item('th .item_coords .coord1').text())
    point.lng = normalize_text(item('th .item_coords .coord2').text())
    point.check_information = CHECK_OFFICIAL

    has_coordinates = point.lat and point.lng
    if not has_coordinates:
        warning_not_official_coordinates(point)
    else:
        point.check_coordinates = CHECK_OFFICIAL
    return point
def __get_address(self, city_name, short_address):
    """Prefix ``short_address`` with ``u'г. <city_name>'`` and split it.

    :returns: the result of ``split_address_place`` for the composed string.
    """
    full_address = u'г. %s, %s' % (city_name, short_address)
    return split_address_place(full_address)
def __parse_address(self, item):
    """Compose the address of a table row and split it into address/place.

    Cell 2 is the town and cell 3 the short address; both are normalized and
    joined as ``u'г. <town>, <short address>'``.

    :returns: the result of ``split_address_place`` for the composed string.
    """
    cells = [normalize_text(item(selector).text()) for selector in ("td:eq(2)", "td:eq(3)")]
    full_address = u"г. %s, %s" % tuple(cells)
    return split_address_place(full_address)
# get_offices: crawl the regions page, every region's offices page and every
# linked office-list page, and return the collected office points as a list.
#
# Flow, as far as it can be read from the statements below:
#   * Known coordinates are loaded once via self.__get_coordinates().
#   * Every fetched page is decoded from cp1251 before being parsed with PQ.
#   * For each link in '#ctl47_Panel_Viewer .rsf-content-menu' the region page
#     at self.site + href is loaded; each non-empty '#Print_2_ctl39 table' on
#     it links (third row) to an office-list page.
#   * A table whose first row contains u'Центральный офис' is handled by
#     self.__parse_office_main(coordinates).
#   * Otherwise the rows 'tr:gt(1)' of the office-list page are scanned.  A
#     row with at least four cells starts a new office: the pending office (if
#     any) is finalized (working hours joined from time_items, coordinates
#     looked up with self.__get_point_coordinate, CHECK_OFFICIAL or a warning)
#     and appended.  The new office is named u'№<first cell>'.
#   * The address HTML comes from 'td:eq(1) p:eq(0)', falling back to the whole
#     'td:eq(1)'; it is cut at '<img' and at u'Банкомат' when found, a trailing
#     u'А'/u'Б' is lower-cased, a trailing u'-а' becomes u'а', and the pairs of
#     self.__address_replaces are applied.
#   * Following rows are classified by the lower-cased text of their first
#     cell against self.__start_offices_keywords / self.__stop_offices_keywords
#     to toggle collection of '<cell 0>: <cell 1>' working-hour entries and to
#     extend the phones via self.__parse_phones.
#   * The last pending office is finalized the same way.
#
# NOTE(review): both `.find(...)` results are tested with `> 0`, so a match at
# index 0 does not truncate the address HTML -- confirm this is intended.
# NOTE(review): the inner row loop reuses the name `item`, rebinding the loop
# variable of the table loop.
# NOTE(review): this definition is stored collapsed onto two physical lines,
# so the exact nesting cannot be recovered from here; the code is left
# untouched and must be re-indented against the upstream file.
def get_offices(self): points = [] coordinates = self.__get_coordinates() regions_page = PQ(get_url(self.__regions_url).decode('cp1251')) for region_item in map(PQ, regions_page('#ctl47_Panel_Viewer .rsf-content-menu a')): region_url = self.site + region_item.attr('href') offices_page = PQ(get_url(region_url).decode('cp1251')) for item in map(PQ, offices_page('#Print_2_ctl39 table')): if not normalize_text(item.text()): continue is_main = u'Центральный офис' in item('tr:eq(0)').text() offices_url = self.site + '/' + item('tr:eq(2) a').attr('href') page = PQ(get_url(offices_url).decode('cp1251')) if is_main: point = self.__parse_office_main(coordinates) if point: points.append(point) else: point = None start_times = False time_items = [] for item in map(PQ, page('#Print_2_ctl39 tr:gt(1)')): if len(item('td')) >= 4: if point: point.time = normalize_time(', '.join(time_items)) point.lat, point.lng = self.__get_point_coordinate(point.address, coordinates) if point.lat and point.lng: point.check_coordinates = CHECK_OFFICIAL else: warning_not_official_coordinates(point) points.append(point) point = None start_times = False time_items = [] name = normalize_text(item('td:eq(0)').text()) if not name: continue point = Point() point.prov = self.uid point.type = TYPE_OFFICE point.check_information = CHECK_OFFICIAL point.name = u'№%s' % name address_html = item('td:eq(1) p:eq(0)').html() if address_html: address_html = address_html.strip() if not address_html: address_html = item('td:eq(1)').html() image_tag_start = address_html.find('<img') if image_tag_start > 0: address_html = address_html[:image_tag_start] atm_text_start = address_html.find(u'Банкомат') if atm_text_start > 0: address_html = address_html[:atm_text_start] point.address, point.place = split_address_place(PQ(address_html).text().split(';;;')[0]) if point.address.endswith(u'А'): point.address = point.address[:-1] + u'а' if point.address.endswith(u'Б'): point.address = point.address[:-1] + u'б' if 
point.address.endswith(u'-а'): point.address = point.address[:-2] + u'а' for from_token, to_token in self.__address_replaces: point.address = point.address.replace(from_token, to_token) item('td:eq(0), td:eq(1)').remove() next_sub_item = normalize_text(item('td:eq(0)').text()).lower() if not next_sub_item.startswith(self.__start_offices_keywords) and\ not next_sub_item.startswith(self.__stop_offices_keywords): start_times = True if start_times and len(item('td')) >= 2: time_items.append(u'%s: %s' % (item('td:eq(0)').text(), item('td:eq(1)').text())) point.phones = self.__parse_phones(item) if not point: continue item_text = normalize_text(item.text()) if not item_text or item_text.startswith(u'г.'): continue for sub_item in map(PQ, item('td')): if not normalize_text(sub_item.text()): sub_item.remove() next_sub_item = normalize_text(item('td:eq(0)').text()).lower() if start_times and next_sub_item.startswith(self.__stop_offices_keywords): start_times = False continue if not start_times and next_sub_item.startswith(self.__start_offices_keywords): start_times = True point.phones += self.__parse_phones(item) continue if start_times and len(item('td')) >= 2: time_items.append(u'%s: %s' % (item('td:eq(0)').text(), item('td:eq(1)').text())) continue if point: point.time = normalize_time(', '.join(time_items)) point.lat, point.lng = self.__get_point_coordinate(point.address, coordinates) if point.lat and point.lng: point.check_coordinates = CHECK_OFFICIAL else: warning_not_official_coordinates(point) points.append(point) return points