def task_collect_adv_data(self, grab, task):
    """Collect the adverts linked from a listing page and store new ones.

    Every ``adv-<digits>.<ext>`` link found in ``grab.doc`` is downloaded
    and parsed into an ``Advert`` plus, for flat / house / lot categories,
    an extra details object.  An advert already stored under the same
    category, author, city and link is not saved again; only its
    ``date_of_update`` is refreshed when it is older than 20 hours.

    :param grab: Grab object holding the downloaded listing page.
    :param task: task whose ``addition`` dict carries the ``category`` key.
    """
    def row_text(doc, label):
        # Text selected from the info table for the row containing ``label``.
        return doc.select(
            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
            % label).text()

    def row_value(doc, label):
        # Row text with the leading "<label> " caption removed.
        return re.sub(label + u' ', "", row_text(doc, label))

    def total_area(doc):
        # First run of digits in the "total area" row, or None.
        found = re.search(r'\d+', row_value(doc, u'Общая площадь'))
        return found.group() if found else None

    self.stats['taken'] += 1
    hrefs = [node.text() for node in grab.doc.select('//@href')]
    # NOTE(review): only every third match is kept, presumably because each
    # advert is linked three times on the listing page -- confirm.
    url_adv = re.findall(r'adv-\d+\.\w+', ','.join(hrefs))[0::3]

    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        # Reset for every advert: otherwise a category without extras hits
        # an unbound name, or re-attaches the previous advert's extras.
        extra_object = None

        g = Grab()
        g.go(DOMEN + one_adv)

        advert = Advert()
        advert.category_id = CATEGORIES[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = DOMEN + one_adv

        categories = g.doc.select(
            '//table[@class="adv_info_table"]').text()

        title = g.doc.select('//h2[@class="pagetitle"]').text()
        if title:
            advert.title = title

        price = g.doc.select('//td[@class="adv-price"]').text()
        price_digits = ''.join(re.findall(r'\d+', price)) if price else ''
        # A missing price, or one without any digit, is stored as 1.
        advert.price_uah = int(price_digits) if price_digits else 1

        if u"Описание:" in categories:
            text = g.doc.select(
                '//td[@style="border-bottom:none;"][@colspan="2"]').text()
            if text:
                advert.main_text = text

        if u"Телефон:" in categories:
            phones = re.sub(u'Телефон:', "", row_text(g.doc, u'Телефон'))
            if phones:
                advert.raw_phones = re.sub(r'\-', "", phones)

        if u"Имя, фамилия:" in categories:
            contact = re.sub(u'Имя, фамилия:', "", row_text(g.doc, u'Имя'))
            if contact:
                advert.contact_name = contact

        if advert.category_id in [21, 11, 12]:
            extra_object = ExtraFlat()
            if u"Этажность" in categories:
                floors = row_value(g.doc, u'Этажность')
                if floors:
                    extra_object.floors = floors
            # NOTE(review): u"Этаж" is also a prefix of u"Этажность", so
            # this test and row lookup may hit the "floors" row -- confirm.
            if u"Этаж" in categories:
                floor = row_value(g.doc, u'Этаж')
                if floor:
                    extra_object.floor = floor
            if u"Количество комнат" in categories:
                rooms_number = row_value(g.doc, u'Количество комнат')
                if rooms_number:
                    extra_object.rooms_number = rooms_number
            if u"Общая площадь" in categories:
                area = total_area(g.doc)
                if area:
                    extra_object.total_area = area

        if advert.category_id in [14]:
            extra_object = ExtraHouse()
            if u"Этажность" in categories:
                floors = row_value(g.doc, u'Этажность')
                if floors:
                    extra_object.floors = floors
            if u"Общая площадь" in categories:
                area = total_area(g.doc)
                if area:
                    extra_object.total_area = area

        if advert.category_id in [16]:
            extra_object = ExtraLot()
            if u"Общая площадь" in categories:
                area = total_area(g.doc)
                if area:
                    extra_object.total_area = area

        # Marker-based detection first; an explicit table row overrides it.
        if self.metro_marker:
            metroc = advert.detect_metro_id(self.metro_marker)
            if metroc:
                advert.metro_id = metroc
        if u"Метро" in categories:
            metro = row_value(g.doc, u'Метро')
            if metro:
                advert.metro_id = METRO_PREM[metro]

        if self.sublocality_marker:
            subloc = advert.detect_sublocality_id(self.sublocality_marker)
            if subloc:
                advert.sublocality_id = subloc
        if u"Район" in categories:
            subloc = row_value(g.doc, u'Район')
            if subloc:
                advert.sublocality_id = SUB_PREM[subloc]

        same_adv = Advert.objects.filter(
            category_id=CATEGORIES[addition['category']],
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = [node.text() for node in g.doc.select(
            '//a[@data-lightbox="advertisement-images"]/@href')]
        for photo_path in img:
            photo_link = '%s%s' % (DOMEN2, photo_path)
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
            except GrabNetworkError:
                # Photos are best effort: skip the ones that fail to load.
                continue
            response = photo_grab.response
            if response.code != 200 or not re.match(
                    'image/', response.headers['Content-Type']):
                continue
            photo_url = photo_grab.config['url']
            extention = RE_EXTENTION.search(photo_url).group()
            try:
                name_hash = hashlib.md5(photo_url).hexdigest()
            except UnicodeEncodeError:
                # Fall back to the 8-digit id embedded in the photo path.
                fallback = re.search(r'\d{8}', photo_path)
                if fallback is None:
                    continue
                name_hash = hashlib.md5(fallback.group()).hexdigest()
            photo = Photo(advert_id=advert.id)
            photo.photo.save('%s.%s' % (name_hash, extention),
                             ContentFile(response.body))

        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def task_collect_adv_data(self, grab, task):
    """Collect the adverts linked from a listing page and store new ones.

    Links starting with ``/nedvizhimost/<city>-`` are downloaded and parsed
    into an ``Advert`` plus, for flat / house / lot categories, an extra
    details object.  An advert already stored under the same category,
    author, city and link is not saved again; only its ``date_of_update``
    is refreshed when it is older than 20 hours.

    :param grab: Grab object holding the downloaded listing page.
    :param task: task whose ``addition`` dict carries the ``category`` key.
    """
    def row_text(doc, label):
        # Text selected from the summary block for the <p> with ``label``.
        return doc.select(
            "//div[@class='kratkost']/p[contains(.,'%s')]" % label).text()

    def after_colon(text):
        # Everything that follows the first ": " separator.
        return text[text.find(":") + 2:]

    def total_area(doc):
        # Leading token of the value in the "total area" row.
        full_area = after_colon(row_text(doc, u'Общая'))
        return full_area[:full_area.find(" ")]

    # Metro names whose ids are hard-coded instead of taken from METRO_CIUA.
    metro_overrides = {
        u'Дворец Спорта': 76,
        u'Дружбы Народов': 79,
        u'Красный Хутор': 87,
        u'Демеевская': 66,
        u'Советской армии': 21,
        u'Маршала Жукова': 12,
        u'Метростроителей им. Ващенко': 13,
        u'им. А.С. Масельского': 9,
    }

    self.stats['taken'] += 1
    city_slug = DOMEN[self.city][29:]
    site_root = DOMEN[self.city][:15]
    if city_slug == 'kharkov':
        # The site spells this city differently inside advert paths.
        prefix = '/nedvizhimost/xarkov-'
    else:
        prefix = '/nedvizhimost/%s-' % city_slug
    hrefs = [node.text() for node in grab.doc.select('//@href')]
    url_adv = [href for href in hrefs if href.startswith(prefix)]

    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        # Reset for every advert: otherwise a category without extras hits
        # an unbound name, or re-attaches the previous advert's extras.
        extra_object = None

        g = Grab()
        g.go(site_root + one_adv)

        advert = Advert()
        advert.category_id = CATEGORIES[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = site_root + one_adv

        categories = g.doc.select(
            '//div[@id="content_objectTabWidgetinfo_tab"]').text()

        title = g.doc.select('//h1').text()
        if title:
            advert.title = title

        price_text = g.doc.select('//p[@itemprop="average"]').text()
        if price_text:
            price = ''.join(ch for ch in price_text if ch in '0123456789')
            # A price block without any digit no longer crashes int().
            if price:
                advert.price_uah = int(price)

        text = g.doc.select('//div[@class="objava_define"]').text()
        if text:
            advert.main_text = text

        phones = g.doc.select('//p[@class="tel_user_obj tel"]').text()
        if phones:
            advert.raw_phones = phones

        contact = g.doc.select('//a[@class="ceeboxAuto"]').text()
        if contact:
            advert.contact_name = contact

        if advert.category_id in [21, 11, 24, 27, 17]:
            extra_object = ExtraFlat()
            if u"Этаж" in categories:
                # The value is read as "<floor>/<floors>".
                both = after_colon(row_text(g.doc, u'Этаж'))
                separator2 = both.find("/")
                extra_object.floors = both[separator2 + 1:]
                extra_object.floor = both[:separator2]
            if u"Комнат" in categories:
                rooms_row = row_text(g.doc, u'Комнат')
                separator = rooms_row.find(":")
                # Only the first character after ": " is kept.
                extra_object.rooms_number = rooms_row[
                    separator + 2:separator + 3]
            if u"Общая" in categories:
                extra_object.total_area = total_area(g.doc)

        if advert.category_id in [14]:
            extra_object = ExtraHouse()
            if u"Этажей" in categories:
                extra_object.floors = row_text(g.doc, u'Этажей')[8:]
            if u"Общая" in categories:
                extra_object.total_area = total_area(g.doc)

        if advert.category_id in [16, 26]:
            extra_object = ExtraLot()
            if u"Площадь" in categories:
                extra_object.total_area = after_colon(
                    row_text(g.doc, u'Площадь'))
            # Each marker needs its own membership test: the old
            # ``a or b in categories`` form was always true.
            if (u"Под ком. заст." in categories
                    or u"Под жил. заст." in categories):
                extra_object.intended_purpose = row_text(g.doc, u'Под')

        if u"Метро" in categories:
            metro_row = row_text(g.doc, u'Метро')
            separator = metro_row.find("-")
            metro = metro_row[7:separator - 2]
            if metro in metro_overrides:
                advert.metro_id = metro_overrides[metro]
            else:
                advert.metro_id = METRO_CIUA[metro]

        summary = g.doc.select("//div[@class='kratkost']").text()
        if summary:
            # The district is the text between the first two commas; for
            # Kharkov a longer prefix is skipped after the first comma.
            offset = 11 if city_slug == 'kharkov' else 2
            fulladress = summary[summary.find(",") + offset:]
            subloc = fulladress[:fulladress.find(",")]
            advert.sublocality_id = SUB_CIUA[subloc]

        same_adv = Advert.objects.filter(
            category_id=CATEGORIES[addition['category']],
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = [node.text() for node in g.doc.select('//@href')
               if node.text().startswith("/pic/objects/")]
        for photo_path in img:
            photo_link = '%s%s' % (site_root, photo_path)
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
            except GrabNetworkError:
                # Photos are best effort: skip the ones that fail to load.
                continue
            response = photo_grab.response
            if response.code != 200 or not re.match(
                    'image/', response.headers['Content-Type']):
                # Nothing usable was downloaded (used to raise IndexError).
                continue
            photo_url = photo_grab.config['url']
            extention = RE_EXTENTION.search(photo_url).group()
            try:
                name_hash = hashlib.md5(photo_url).hexdigest()
            except UnicodeEncodeError:
                # Fall back to a fixed slice of the photo path.
                name_hash = hashlib.md5(photo_path[22:54]).hexdigest()
            photo = Photo(advert_id=advert.id)
            photo.photo.save('%s.%s' % (name_hash, extention),
                             ContentFile(response.body))

        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def task_collect_adv_data(self, grab, task):
    """Collect the house adverts linked from a listing page.

    Every ``/offers/<digits>`` link found in ``grab.doc`` is downloaded and
    parsed into an ``Advert`` with an ``ExtraHouse`` details object.  An
    advert already stored under the same category, author, city and link
    is not saved again; only its ``date_of_update`` is refreshed when it is
    older than 20 hours.

    :param grab: Grab object holding the downloaded listing page.
    :param task: task whose ``addition`` dict carries the ``category`` key.
    """
    def labelled_number(text, label):
        # Digits of the first "<label> - <number>" fragment, or None.
        found = re.search(label + u'\s+\-\s+\d+', text)
        if found is None:
            return None
        return re.search(r'\d+', found.group()).group()

    self.stats['taken'] += 1
    hrefs = [node.text() for node in grab.doc.select('//@href')]
    # NOTE(review): only every third match is kept, presumably because each
    # advert is linked three times on the listing page -- confirm.
    url_adv = re.findall(r'/offers/\d+', ''.join(hrefs))[::3]

    for one_adv in url_adv:
        extra_object = ExtraHouse()
        self.stats['processed'] += 1
        addition = task.get('addition')

        g = Grab()
        g.go(DOMEN + one_adv)

        advert = Advert()
        advert.category_id = CATEGORIES[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = DOMEN + one_adv

        title = g.doc.select('//article[@class="article"]/h1').text()
        if title:
            advert.title = title[:90]

        main_text = g.doc.select('//div[@class="box"]/p').text()
        if main_text:
            advert.main_text = main_text

        advert.sublocality_id = advert.detect_sublocality_id(
            self.sublocality_marker)
        advert.metro_id = advert.detect_metro_id(self.metro_marker)

        phone = g.doc.select('//div[@class="col-xs-6 col-md-5"]').text()
        if phone:
            advert.raw_phones = '0' + ''.join(re.findall(r'\d+', phone))

        price_text = g.doc.select(
            '//div[@class="col-xs-12 col-md-8"]').text()
        if price_text:
            price_digits = ''.join(re.findall(r'\d+', price_text))
            # A price block without any digit no longer crashes int().
            if price_digits:
                advert.price_uah = int(price_digits)

        # TODO: continue here (unfinished-work marker kept from the
        # original author).
        details = g.doc.select('//td[@valign="top"][@width="100%"]').text()
        # Strip the title and description so that numbers inside them are
        # not taken for house parameters; unset fields are left alone.
        if advert.title:
            details = details.replace(advert.title, '')
        if advert.main_text:
            details = details.replace(advert.main_text, '')

        house_area = labelled_number(details, u'Площадь дома')
        if house_area is not None:
            extra_object.total_area = house_area
        lot_area = labelled_number(details, u'Площадь участка')
        if lot_area is not None:
            extra_object.lot_area = lot_area
        floors = labelled_number(details, u'Этажей')
        if floors is not None:
            extra_object.floors = floors

        same_adv = Advert.objects.filter(
            category_id=CATEGORIES[addition['category']],
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        img = [node.text() for node in g.doc.select(
            '//div[@class="thumb"]/a/@href')]
        for photo_link in img:
            q = Grab()
            sleep(0.2)
            try:
                q.go(photo_link)
            except GrabNetworkError:
                # Photos are best effort: skip the ones that fail to load.
                continue
            response = q.response
            if response.code != 200 or not re.match(
                    'image/', response.headers['Content-Type']):
                continue
            photo_url = q.config['url']
            file_name = '%s.%s' % (
                hashlib.md5(photo_url).hexdigest(),
                RE_EXTENTION.search(photo_url).group(),
            )
            photo = Photo(advert_id=advert.id)
            photo.photo.save(file_name, ContentFile(response.body))

        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def task_collect_adv_data(self, grab, task):
    """Parse a single advert page and store it with extras and photos.

    Pages without a phone link are only counted in
    ``self.stats['without_phone']``.  Otherwise an ``Advert`` is filled
    from the page, category-specific details go into an extra object
    (flat / house / lot / commercial) and, for rent categories, an
    ``ExtraRent`` object; photos are downloaded with one retry pass.

    :param grab: Grab object holding the downloaded advert page.
    :param task: task whose ``addition`` dict carries the ``category`` key.
    """
    def item(label):
        # Selection of the parameter table(s) mentioning ``label``.
        return grab.doc.select(
            "//table[@class='item'][contains(., '%s')]" % label)

    def value_text(found):
        # Text of the value cell inside a parameter table selection.
        return found.select(".//td[@class='value']").text()

    def digits_of(label):
        # RE_DIGIT match within the value of ``label``, None if no table.
        found = item(label)
        if found:
            return RE_DIGIT.search(value_text(found)).group()
        return None

    def fetch_photo(photo_grab, src):
        # Download ``src``; return the photo dict, or None if not an image.
        photo_grab.go(src)
        response = photo_grab.response
        if response.code == 200 and re.match(
                'image/', response.headers['Content-Type']):
            return {
                'body': response.body,
                'extention': RE_EXTENTION.search(
                    photo_grab.config['url']).group(),
            }
        return None

    sleep(1)
    self.stats['taken'] += 1
    # If there is no phone it doesn't make sense to take other data.
    if not grab.doc.select("//li[%s]" % xpcs('link-phone')):
        self.stats['without_phone'] += 1
        return

    addition = task.get('addition')
    category = addition['category']

    advert = Advert()
    advert.category_id = CATEGORIES[category]
    advert.city_id = self.city_id
    advert.author_id = self.author_id
    advert.link = RE_ADV_LINK.search(grab.config['url']).group()
    advert.title = grab.doc.select("//h1").text()

    price = grab.doc.select("//div[%s]/strong" % xpcs('pricelabel')).text()
    # The other currency is derived through the USD_UAH rate.
    if u'грн' in price:
        advert.price_uah = int(RE_NON_DIGIT.sub('', price))
        advert.price_usd = advert.price_uah / USD_UAH
    else:
        advert.price_usd = int(RE_NON_DIGIT.sub('', price))
        advert.price_uah = advert.price_usd * USD_UAH

    # think about location
    advert.main_text = grab.doc.select("//div[@id='textContent']/p").text()
    address = grab.doc.select(
        "//span[%s]/strong" % xpcs('show-map-link')).text().split(',')
    if len(address) > 3:
        advert.street = address[3]
    if self.city_id == 8 and len(address) >= 3:
        kiev_big_subloc = BigSublocality.objects.filter(
            name__startswith=address[2].strip()).first()
        if kiev_big_subloc:
            advert.big_sublocality_id = kiev_big_subloc.id

    extra_action = None
    # Stays None when the category id belongs to none of the groups below
    # (the name used to be unbound in that case).
    extra_object = None
    if advert.category_id in (21, 22, 24, 26, 27):
        extra_action = ExtraRent()
    if advert.category_id in (11, 12, 21, 22):
        extra_object = ExtraFlat()
    if advert.category_id in (14, 24):
        extra_object = ExtraHouse()
    if advert.category_id in (16, 26):
        extra_object = ExtraLot()
    if advert.category_id in (17, 27):
        extra_object = ExtraCommercial()

    if category in ('arenda-kvartir', 'prodazha-kvartir',
                    'prodazha-komnat'):
        rooms_number = digits_of(u'Количество комнат')
        if rooms_number is not None:
            extra_object.rooms_number = rooms_number
        total_area = digits_of(u'Общая площадь')
        if total_area is not None:
            extra_object.total_area = total_area
        floor = digits_of(u'Этаж')
        if floor is not None:
            extra_object.floor = floor
        floors = digits_of(u'Этажность дома')
        if floors is not None:
            extra_object.floors = floors

    if 'arenda' in category and extra_action is not None:
        rent_item = item(u'Тип аренды')
        # A missing rent type is treated as long-term rent.
        if rent_item:
            rent_term = value_text(rent_item)
        else:
            rent_term = u'Долгосрочная аренда'
        extra_action.term = 2 if u'Долгосрочная' in rent_term else 1

    if category == 'arenda-komnat':
        rooms_number = digits_of(u'Всего комнат')
        if rooms_number is not None:
            extra_object.rooms_number = rooms_number

    if category in ('arenda-domov', 'prodazha-domov'):
        total_area = digits_of(u'Площадь дома')
        if total_area is not None:
            extra_object.total_area = total_area

    if category == 'prodazha-domov':
        house_item = item(u'Тип дома')
        house_type = value_text(house_item) if house_item else None
        extra_object.house_type = 2 if house_type == u'Продажа дач' else 1

    if category == 'prodazha-kvartir':
        building_item = item(u'Тип квартиры')
        building = value_text(building_item) if building_item else ''
        if u'Новостройки' in building:
            extra_object.new_building = True

    if category == 'prodazha-zemli':
        purpose_item = item(u'Тип участка')
        lot_purpose = value_text(purpose_item) if purpose_item else ''
        if u'сад / огород' in lot_purpose:
            extra_object.intended_purpose = 'садоводство'
        elif u'индивидуальное строительство' in lot_purpose:
            extra_object.intended_purpose = 'под застройку'
        elif u'сельскохозяйственного назначения' in lot_purpose:
            extra_object.intended_purpose = \
                u'ОСГ(особисте селянське господарство)'
        elif u'промышленного назначения' in lot_purpose:
            extra_object.intended_purpose = u'коммерческого назначения'
        lot_area = digits_of(u'Площадь')
        # NOTE(review): only the unit is stored; the parsed area itself is
        # never written to extra_object -- confirm this is intended.
        if lot_area:
            extra_object.lot_unit = u'соток'

    photo_links = grab.doc.select("//img[%s]" % xpcs('bigImage'))
    photos = []
    if photo_links:
        photo_grab = grab.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        sleep(0.2)
        retry_links = []
        for photo_link in photo_links:
            try:
                fetched = fetch_photo(photo_grab, photo_link.attr('src'))
            except GrabNetworkError:
                retry_links.append(photo_link)
                continue
            if fetched:
                photos.append(fetched)
        # One more try for the photos that failed; a second network error
        # now drops only that photo instead of aborting the whole advert.
        for photo_link in retry_links:
            try:
                fetched = fetch_photo(photo_grab, photo_link.attr('src'))
            except GrabNetworkError:
                continue
            if fetched:
                photos.append(fetched)

    phone_raw = self.take_phone(grab)
    phone_in_text = advert.detect_phone()
    if phone_raw and phone_in_text:
        phone_raw = phone_raw + ',' + ','.join(phone_in_text)
    elif phone_in_text:
        phone_raw = ','.join(phone_in_text)
    if phone_raw:
        advert.raw_phones = phone_raw

    subloc = None
    sub_if = grab.doc.select("//strong[@class='c2b small']").text()
    # The last SUB_IF key found in the location text wins.
    for sub_one in SUB_IF:
        if sub_one in sub_if:
            subloc = sub_one
    if subloc is not None:
        advert.sublocality_id = int(SUB_IF[subloc])
    else:
        advert.sublocality_id = advert.detect_sublocality_id(
            self.sublocality_marker)
    if self.metro_marker:
        advert.metro_id = advert.detect_metro_id(self.metro_marker)

    advert.save()
    self.stats['saved'] += 1

    for i, img in enumerate(photos):
        photo = Photo(advert_id=advert.id)
        file_name = '%s.%s' % (
            hashlib.md5(grab.config['referer'] + str(i)).hexdigest(),
            img['extention'])
        photo.photo.save(file_name, ContentFile(img['body']))

    if extra_object is not None and self.extra_has_values(extra_object):
        extra_object.advert = advert
        extra_object.save()
    if extra_action:
        extra_action.advert = advert
        extra_action.save()
def task_collect_adv_data(self, grab, task):
    """Collect the adverts linked from a listing page and store new ones.

    Each distinct ``view.php?ad_id=...`` link found in ``grab.doc`` is
    processed: the phone is read from the ``print_`` version of the page,
    everything else from the regular page.  An advert already stored under
    the same category, author, city and link is not saved again; only its
    ``date_of_update`` is refreshed when it is older than 20 hours.

    :param grab: Grab object holding the downloaded listing page.
    :param task: task whose ``addition`` dict carries the ``category`` key.
    """
    def row_text(doc, label):
        # Text selected from the parameter block for the <p> with ``label``.
        return doc.select(
            "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
            % label).text()

    self.stats['taken'] += 1
    site_root = DOMEN[:13]
    url_adv = []
    for node in grab.doc.select('//@href'):
        href = node.text()
        if href.startswith('view.php?ad_id=') and href not in url_adv:
            url_adv.append(href)

    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        # Reset for every advert: otherwise a category without extras hits
        # an unbound name, or re-attaches the previous advert's extras.
        extra_object = None

        advert = Advert()
        if self.city == 'kharkov':
            advert.category_id = CATEGORIES_khar[addition['category']]
        else:
            advert.category_id = CATEGORIES_kiev[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = site_root + one_adv

        g = Grab()
        # The phone is taken from the print version of the advert.
        g.go(site_root + "print_" + one_adv)
        phone_row = g.doc.select(
            "//td/p[contains(.,'%s')]" % u'Тел:').text()
        if phone_row:
            advert.raw_phones = phone_row[5:]

        g.go(site_root + one_adv)
        categories = g.doc.select(
            '//div[@style="font-size: 11px;"]').text()

        title = g.doc.select('//h1').text()
        if title:
            advert.title = title

        price_text = g.doc.select('//p[@class="ad-price"]').text()
        if price_text:
            price = ''.join(ch for ch in price_text if ch in '0123456789')
            if price:
                if u'грн' in price_text:
                    advert.price_uah = int(price)
                else:
                    advert.price_usd = int(price)

        text = g.doc.select('//p[@class="ad-desc"]').text()
        if text:
            advert.main_text = text

        if advert.category_id in [21, 11, 27, 17]:
            extra_object = ExtraFlat()
            if u"Этаж" in categories:
                # The value is read as "<floor> / <floors>".
                floor_row = row_text(g.doc, u'Этаж')
                both = floor_row[floor_row.find(":") + 2:]
                separator2 = both.find("/")
                extra_object.floors = both[separator2 + 2:]
                extra_object.floor = both[:separator2 - 1]
            if u"Комнат" in categories:
                # Last character of the contacts paragraph.
                extra_object.rooms_number = g.doc.select(
                    '//p[@class="ad-contacts"]').text()[-1]
            if u"Общая" in categories:
                area_row = row_text(g.doc, u'Общая площадь')
                # Drops the 6-character tail after the number.
                extra_object.total_area = area_row[
                    area_row.find(":") + 2:-6]

        if advert.category_id in [14, 24]:
            extra_object = ExtraHouse()
            if u"Этажность" in categories:
                extra_object.floors = row_text(g.doc, u'Этаж')[-1]
            if u"Площадь дома" in categories:
                area_row = row_text(g.doc, u'Площадь дома')
                extra_object.total_area = area_row[
                    area_row.find(":") + 2:-6]
            if u"Площадь участка" in categories:
                lot_row = row_text(g.doc, u'Площадь участка')
                extra_object.lot_area = lot_row[lot_row.find(":") + 2:]

        if advert.category_id in [16]:
            extra_object = ExtraLot()
            if u"Площадь участка" in categories:
                lot_row = row_text(g.doc, u'Площадь участка')
                extra_object.total_area = lot_row[lot_row.find(":") + 2:]
            if u"Под строительство" in categories:
                extra_object.intended_purpose = g.doc.select(
                    "//ul[@style='list-style-type: none']"
                    "/li[contains(.,'%s')]" % u'Под').text()

        heading = g.doc.select("//h3").text()
        if heading:
            if "," in heading:
                subloc = heading
            else:
                # First word only.  str.find() returned -1 for a one-word
                # heading, which silently cut off its last character.
                subloc = heading.split(" ", 1)[0]
            advert.sublocality_id = SUB_FN[subloc]

        advert.metro_id = advert.detect_metro_id(self.metro_marker)

        same_adv = Advert.objects.filter(
            category_id=advert.category_id,
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = [node.text() for node in g.doc.select('//@href')
               if node.text().startswith("./upload/pics/")]
        for photo_path in img:
            photo_link = '%s%s' % (site_root, photo_path)
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
            except GrabNetworkError:
                # Photos are best effort: skip the ones that fail to load.
                continue
            response = photo_grab.response
            if response.code != 200 or not re.match(
                    'image/', response.headers['Content-Type']):
                # Nothing usable was downloaded (used to raise IndexError).
                continue
            photo_url = photo_grab.config['url']
            extention = RE_EXTENTION.search(photo_url).group()
            try:
                name_hash = hashlib.md5(photo_url).hexdigest()
            except UnicodeEncodeError:
                # Fall back to the file name without directory and suffix.
                name_hash = hashlib.md5(photo_path[14:-4]).hexdigest()
            photo = Photo(advert_id=advert.id)
            photo.photo.save('%s.%s' % (name_hash, extention),
                             ContentFile(response.body))

        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def create_advert(self, raw):
    """Store one advert described by the ``raw`` dict, with its photos.

    Only sale (``advert_type_id`` 1) and rent (3, 4) adverts for flats and
    houses are supported.  Any other combination returns without saving;
    it used to fail with a NameError because neither the title template
    nor the category was ever assigned.

    :param raw: dict describing the advert.  ``advert_type_id``,
        ``realty_type_name`` and ``photos`` are read unconditionally, all
        other keys only when present.
    """
    self.stats['taken'] += 1
    self.stats['processed'] += 1

    if raw['advert_type_id'] in [1]:
        deal = 'sale'
    elif raw['advert_type_id'] in [3, 4]:
        deal = 'rent'
    else:
        deal = None
    if raw['realty_type_name'] in [u'квартира', u'Квартира']:
        realty = 'flat'
    elif raw['realty_type_name'] in [u'дом', u'Дом']:
        realty = 'house'
    else:
        realty = None

    # (deal, realty) -> (CATEGORIES key, title template)
    kinds = {
        ('sale', 'flat'): ('prodazha-kvartir', u'Продажа квартиры %s'),
        ('sale', 'house'): ('prodazha-domov', u'Продажа дома %s'),
        ('rent', 'flat'): ('arenda-kvartir', u'Аренда квартиры %s'),
        ('rent', 'house'): ('arenda-domov', u'Аренда дома %s'),
    }
    if (deal, realty) not in kinds:
        # Unsupported advert: decided before any network request is made.
        return
    category_key, titles = kinds[(deal, realty)]

    # general fields
    adv = Advert()
    adv.city_id = self.city_id
    adv.author_id = self.author_id
    adv.category_id = CATEGORIES[category_key]
    if 'priceArr' in raw:
        if raw['priceArr']['3']:
            adv.price_uah = int(re.sub(r'\s', '', raw['priceArr']['3']))
        else:
            adv.price_uah = 1
    if 'description' in raw:
        adv.main_text = raw['description']
    if 'user' in raw:
        adv.contact_name = raw['user']['name']
    if 'user_id' in raw:
        # The phone is taken from the user's own page.
        p = Grab()
        p.go(URLS_ID % raw['user_id'])
        main = p.doc.select("//li[@class='fieldWrap']").text()
        adv.raw_phones = re.sub(r'\s|\(|\)|\-', '', main)
    if 'street_name' in raw:
        adv.street = raw['street_name']
    if 'beautiful_url' in raw:
        adv.link = (LINK_DOM % raw['beautiful_url'])

    # category specific fields: raw key -> extra object attribute
    if realty == 'flat':
        extra_object = ExtraFlat()
        extra_fields = (
            ('floors_count', 'floors'),
            ('floor', 'floor'),
            ('rooms_count', 'rooms_number'),
            ('total_square_meters', 'total_area'),
        )
    else:
        extra_object = ExtraHouse()
        extra_fields = (('total_square_meters', 'total_area'),)
    for raw_key, attr_name in extra_fields:
        if raw_key in raw:
            setattr(extra_object, attr_name, raw[raw_key])

    # The title names the district, else the street, else the city.
    if 'district_name' in raw:
        adv.sublocality_id = int(SUB_CIUA[raw['district_name']])
        adv.title = (titles % adv.sublocality.name)
    elif 'street_name' in raw:
        adv.title = (titles % raw['street_name'])
    else:
        adv.title = (titles % self.city)

    if self.metro_marker:
        if 'metro_station_name' in raw:
            adv.metro_id = METRO_CIUA[raw['metro_station_name']]

    adv.save()

    photo_grab = Grab().clone()
    photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
    for key in raw['photos']:
        key_photo = raw['photos'][key]['file']
        photo_link = (PHOTO_URL % (key_photo.replace('.', 'f.')))
        sleep(0.2)
        try:
            photo_grab.go(photo_link)
        except GrabNetworkError:
            # Photos are best effort: skip the ones that fail to load (the
            # old handler referenced an undefined list and raised instead).
            continue
        response = photo_grab.response
        if response.code != 200 or not re.match(
                'image/', response.headers['Content-Type']):
            continue
        photo_url = photo_grab.config['url']
        file_name = '%s.%s' % (
            hashlib.md5(photo_url).hexdigest(),
            RE_EXTENTION.search(photo_url).group())
        photo = Photo(advert_id=adv.id)
        photo.photo.save(file_name, ContentFile(response.body))

    if extra_object:
        extra_object.advert = adv
        extra_object.save()
    self.stats['saved'] += 1
def task_collect_adv_data(self, grab, task):
    """Scrape every advert linked from a listing page and store new ones.

    All ``href`` values of the page held by ``grab`` are collected and the
    ones shaped like ``view.<ext>?<param>=<digits>`` are treated as advert
    links.  Each advert page is downloaded and parsed into an ``Advert``.
    If an advert with the same category, author, city and link is already
    stored, nothing new is saved; its ``date_of_update`` is refreshed when
    it is older than 20 hours.  Otherwise the advert, its photos and its
    category-specific extra object are saved.

    Args:
        grab: the fetched listing page; only ``grab.doc`` is read.
        task: task object; ``task.get('addition')['category']`` is the key
            looked up in ``CATEGORIES``.
    """
    self.stats['taken'] += 1
    hrefs = [node.text() for node in grab.doc.select('//@href')]
    url_adv = re.findall(r'view\.\w+\?\w+\=\d+', ','.join(hrefs))

    for one_adv in url_adv:
        self.stats['processed'] += 1
        category_id = CATEGORIES[task.get('addition')['category']]
        page_url = DOMEN[self.city] + one_adv

        g = Grab()
        g.go(page_url)

        advert = Advert()
        advert.category_id = category_id
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        # The dot is escaped so that only a literal "www." is removed.
        advert.link = re.sub(r'www\.', "", page_url)

        title = g.doc.select('//div[@class="page-header"]').text()
        if title:
            advert.title = title[:90]
        text = g.doc.select('//p[@style="margin-top: 0;"]').text()
        if text:
            advert.main_text = text

        # Text of the block that carries district, price, phone and rooms.
        adv = g.doc.select('//p[@class="phone"]').text()
        for district in SUB_AVIS:
            if district in adv:
                advert.sublocality_id = SUB_AVIS[district]
        advert.metro_id = advert.detect_metro_id(self.metro_marker)

        if u'Цена:' in adv:
            # Prefer the "12 000" form, fall back to a single digit run;
            # 1 is the placeholder price when no number can be parsed.
            price_match = (re.search(u'Цена: \d+\s+\d+', adv) or
                           re.search(u'Цена: \d+', adv))
            if price_match:
                price = int(''.join(re.findall(r'\d+', price_match.group())))
            else:
                price = 1
            for currency, field in ((u'грн', 'price_uah'),
                                    (u'у.е.', 'price_usd')):
                if currency in adv:
                    setattr(advert, field, price)

        if u'Тел:' in adv:
            phone_match = re.search(r'\d{9,10}(?=\D|$)',
                                    re.sub(r'[\s\-\(\)]', '', adv))
            # The label may be present without a parsable number.
            if phone_match:
                advert.raw_phones = phone_match.group()

        # Stays None for categories without an extra-details model; the
        # original left the name unbound in that case.
        extra_object = None
        if category_id in [11, 12, 21, 22]:
            extra_object = ExtraFlat()
            rooms_match = re.search(u'Комнат: (\d+)', adv)
            if rooms_match:
                extra_object.rooms_number = int(rooms_match.group(1))
        elif category_id in [14, 24]:
            extra_object = ExtraHouse()
        elif category_id in [16]:
            extra_object = ExtraLot()

        same_adv = Advert.objects.filter(
            category_id=category_id,
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        # The first two characters of each photo URL are dropped
        # (presumably a leading "//" -- TODO confirm against the site).
        img = [node.text()[2:] for node in g.doc.select(
            '//li[@class="span2"]/a[@class="thumbnail"]/@href')]
        img.extend(node.text()[2:] for node in g.doc.select(
            '//div[@class="item active"]/img/@src'))

        for photo_link in img:
            q = Grab()
            sleep(0.2)
            try:
                q.go(photo_link)
            except GrabNetworkError:
                # Best effort: skip a photo that cannot be downloaded.
                continue
            if q.response.code != 200 or not re.match(
                    'image/', q.response.headers['Content-Type']):
                # Previously this case crashed with IndexError.
                continue
            photo_url = q.config['url']
            file_name = '%s.%s' % (hashlib.md5(photo_url).hexdigest(),
                                   RE_EXTENTION.search(photo_url).group())
            photo = Photo(advert_id=advert.id)
            photo.photo.save(file_name, ContentFile(q.response.body))

        if extra_object is not None:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def task_collect_adv_data(self, grab, task):
    """Scrape every advert linked from a listing page and store new ones.

    Advert URLs are the ``href`` values of ``<a class="avstd">`` elements on
    the page held by ``grab``.  Each advert page is downloaded (with a 0.5 s
    pause before it) and parsed into an ``Advert``.  If an advert with the
    same category, author, city and link is already stored, nothing new is
    saved; its ``date_of_update`` is refreshed when it is older than 20
    hours.  Otherwise the advert, its photos and its category-specific
    extra object are saved.

    Args:
        grab: the fetched listing page; only ``grab.doc`` is read.
        task: task object; ``task.get('addition')['category']`` is the key
            looked up in ``CATEGORIES``.
    """

    def digits(pattern, chunk):
        # Concatenate every digit run found inside the matches of `pattern`;
        # yields '' when the pattern does not match.
        return ''.join(re.findall(r'\d+', ''.join(re.findall(pattern, chunk))))

    self.stats['taken'] += 1
    one = [node.text()
           for node in grab.doc.select('//a[@class="avstd"]/@href')]

    for one_adv in one:
        sleep(0.5)
        self.stats['processed'] += 1
        category_id = CATEGORIES[task.get('addition')['category']]

        g = Grab()
        g.go(one_adv)

        advert = Advert()
        advert.category_id = category_id
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = one_adv
        advert.title = g.doc.select('//h1').text()[:90]

        # The description is the cell carrying the "Дополнительно : " label.
        cell_texts = [cell.text() for cell in g.doc.select('//td[@colspan="2"]')]
        for cell_text in cell_texts:
            if u'Дополнительно : ' in cell_text:
                advert.main_text = re.sub(u'Дополнительно : ', '', cell_text)
        advert.sublocality_id = advert.detect_sublocality_id(
            self.sublocality_marker)
        advert.metro_id = advert.detect_metro_id(self.metro_marker)

        prise = g.doc.select('//font[@size="3"]/b').text()
        try:
            advert.price_uah = int(re.sub(r' ', '', prise))
        except ValueError:
            # Non-numeric price text: use the same placeholder value that
            # create_advert uses for a missing price instead of crashing.
            advert.price_uah = 1

        mayn = ''.join([row.text() for row in g.doc.select(
            '//table[@border="0"][@cellpadding="2"][@cellspacing="0"][@align="center"][@width="100%"]/tr'
        )])
        # Three layouts of the phone line; a later match overrides an
        # earlier one.
        phone = re.findall(u'Teлефоны : \d+\-\d+\-\d+', mayn)
        phone2 = re.findall(u'Teлефон : \d+\-\d+\-\d+', mayn)
        phone3 = re.findall(u'Teлефоны : \d+\-\d+\, \d+\-\d+\-\d+', mayn)
        if phone:
            advert.raw_phones = ''.join(re.findall(r'\d+', ''.join(phone)))
        if phone2:
            advert.raw_phones = ''.join(re.findall(r'\d+', ''.join(phone2)))
        if phone3:
            phones = re.sub(r'\s|-', '', ''.join(phone3))
            advert.raw_phones = ''.join(re.findall(r'\d{8,12}', phones))

        objects = [table.text() for table in g.doc.select(
            '//table[@border="0"][@cellpadding="5"][@cellspacing="0"][@width="100%"]'
        )]

        # Stays None for categories without an extra-details model; the
        # original left the name unbound in that case.
        extra_object = None
        if category_id in [11, 21]:
            extra_object = ExtraFlat()
            for chunk in objects:
                if u'Комнат / тип: ' in chunk:
                    extra_object.rooms_number = digits(
                        u'Комнат / тип: \d+', chunk)
                if u'Этаж/этажность: ' in chunk:
                    extra_object.floor = digits(u'Этаж/этажность: \d+', chunk)
                if u' общая' in chunk:
                    extra_object.total_area = digits(u'\d+ \- общая', chunk)
        elif category_id in [14, 24]:
            extra_object = ExtraHouse()
            for chunk in objects:
                if u' общая' in chunk:
                    extra_object.total_area = digits(u'\d+ \- общая', chunk)
                if u'Этажность ' in chunk:
                    # NOTE(review): this label looks like a storey count, so
                    # `floors` may be the intended attribute -- confirm
                    # against the ExtraHouse model before changing.
                    extra_object.floor = digits(u'Этажность \d+', chunk)
        elif category_id in [16]:
            extra_object = ExtraLot()
            for chunk in objects:
                if u'Площадь : ' in chunk:
                    extra_object.total_area = digits(u'Площадь : \d+', chunk)

        sleep(0.5)
        same_adv = Advert.objects.filter(
            category_id=category_id,
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            stale_before = timezone.now() - datetime.timedelta(hours=20)
            if same_adv.date_of_update < stale_before:
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue

        advert.save()

        # Thumbnail paths with the "?t=..." query suffix removed.
        img = [re.sub(r'\?t=\S+', '', node.text()) for node in g.doc.select(
            '//td[@class="tBrd1p"][@align="center"][@valign="middle"]/a/img/@src'
        )]
        for photo_path in img:
            q = Grab()
            photo_link = '%s%s' % (DOMEN, photo_path)
            sleep(0.2)
            try:
                q.go(photo_link)
            except GrabNetworkError:
                # Best effort: skip a photo that cannot be downloaded.
                continue
            if q.response.code != 200 or not re.match(
                    'image/', q.response.headers['Content-Type']):
                # Previously this case crashed with IndexError.
                continue
            photo_url = q.config['url']
            file_name = '%s.%s' % (hashlib.md5(photo_url).hexdigest(),
                                   RE_EXTENTION.search(photo_url).group())
            photo = Photo(advert_id=advert.id)
            photo.photo.save(file_name, ContentFile(q.response.body))

        if extra_object is not None:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1