Beispiel #1
0
    def task_collect_adv_data(self, grab, task):
        """Scrape the adverts linked from a listing page and store new ones.

        Advert links are extracted from every ``href`` of the listing page
        held by ``grab``.  Each advert page is downloaded with a fresh
        ``Grab`` instance and parsed into an ``Advert`` plus, for some
        categories, an ``ExtraFlat``/``ExtraHouse``/``ExtraLot`` record; the
        advert photos are downloaded and attached as ``Photo`` objects.

        An advert already stored under the same category/author/city/link is
        not downloaded again; its ``date_of_update`` is refreshed when it is
        older than 20 hours.

        :param grab: Grab object holding the downloaded listing page.
        :param task: task whose ``addition`` entry carries the category key
            used to index ``CATEGORIES``.
        """
        info_row = "//table[@class='adv_info_table']/tr[contains(.,'%s')]"

        def row_value(doc, marker, label):
            # Text of the first info-table row mentioning `marker`, with the
            # `label` prefix removed.
            return re.sub(label, "", doc.select(info_row % marker).text())

        def leading_number(value):
            # First run of digits in `value`, or None when there is none.
            found = re.search(r'\d+', value)
            return found.group() if found else None

        self.stats['taken'] += 1
        hrefs = [node.text() for node in grab.doc.select('//@href')]
        # NOTE(review): the [0::3] stride presumably relies on every advert
        # being linked three times in a row on the listing page - confirm.
        url_adv = re.findall(r'adv-\d+\.\w+', ','.join(hrefs))[0::3]
        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            category_id = CATEGORIES[addition['category']]
            link = DOMEN + one_adv

            # The duplicate lookup only needs values known at this point, so
            # it runs before the advert page is downloaded and parsed.
            same_adv = Advert.objects.filter(
                category_id=category_id,
                author_id=self.author_id,
                city_id=self.city_id,
                link=link,
            ).first()
            if same_adv is not None:
                self.stats['omited'] += 1
                stale_before = timezone.now() - datetime.timedelta(hours=20)
                if same_adv.date_of_update < stale_before:
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue

            g = Grab()
            g.go(link)
            advert = Advert()
            advert.category_id = category_id
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = link
            categories = g.doc.select(
                '//table[@class="adv_info_table"]').text()

            title = g.doc.select('//h2[@class="pagetitle"]').text()
            if title:
                advert.title = title

            # A missing price, or one without any digit, is stored as 1.
            price = g.doc.select('//td[@class="adv-price"]').text()
            price_digits = ''.join(re.findall(r'\d+', price))
            advert.price_uah = int(price_digits) if price_digits else 1

            if u"Описание:" in categories:
                text = g.doc.select(
                    '//td[@style="border-bottom:none;"][@colspan="2"]').text()
                if text:
                    advert.main_text = text
            if u"Телефон:" in categories:
                phones = row_value(g.doc, u'Телефон', u'Телефон:')
                if phones:
                    advert.raw_phones = re.sub(r'\-', "", phones)
            if u"Имя, фамилия:" in categories:
                contact = row_value(g.doc, u'Имя', u'Имя, фамилия:')
                if contact:
                    advert.contact_name = contact

            # Reset for every advert so a category without an extra record
            # neither raises nor reuses the object of a previous advert.
            extra_object = None
            if category_id in [21, 11, 12]:
                extra_object = ExtraFlat()
                if u"Этажность" in categories:
                    floors = row_value(g.doc, u'Этажность', u'Этажность ')
                    if floors:
                        extra_object.floors = floors
                # "Этаж" is a prefix of "Этажность": only accept a row that
                # mentions the former without the latter, otherwise the
                # floors row would be stored as the floor.
                floor_rows = g.doc.select(
                    "//table[@class='adv_info_table']"
                    "/tr[contains(.,'%s')][not(contains(.,'%s'))]"
                    % (u'Этаж', u'Этажность'))
                if floor_rows:
                    floor = re.sub(u'Этаж ', "", floor_rows.text())
                    if floor:
                        extra_object.floor = floor
                if u"Количество комнат" in categories:
                    rooms_number = row_value(
                        g.doc, u'Количество комнат', u'Количество комнат ')
                    if rooms_number:
                        extra_object.rooms_number = rooms_number
            elif category_id in [14]:
                extra_object = ExtraHouse()
                if u"Этажность" in categories:
                    floors = row_value(g.doc, u'Этажность', u'Этажность ')
                    if floors:
                        extra_object.floors = floors
            elif category_id in [16]:
                extra_object = ExtraLot()
            # Every kind of extra record stores the total area the same way.
            if extra_object is not None and u"Общая площадь" in categories:
                area = leading_number(
                    row_value(g.doc, u'Общая площадь', u'Общая площадь '))
                if area is not None:
                    extra_object.total_area = area

            if self.metro_marker:
                metroc = advert.detect_metro_id(self.metro_marker)
                if metroc:
                    advert.metro_id = metroc
                # An explicit "Метро" row overrides the detected station.
                if u"Метро" in categories:
                    metro = row_value(g.doc, u'Метро', u'Метро ')
                    if metro:
                        advert.metro_id = METRO_PREM[metro]
            if self.sublocality_marker:
                subloc = advert.detect_sublocality_id(self.sublocality_marker)
                if subloc:
                    advert.sublocality_id = subloc
                # An explicit "Район" row overrides the detected district.
                if u"Район" in categories:
                    subloc = row_value(g.doc, u'Район', u'Район ')
                    if subloc:
                        advert.sublocality_id = SUB_PREM[subloc]

            advert.save()

            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            img = [node.text() for node in g.doc.select(
                '//a[@data-lightbox="advertisement-images"]/@href')]
            for photo_path in img:
                photo_link = '%s%s' % (DOMEN2, photo_path)
                sleep(0.2)
                # A photo that cannot be downloaded, or is not an image, is
                # skipped so the advert still gets its extra record saved.
                try:
                    photo_grab.go(photo_link)
                except GrabNetworkError:
                    continue
                response = photo_grab.response
                if not (response.code == 200 and
                        re.match('image/', response.headers['Content-Type'])):
                    continue
                extention = RE_EXTENTION.search(
                    photo_grab.config['url']).group()
                try:
                    digest = hashlib.md5(photo_grab.config['url']).hexdigest()
                except UnicodeEncodeError:
                    # Fall back to the 8-digit id embedded in the photo path.
                    photo_id = re.search(r'\d{8}', photo_path)
                    if photo_id is None:
                        continue
                    digest = hashlib.md5(photo_id.group()).hexdigest()
                file_name = '%s.%s' % (digest, extention)
                photo = Photo(advert_id=advert.id)
                photo.photo.save(file_name, ContentFile(response.body))

            if extra_object is not None:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
Beispiel #2
0
    def task_collect_adv_data(self, grab, task):
        """Scrape the adverts linked from a listing page and store new ones.

        Advert links are the ``href`` values of the listing page held by
        ``grab`` that start with ``/nedvizhimost/<city>-``.  Each advert page
        is downloaded with a fresh ``Grab`` instance and parsed into an
        ``Advert`` plus, for some categories, an
        ``ExtraFlat``/``ExtraHouse``/``ExtraLot`` record; the advert photos
        are downloaded and attached as ``Photo`` objects.

        An advert already stored under the same category/author/city/link is
        not downloaded again; its ``date_of_update`` is refreshed when it is
        older than 20 hours.

        :param grab: Grab object holding the downloaded listing page.
        :param task: task whose ``addition`` entry carries the category key
            used to index ``CATEGORIES``.
        """
        summary_row = "//div[@class='kratkost']/p[contains(.,'%s')]"
        # Station names mapped to a fixed id instead of a METRO_CIUA lookup.
        metro_overrides = {
            u'Дворец Спорта': 76,
            u'Дружбы Народов': 79,
            u'Красный Хутор': 87,
            u'Демеевская': 66,
            u'Советской армии': 21,
            u'Маршала Жукова': 12,
            u'Метростроителей им. Ващенко': 13,
            u'им. А.С. Масельского': 9,
        }

        def summary(doc, marker):
            # Text of the first summary paragraph mentioning `marker`.
            return doc.select(summary_row % marker).text()

        def after_colon(value):
            # Everything two characters past the first ":" of `value`.
            return value[value.find(":") + 2:]

        self.stats['taken'] += 1
        # NOTE(review): the fixed slice offsets presumably split the DOMEN
        # entry into the site root and the city slug - confirm against DOMEN.
        city_slug = DOMEN[self.city][29:]
        site_root = DOMEN[self.city][:15]
        # Kharkov advert links use a different transliteration of the city.
        link_prefix = '/nedvizhimost/%s-' % (
            'xarkov' if city_slug == 'kharkov' else city_slug)
        url_adv = []
        for node in grab.doc.select('//@href'):
            href = node.text()
            if href.startswith(link_prefix):
                url_adv.append(href)

        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            category_id = CATEGORIES[addition['category']]
            link = site_root + one_adv

            # The duplicate lookup only needs values known at this point, so
            # it runs before the advert page is downloaded and parsed.
            same_adv = Advert.objects.filter(
                category_id=category_id,
                author_id=self.author_id,
                city_id=self.city_id,
                link=link,
            ).first()
            if same_adv is not None:
                self.stats['omited'] += 1
                stale_before = timezone.now() - datetime.timedelta(hours=20)
                if same_adv.date_of_update < stale_before:
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue

            g = Grab()
            g.go(link)
            advert = Advert()
            advert.category_id = category_id
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = link

            categories = g.doc.select(
                '//div[@id="content_objectTabWidgetinfo_tab"]').text()

            title = g.doc.select('//h1').text()
            if title:
                advert.title = title
            price_text = g.doc.select('//p[@itemprop="average"]').text()
            price = ''.join(ch for ch in price_text if ch in '0123456789')
            # A price paragraph without any digit leaves the price unset.
            if price:
                advert.price_uah = int(price)
            text = g.doc.select('//div[@class="objava_define"]').text()
            if text:
                advert.main_text = text
            phones = g.doc.select('//p[@class="tel_user_obj tel"]').text()
            if phones:
                advert.raw_phones = phones
            contact = g.doc.select('//a[@class="ceeboxAuto"]').text()
            if contact:
                advert.contact_name = contact

            # Reset for every advert so a category without an extra record
            # neither raises nor reuses the object of a previous advert.
            extra_object = None
            if category_id in [21, 11, 24, 27, 17]:
                extra_object = ExtraFlat()
                if u"Этаж" in categories:
                    # The value is read as "<floor>/<floors>"; without a
                    # slash only the floor is stored.
                    both = after_colon(summary(g.doc, u'Этаж'))
                    floor, slash, floors = both.partition("/")
                    extra_object.floor = floor
                    if slash:
                        extra_object.floors = floors
                if u"Комнат" in categories:
                    # Only the first character of the value is kept.
                    extra_object.rooms_number = after_colon(
                        summary(g.doc, u'Комнат'))[:1]
                if u"Общая" in categories:
                    # Keep the value up to the first space.
                    extra_object.total_area = after_colon(
                        summary(g.doc, u'Общая')).split(" ", 1)[0]
            elif category_id in [14]:
                extra_object = ExtraHouse()
                if u"Этажей" in categories:
                    extra_object.floors = summary(g.doc, u'Этажей')[8:]
                if u"Общая" in categories:
                    extra_object.total_area = after_colon(
                        summary(g.doc, u'Общая')).split(" ", 1)[0]
            elif category_id in [16, 26]:
                extra_object = ExtraLot()
                if u"Площадь" in categories:
                    extra_object.total_area = after_colon(
                        summary(g.doc, u'Площадь'))
                # Both markers have to be tested against the page text; a
                # bare string literal on the left of `or` is always true.
                if (u"Под ком. заст." in categories or
                        u"Под жил. заст." in categories):
                    extra_object.intended_purpose = summary(g.doc, u'Под')

            if u"Метро" in categories:
                metro_text = summary(g.doc, u'Метро')
                metro = metro_text[7:metro_text.find("-") - 2]
                if metro in metro_overrides:
                    advert.metro_id = metro_overrides[metro]
                else:
                    advert.metro_id = METRO_CIUA[metro]

            summary_text = g.doc.select("//div[@class='kratkost']").text()
            if summary_text:
                # The district follows the first comma; Kharkov pages carry
                # a longer prefix in front of it.
                skip = 11 if city_slug == 'kharkov' else 2
                fulladress = summary_text[summary_text.find(",") + skip:]
                subloc = fulladress[:fulladress.find(",")]
                advert.sublocality_id = SUB_CIUA[subloc]

            advert.save()

            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            img = []
            for node in g.doc.select('//@href'):
                href = node.text()
                if href.startswith("/pic/objects/"):
                    img.append(href)
            for photo_path in img:
                photo_link = '%s%s' % (site_root, photo_path)
                sleep(0.2)
                # A photo that cannot be downloaded, or is not an image, is
                # skipped so the advert still gets its extra record saved.
                try:
                    photo_grab.go(photo_link)
                except GrabNetworkError:
                    continue
                response = photo_grab.response
                if not (response.code == 200 and
                        re.match('image/', response.headers['Content-Type'])):
                    continue
                extention = RE_EXTENTION.search(
                    photo_grab.config['url']).group()
                try:
                    digest = hashlib.md5(photo_grab.config['url']).hexdigest()
                except UnicodeEncodeError:
                    # Fall back to a fixed slice of the photo path.
                    digest = hashlib.md5(photo_path[22:54]).hexdigest()
                file_name = '%s.%s' % (digest, extention)
                photo = Photo(advert_id=advert.id)
                photo.photo.save(file_name, ContentFile(response.body))

            if extra_object is not None:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
Beispiel #3
0
    def task_collect_adv_data(self, grab, task):
        """Scrape the adverts linked from a listing page and store new ones.

        Advert links (``/offers/<digits>``) are extracted from every ``href``
        of the listing page held by ``grab``.  Each advert page is downloaded
        with a fresh ``Grab`` instance and parsed into an ``Advert`` plus an
        ``ExtraHouse`` record; the advert photos are downloaded and attached
        as ``Photo`` objects.

        An advert already stored under the same category/author/city/link is
        not downloaded again; its ``date_of_update`` is refreshed when it is
        older than 20 hours.

        :param grab: Grab object holding the downloaded listing page.
        :param task: task whose ``addition`` entry carries the category key
            used to index ``CATEGORIES``.
        """
        def number_after(label, haystack):
            # Digits following "<label> - " in `haystack`, or None.
            found = re.search(label + r'\s+\-\s+(\d+)', haystack)
            return found.group(1) if found else None

        self.stats['taken'] += 1
        hrefs = [node.text() for node in grab.doc.select('//@href')]
        # Computed once after all hrefs are collected; the separator keeps
        # digits of adjacent hrefs from merging into a single match.
        # NOTE(review): the [::3] stride presumably relies on every advert
        # being linked three times in a row on the listing page - confirm.
        offer_links = re.findall(r'/offers/\d+', ','.join(hrefs))[::3]
        for one_adv in offer_links:
            self.stats['processed'] += 1
            addition = task.get('addition')
            category_id = CATEGORIES[addition['category']]
            link = DOMEN + one_adv

            # The duplicate lookup only needs values known at this point, so
            # it runs before the advert page is downloaded and parsed.
            same_adv = Advert.objects.filter(
                category_id=category_id,
                author_id=self.author_id,
                city_id=self.city_id,
                link=link,
            ).first()
            if same_adv is not None:
                self.stats['omited'] += 1
                stale_before = timezone.now() - datetime.timedelta(hours=20)
                if same_adv.date_of_update < stale_before:
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue

            g = Grab()
            g.go(link)
            advert = Advert()
            advert.category_id = category_id
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = link
            title = g.doc.select('//article[@class="article"]/h1').text()
            if title:
                advert.title = title[:90]
            text = g.doc.select('//div[@class="box"]/p').text()
            if text:
                advert.main_text = text
            advert.sublocality_id = advert.detect_sublocality_id(
                self.sublocality_marker)
            advert.metro_id = advert.detect_metro_id(self.metro_marker)
            phone = g.doc.select('//div[@class="col-xs-6 col-md-5"]').text()
            if phone:
                advert.raw_phones = '0' + ''.join(re.findall(r'\d+', phone))
            prise = g.doc.select('//div[@class="col-xs-12 col-md-8"]').text()
            price_digits = ''.join(re.findall(r'\d+', prise))
            # A price block without any digit leaves the price unset.
            if price_digits:
                advert.price_uah = int(price_digits)

            # TODO: continue here (note left by the original author).

            # Strip the title and description from the details cell so only
            # the attribute list is searched.
            details = g.doc.select(
                '//td[@valign="top"][@width="100%"]').text()
            details = details.replace(advert.title or u'', '')
            details = details.replace(advert.main_text or u'', '')

            extra_object = ExtraHouse()
            total_area = number_after(u'Площадь дома', details)
            if total_area is not None:
                extra_object.total_area = total_area
            lot_area = number_after(u'Площадь участка', details)
            if lot_area is not None:
                extra_object.lot_area = lot_area
            floors = number_after(u'Этажей', details)
            if floors is not None:
                extra_object.floors = floors

            advert.save()

            img = [node.text()
                   for node in g.doc.select('//div[@class="thumb"]/a/@href')]
            for photo_url in img:
                q = Grab()
                sleep(0.2)
                # A photo that cannot be downloaded, or is not an image, is
                # skipped.
                try:
                    q.go(photo_url)
                except GrabNetworkError:
                    continue
                if not (q.response.code == 200 and
                        re.match('image/',
                                 q.response.headers['Content-Type'])):
                    continue
                file_name = '%s.%s' % (
                    hashlib.md5(q.config['url']).hexdigest(),
                    RE_EXTENTION.search(q.config['url']).group(),
                )
                photo = Photo(advert_id=advert.id)
                photo.photo.save(file_name, ContentFile(q.response.body))

            extra_object.advert = advert
            extra_object.save()
            self.stats['saved'] += 1
Beispiel #4
0
    def task_collect_adv_data(self, grab, task):
        # print "////////collect adv data////////////"
        # if there is no phone it doesn't make sense to take other data
        sleep(1)
        self.stats['taken'] += 1
        if grab.doc.select("//li[%s]" % xpcs('link-phone')):
            addition = task.get('addition')

            # print '////////create new Advert object///////////'
            advert = Advert()
            advert.category_id = CATEGORIES[addition['category']]
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = RE_ADV_LINK.search(grab.config['url']).group()
            advert.title = grab.doc.select("//h1").text()
            price = grab.doc.select("//div[%s]/strong" %
                                    xpcs('pricelabel')).text()
            currency = 'uah' if u'грн' in price else 'usd'
            if currency == 'uah':
                advert.price_uah = int(RE_NON_DIGIT.sub('', price))
                advert.price_usd = advert.price_uah / USD_UAH
            else:
                advert.price_usd = int(RE_NON_DIGIT.sub('', price))
                advert.price_uah = advert.price_usd * USD_UAH
            # think about location
            advert.main_text = grab.doc.select(
                "//div[@id='textContent']/p").text()
            address = grab.doc.select("//span[%s]/strong" %
                                      xpcs('show-map-link')).text().split(',')
            if len(address) > 3:
                advert.street = address[3]
            if self.city_id == 8 and len(address) >= 3:
                kiev_big_subloc = BigSublocality.objects.filter(
                    name__startswith=address[2].strip()).first()
                if kiev_big_subloc:
                    advert.big_sublocality_id = kiev_big_subloc.id

            extra_action = None
            if advert.category_id in (21, 22, 24, 26, 27):
                extra_action = ExtraRent()
            if advert.category_id in (11, 12, 21, 22):
                extra_object = ExtraFlat()
            if advert.category_id in (14, 24):
                extra_object = ExtraHouse()
            if advert.category_id in (16, 26):
                extra_object = ExtraLot()
            if advert.category_id in (17, 27):
                extra_object = ExtraCommercial()

            if addition['category'] in ('arenda-kvartir', 'prodazha-kvartir',
                                        'prodazha-komnat'):
                rooms_number = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Количество комнат')
                if rooms_number:
                    extra_object.rooms_number = RE_DIGIT.search(
                        rooms_number.select(
                            ".//td[@class='value']").text()).group()
                total_area = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Общая площадь')
                if total_area:
                    extra_object.total_area = RE_DIGIT.search(
                        total_area.select(
                            ".//td[@class='value']").text()).group()
                floor = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" % u'Этаж')
                if floor:
                    extra_object.floor = RE_DIGIT.search(
                        floor.select(".//td[@class='value']").text()).group()
                floors = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Этажность дома')
                if floors:
                    extra_object.floors = RE_DIGIT.search(
                        floors.select(".//td[@class='value']").text()).group()

            if 'arenda' in addition['category']:
                rent_term = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Тип аренды')
                rent_term = rent_term.select(".//td[@class='value']").text(
                ) if rent_term else u'Долгосрочная аренда'
                extra_action.term = 2 if u'Долгосрочная' in rent_term else 1

            if addition['category'] == 'arenda-komnat':
                rooms_number = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Всего комнат')
                if rooms_number:
                    extra_object.rooms_number = RE_DIGIT.search(
                        rooms_number.select(
                            ".//td[@class='value']").text()).group()

            if addition['category'] in ('arenda-domov', 'prodazha-domov'):
                total_area = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Площадь дома')
                if total_area:
                    extra_object.total_area = RE_DIGIT.search(
                        total_area.select(
                            ".//td[@class='value']").text()).group()

            if addition['category'] == 'prodazha-domov':
                house_type = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" % u'Тип дома')
                house_type = house_type.select(
                    ".//td[@class='value']").text() if house_type else None
                extra_object.house_type = 2 if house_type == u'Продажа дач' else 1

            if addition['category'] == 'prodazha-kvartir':
                building = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Тип квартиры')
                building = building.select(
                    ".//td[@class='value']").text() if building else ''
                if u'Новостройки' in building:
                    extra_object.new_building = True

            if addition['category'] == 'prodazha-zemli':
                lot_purpose = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" %
                    u'Тип участка')
                lot_purpose = lot_purpose.select(
                    ".//td[@class='value']").text() if lot_purpose else ''
                if u'сад / огород' in lot_purpose:
                    extra_object.intended_purpose = 'садоводство'
                elif u'индивидуальное строительство' in lot_purpose:
                    extra_object.intended_purpose = 'под застройку'
                elif u'сельскохозяйственного назначения' in lot_purpose:
                    extra_object.intended_purpose = u'ОСГ(особисте селянське господарство)'
                elif u'промышленного назначения' in lot_purpose:
                    extra_object.intended_purpose = u'коммерческого назначения'
                lot_area = grab.doc.select(
                    "//table[@class='item'][contains(., '%s')]" % u'Площадь')
                lot_area = RE_DIGIT.search(
                    lot_area.select(".//td[@class='value']").text()).group(
                    ) if lot_area else None
                if lot_area:
                    extra_object.lot_unit = u'соток'

            photo_links = grab.doc.select("//img[%s]" % xpcs('bigImage'))
            photos = []
            # print '//////amount of photos %s/////////' % len(photo_links)
            if photo_links:
                photo_grab = grab.clone()
                photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
                sleep(0.2)
                photo_links2 = []
                for photo_link in photo_links:
                    try:
                        photo_grab.go(photo_link.attr('src'))
                        if photo_grab.response.code == 200 and \
                               re.match('image/', photo_grab.response.headers['Content-Type']):
                            photos.append({
                                'body':
                                photo_grab.response.body,
                                'extention':
                                RE_EXTENTION.search(
                                    photo_grab.config['url']).group()
                            })
                    except GrabNetworkError as error:
                        # print('////error while taking photo////')
                        photo_links2.append(photo_link)
                # print('////one more try///')
                # print(len(photo_links2))
                for photo_link in photo_links2:
                    photo_grab.go(photo_link.attr('src'))
                    if photo_grab.response.code == 200 and \
                           re.match('image/', photo_grab.response.headers['Content-Type']):
                        photos.append({
                            'body':
                            photo_grab.response.body,
                            'extention':
                            RE_EXTENTION.search(
                                photo_grab.config['url']).group()
                        })

            phone_raw = self.take_phone(grab)
            phone_in_text = advert.detect_phone()
            if phone_raw and phone_in_text:
                phone_raw = phone_raw + ',' + ','.join(phone_in_text)
            elif phone_in_text:
                phone_raw = ','.join(phone_in_text)
            if phone_raw:
                advert.raw_phones = phone_raw
                subloc = None
                sub_if = grab.doc.select("//strong[@class='c2b small']").text()
                for sub_one in SUB_IF:
                    if sub_one in sub_if:
                        subloc = sub_one
                if subloc is not None:
                    advert.sublocality_id = int(SUB_IF[subloc])
                else:
                    advert.sublocality_id = advert.detect_sublocality_id(
                        self.sublocality_marker)
                if self.metro_marker:
                    advert.metro_id = advert.detect_metro_id(self.metro_marker)
                # print '//////////SAVE ADVERT/////////'
                advert.save()
                self.stats['saved'] += 1
                for i, img in enumerate(photos):
                    photo = Photo(advert_id=advert.id)
                    file_name = '%s.%s' % (hashlib.md5(grab.config['referer'] +
                                                       str(i)).hexdigest(),
                                           img['extention'])
                    photo.photo.save(file_name, ContentFile(img['body']))
                if self.extra_has_values(extra_object):
                    extra_object.advert = advert
                    extra_object.save()
                if extra_action:
                    extra_action.advert = advert
                    extra_action.save()
        else:
            self.stats['without_phone'] += 1
Beispiel #5
0
    def task_collect_adv_data(self, grab, task):
        """Scrape every advert linked from a listing page and store it.

        Collects the unique ``view.php?ad_id=`` links found in ``grab.doc``,
        fetches the print version (for the phone line) and the regular
        version of each advert, fills an ``Advert`` plus an optional
        ``ExtraFlat`` / ``ExtraHouse`` / ``ExtraLot`` object, skips adverts
        already stored under the same category/author/city/link (refreshing
        ``date_of_update`` when it is older than 20 hours) and finally
        downloads the advert photos.

        :param grab: Grab instance holding the loaded listing page.
        :param task: task object; ``task.get('addition')['category']`` is the
            key looked up in ``CATEGORIES_khar`` / ``CATEGORIES_kiev``.
        """

        def detail(doc, label):
            # Text of the details paragraph that mentions ``label``.
            return doc.select(
                "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                % label).text()

        self.stats['taken'] += 1
        # Unique advert links, kept in page order.
        url_adv = []
        for href in grab.doc.select('//@href'):
            link = href.text()
            if link.startswith('view.php?ad_id=') and link not in url_adv:
                url_adv.append(link)
        base_url = DOMEN[:13]
        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            # Reset for every advert: otherwise a category without extras
            # would hit an unbound name or re-use the previous advert's
            # extra object.
            extra_object = None
            advert = Advert()
            if self.city == 'kharkov':
                advert.category_id = CATEGORIES_khar[addition['category']]
            else:
                advert.category_id = CATEGORIES_kiev[addition['category']]
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = base_url + one_adv
            g = Grab()
            # The phone line is read from the "print_" version of the page.
            g.go(base_url + "print_" + one_adv)
            phone_line = g.doc.select(
                "//td/p[contains(.,'%s')]" % u'Тел:').text()
            if phone_line:
                advert.raw_phones = phone_line[5:]
            g.go(base_url + one_adv)

            categories = g.doc.select(
                '//div[@style="font-size: 11px;"]').text()

            title = g.doc.select('//h1').text()
            if title:
                advert.title = title
            price_text = g.doc.select('//p[@class="ad-price"]').text()
            if price_text:
                # Keep only the ASCII digits of the price text.
                price = ''.join(
                    ch for ch in price_text if ch in '0123456789')
                if price:
                    if u'грн' in price_text:
                        advert.price_uah = int(price)
                    else:
                        advert.price_usd = int(price)
            description = g.doc.select('//p[@class="ad-desc"]').text()
            if description:
                advert.main_text = description

            if advert.category_id in [21, 11, 27, 17]:
                extra_object = ExtraFlat()
                if u"Этаж" in categories:
                    floor_line = detail(g.doc, u'Этаж')
                    # Value after ": " is split around "/" into
                    # floor and floors.
                    both = floor_line[floor_line.find(":") + 2:]
                    slash = both.find("/")
                    extra_object.floors = both[slash + 2:]
                    extra_object.floor = both[:slash - 1]
                if u"Комнат" in categories:
                    extra_object.rooms_number = g.doc.select(
                        '//p[@class="ad-contacts"]').text()[-1]
                if u"Общая" in categories:
                    area_line = detail(g.doc, u'Общая площадь')
                    # Drops the last six characters after the value.
                    extra_object.total_area = area_line[
                        area_line.find(":") + 2:-6]
            elif advert.category_id in [14, 24]:
                extra_object = ExtraHouse()
                if u"Этажность" in categories:
                    extra_object.floors = detail(g.doc, u'Этаж')[-1]
                if u"Площадь дома" in categories:
                    area_line = detail(g.doc, u'Площадь дома')
                    extra_object.total_area = area_line[
                        area_line.find(":") + 2:-6]
                if u"Площадь участка" in categories:
                    lot_line = detail(g.doc, u'Площадь участка')
                    extra_object.lot_area = lot_line[lot_line.find(":") + 2:]
            elif advert.category_id in [16]:
                extra_object = ExtraLot()
                if u"Площадь участка" in categories:
                    lot_line = detail(g.doc, u'Площадь участка')
                    extra_object.total_area = lot_line[
                        lot_line.find(":") + 2:]
                if u"Под строительство" in categories:
                    extra_object.intended_purpose = g.doc.select(
                        "//ul[@style='list-style-type: none']"
                        "/li[contains(.,'%s')]" % u'Под').text()

            district = g.doc.select("//h3").text()
            if district:
                if "," not in district:
                    # Without a comma only the part before the first
                    # space is used as the lookup key.
                    district = district[:district.find(" ")]
                advert.sublocality_id = SUB_FN[district]
            advert.metro_id = advert.detect_metro_id(self.metro_marker)

            same_adv = Advert.objects.filter(
                category_id=advert.category_id,
                author_id=self.author_id,
                city_id=self.city_id,
                link=advert.link,
            ).first()
            if same_adv:
                self.stats['omited'] += 1
                stale_before = timezone.now() - datetime.timedelta(hours=20)
                if same_adv.date_of_update < stale_before:
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue
            advert.save()

            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            photo_paths = [
                href.text() for href in g.doc.select('//@href')
                if href.text().startswith("./upload/pics/")
            ]
            for photo_path in photo_paths:
                sleep(0.2)
                try:
                    photo_grab.go(base_url + photo_path)
                except GrabNetworkError:
                    # A photo that cannot be downloaded is skipped instead
                    # of aborting the whole advert with an IndexError.
                    continue
                response = photo_grab.response
                if response.code != 200 or not re.match(
                        'image/', response.headers['Content-Type']):
                    continue
                photo_url = photo_grab.config['url']
                extension = RE_EXTENTION.search(photo_url).group()
                try:
                    digest = hashlib.md5(photo_url).hexdigest()
                except UnicodeEncodeError:
                    # Fall back to the file name part of the relative path.
                    digest = hashlib.md5(photo_path[14:-4]).hexdigest()
                file_name = '%s.%s' % (digest, extension)
                Photo(advert_id=advert.id).photo.save(
                    file_name, ContentFile(response.body))
            if extra_object is not None:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
Beispiel #6
0
 def create_advert(self, raw):
     """Create and save an ``Advert`` (with photos and extras) from ``raw``.

     ``raw`` is a mapping describing one advert. Optional keys read here:
     ``priceArr``, ``description``, ``user``, ``user_id``, ``street_name``,
     ``beautiful_url``, ``district_name``, ``metro_station_name`` and the
     flat/house detail keys. ``advert_type_id`` and ``photos`` are accessed
     unconditionally; ``realty_type_name`` is read when the advert type is
     1, 3 or 4.

     Records whose advert type / realty type combination is not handled
     are skipped (counted in ``taken``/``processed`` but not ``saved``);
     previously they failed with a ``NameError`` before anything was saved.

     :param raw: mapping with the advert data.
     """

     def fill_flat(extra):
         # Copy the flat-specific fields that are present in ``raw``.
         if 'floors_count' in raw:
             extra.floors = raw['floors_count']
         if 'floor' in raw:
             extra.floor = raw['floor']
         if 'rooms_count' in raw:
             extra.rooms_number = raw['rooms_count']
         if 'total_square_meters' in raw:
             extra.total_area = raw['total_square_meters']

     def fill_house(extra):
         # Copy the house-specific fields that are present in ``raw``.
         if 'total_square_meters' in raw:
             extra.total_area = raw['total_square_meters']

     self.stats['taken'] += 1
     self.stats['processed'] += 1
     # general fields
     extra_object = None
     titles = None
     adv = Advert()
     adv.city_id = self.city_id
     adv.author_id = self.author_id
     if 'priceArr' in raw:
         if raw['priceArr']['3']:
             adv.price_uah = int(re.sub(r'\s', '', raw['priceArr']['3']))
         else:
             adv.price_uah = 1
     if 'description' in raw:
         adv.main_text = raw['description']
     if 'user' in raw:
         adv.contact_name = raw['user']['name']
     if 'user_id' in raw:
         # The phone is taken from a separate page built from the user id.
         phone_grab = Grab()
         phone_grab.go(URLS_ID % raw['user_id'])
         phone_text = phone_grab.doc.select(
             "//li[@class='fieldWrap']").text()
         adv.raw_phones = re.sub(r'\s|\(|\)|\-', '', phone_text)
     if 'street_name' in raw:
         adv.street = raw['street_name']
     if 'beautiful_url' in raw:
         adv.link = (LINK_DOM % raw['beautiful_url'])

     # (category key, title template) per realty kind for the deal type.
     deal = None
     if raw['advert_type_id'] in [1]:
         deal = {
             'flat': ('prodazha-kvartir', u'Продажа квартиры %s'),
             'house': ('prodazha-domov', u'Продажа дома %s'),
         }
     elif raw['advert_type_id'] in [3, 4]:
         deal = {
             'flat': ('arenda-kvartir', u'Аренда квартиры %s'),
             'house': ('arenda-domov', u'Аренда дома %s'),
         }
     if deal is not None:
         realty = raw['realty_type_name']
         if realty in [u'квартира', u'Квартира']:
             extra_object = ExtraFlat()
             category_key, titles = deal['flat']
             adv.category_id = CATEGORIES[category_key]
             fill_flat(extra_object)
         elif realty in [u'дом', u'Дом']:
             extra_object = ExtraHouse()
             category_key, titles = deal['house']
             adv.category_id = CATEGORIES[category_key]
             fill_house(extra_object)
     if titles is None:
         # Unsupported advert/realty type: there is no category and no
         # title template, so nothing can be stored for this record.
         return

     if 'district_name' in raw:
         adv.sublocality_id = int(SUB_CIUA[raw['district_name']])
         adv.title = (titles % adv.sublocality.name)
     elif 'street_name' in raw:
         adv.title = (titles % raw['street_name'])
     else:
         adv.title = (titles % self.city)
     if self.metro_marker:
         if 'metro_station_name' in raw:
             adv.metro_id = METRO_CIUA[raw['metro_station_name']]
     adv.save()

     grab = Grab()
     photo_grab = grab.clone()
     photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
     for key in raw['photos']:
         key_photo = raw['photos'][key]['file']
         photo_link = (PHOTO_URL % (key_photo.replace('.', 'f.')))
         sleep(0.2)
         try:
             photo_grab.go(photo_link)
         except GrabNetworkError:
             # The original handler appended to an undefined list and so
             # raised NameError; a failed download just skips the photo.
             continue
         response = photo_grab.response
         if response.code != 200 or not re.match(
                 'image/', response.headers['Content-Type']):
             continue
         photo_url = photo_grab.config['url']
         file_name = '%s.%s' % (hashlib.md5(photo_url).hexdigest(),
                                RE_EXTENTION.search(photo_url).group())
         Photo(advert_id=adv.id).photo.save(
             file_name, ContentFile(response.body))
     if extra_object is not None:
         extra_object.advert = adv
         extra_object.save()
     self.stats['saved'] += 1
Beispiel #7
0
 def task_collect_adv_data(self, grab, task):
     """Scrape the adverts linked from a listing page and store new ones.

     Every ``view.<ext>?<param>=<digits>`` link found among the page hrefs
     is fetched from ``DOMEN[self.city]``. Title, text, district, price,
     phone and room count are parsed from the advert page. Adverts already
     stored under the same category/author/city/link are skipped (their
     ``date_of_update`` is refreshed when older than 20 hours); new ones
     are saved together with their photos and the extra object.

     :param grab: Grab instance holding the loaded listing page.
     :param task: task object; ``task.get('addition')['category']`` is the
         key looked up in ``CATEGORIES``.
     """

     def parse_price(text):
         # Digits after "Цена: " (optionally two space-separated groups);
         # 1 when no number follows the label.
         found = re.search(u'Цена: \d+\s+\d+', text) or re.search(
             u'Цена: \d+', text)
         if found is None:
             return 1
         amount = re.sub(u'Цена: ', "", found.group())
         return int(re.sub(' ', "", amount))

     self.stats['taken'] += 1
     one = [href.text() for href in grab.doc.select('//@href')]
     # Debug output kept from the original code; the parenthesised form
     # prints the same under Python 2 and also parses under Python 3.
     print(one)
     print(grab.response.url)
     url_adv = re.findall(r'view\.\w+\?\w+\=\d+', ','.join(one))
     for one_adv in url_adv:
         self.stats['processed'] += 1
         addition = task.get('addition')
         # Reset for every advert so a category without extras neither
         # hits an unbound name nor re-uses the previous advert's object.
         extra_object = None
         g = Grab()
         g.go(DOMEN[self.city] + one_adv)
         category_id = CATEGORIES[addition['category']]
         advert = Advert()
         advert.category_id = category_id
         advert.city_id = self.city_id
         advert.author_id = self.author_id
         # Dot escaped so that only a literal "www." is stripped.
         advert.link = re.sub(r'www\.', "", DOMEN[self.city] + one_adv)
         title = g.doc.select('//div[@class="page-header"]').text()
         if title:
             advert.title = title[:90]
         text = g.doc.select('//p[@style="margin-top: 0;"]').text()
         if text:
             advert.main_text = text
         # The "phone" paragraph is also searched for district, price
         # and room count.
         adv = g.doc.select('//p[@class="phone"]').text()
         for district in SUB_AVIS:
             if district in adv:
                 advert.sublocality_id = SUB_AVIS[district]
         advert.metro_id = advert.detect_metro_id(self.metro_marker)
         if u'Цена:' in adv:
             if u'грн' in adv:
                 advert.price_uah = parse_price(adv)
             if u'у.е.' in adv:
                 advert.price_usd = parse_price(adv)
         if u'Тел:' in adv:
             compact = re.sub(r'[\s\-\(\)]', '', adv)
             phone = re.search(r'\d{9,10}(?=\D|$)', compact)
             # The label may be present without a parsable number.
             if phone is not None:
                 advert.raw_phones = phone.group()
         if category_id in [11, 12, 21, 22]:
             extra_object = ExtraFlat()
             if u'Комнат:' in adv:
                 rooms = re.search(u'Комнат: \d+', adv)
                 if rooms is not None:
                     extra_object.rooms_number = int(
                         re.sub(u'Комнат: ', "", rooms.group()))
         elif category_id in [14, 24]:
             extra_object = ExtraHouse()
         elif category_id in [16]:
             extra_object = ExtraLot()
         same_adv = Advert.objects.filter(
             category_id=category_id,
             author_id=self.author_id,
             city_id=self.city_id,
             link=advert.link,
         ).first()
         if same_adv:
             self.stats['omited'] += 1
             stale_before = timezone.now() - datetime.timedelta(hours=20)
             if same_adv.date_of_update < stale_before:
                 same_adv.date_of_update = timezone.now()
                 same_adv.save()
                 self.stats['date_of_update'] += 1
             continue
         advert.save()
         # The first two characters of every photo address are dropped.
         photo_urls = [
             node.text()[2:] for node in g.doc.select(
                 '//li[@class="span2"]/a[@class="thumbnail"]/@href')
         ]
         photo_urls.extend(
             node.text()[2:] for node in g.doc.select(
                 '//div[@class="item active"]/img/@src'))
         for photo_url in photo_urls:
             q = Grab()
             sleep(0.2)
             try:
                 q.go(photo_url)
             except GrabNetworkError:
                 # A photo that cannot be downloaded is skipped instead
                 # of aborting the task with an IndexError.
                 continue
             if q.response.code != 200 or not re.match(
                     'image/', q.response.headers['Content-Type']):
                 continue
             final_url = q.config['url']
             file_name = '%s.%s' % (hashlib.md5(final_url).hexdigest(),
                                    RE_EXTENTION.search(final_url).group())
             Photo(advert_id=advert.id).photo.save(
                 file_name, ContentFile(q.response.body))
         if extra_object is not None:
             extra_object.advert = advert
             extra_object.save()
         self.stats['saved'] += 1
Beispiel #8
0
 def task_collect_adv_data(self, grab, task):
     """Scrape the adverts linked from a listing page and store new ones.

     Follows every ``a.avstd`` link of the listing, parses title, text,
     price, phones and the category-specific details from the advert
     page. Adverts already stored under the same category/author/city/link
     are skipped (their ``date_of_update`` is refreshed when older than
     20 hours); new ones are saved together with their photos and the
     extra object.

     :param grab: Grab instance holding the loaded listing page.
     :param task: task object; ``task.get('addition')['category']`` is the
         key looked up in ``CATEGORIES``.
     """

     def digits(pattern, text):
         # All digits of every ``pattern`` match in ``text``, concatenated.
         return ''.join(
             re.findall(r'\d+', ''.join(re.findall(pattern, text))))

     self.stats['taken'] += 1
     one = [href.text()
            for href in grab.doc.select('//a[@class="avstd"]/@href')]
     for one_adv in one:
         sleep(0.5)
         self.stats['processed'] += 1
         addition = task.get('addition')
         # Reset for every advert so a category without extras neither
         # hits an unbound name nor re-uses the previous advert's object.
         extra_object = None
         g = Grab()
         g.go(one_adv)
         category_id = CATEGORIES[addition['category']]
         advert = Advert()
         advert.category_id = category_id
         advert.city_id = self.city_id
         advert.author_id = self.author_id
         advert.link = one_adv
         advert.title = g.doc.select('//h1').text()[:90]
         for cell in g.doc.select('//td[@colspan="2"]'):
             cell_text = cell.text()
             if u'Дополнительно : ' in cell_text:
                 advert.main_text = re.sub(
                     u'Дополнительно : ', '', cell_text)
         advert.sublocality_id = advert.detect_sublocality_id(
             self.sublocality_marker)
         advert.metro_id = advert.detect_metro_id(self.metro_marker)
         prise = g.doc.select('//font[@size="3"]/b').text()
         advert.price_uah = int(re.sub(r' ', '', prise))

         rows = ''.join(
             row.text() for row in g.doc.select(
                 '//table[@border="0"][@cellpadding="2"][@cellspacing="0"][@align="center"][@width="100%"]/tr'
             ))
         phone = re.findall(u'Teлефоны : \d+\-\d+\-\d+', rows)
         phone2 = re.findall(u'Teлефон : \d+\-\d+\-\d+', rows)
         phone3 = re.findall(u'Teлефоны : \d+\-\d+\, \d+\-\d+\-\d+', rows)
         # Later patterns override earlier ones when several match.
         if phone:
             advert.raw_phones = ''.join(re.findall(r'\d+', ''.join(phone)))
         if phone2:
             advert.raw_phones = ''.join(
                 re.findall(r'\d+', ''.join(phone2)))
         if phone3:
             phones = re.sub(r'\s|-', '', ''.join(phone3))
             advert.raw_phones = ''.join(re.findall(r'\d{8,12}', phones))

         objects = [
             table.text() for table in g.doc.select(
                 '//table[@border="0"][@cellpadding="5"][@cellspacing="0"][@width="100%"]'
             )
         ]
         if category_id in [11, 21]:
             extra_object = ExtraFlat()
             for details in objects:
                 if u'Комнат / тип: ' in details:
                     extra_object.rooms_number = digits(
                         u'Комнат / тип: \d+', details)
                 if u'Этаж/этажность: ' in details:
                     extra_object.floor = digits(
                         u'Этаж/этажность: \d+', details)
                 if u' общая' in details:
                     extra_object.total_area = digits(
                         u'\d+ \- общая', details)
         elif category_id in [14, 24]:
             extra_object = ExtraHouse()
             for details in objects:
                 if u' общая' in details:
                     extra_object.total_area = digits(
                         u'\d+ \- общая', details)
                 if u'Этажность ' in details:
                     # NOTE(review): the storey count is stored in
                     # ``floor`` (not ``floors``) - confirm this is
                     # intended.
                     extra_object.floor = digits(
                         u'Этажность \d+', details)
         elif category_id in [16]:
             extra_object = ExtraLot()
             for details in objects:
                 if u'Площадь : ' in details:
                     extra_object.total_area = digits(
                         u'Площадь : \d+', details)
         sleep(0.5)
         same_adv = Advert.objects.filter(
             category_id=category_id,
             author_id=self.author_id,
             city_id=self.city_id,
             link=advert.link,
         ).first()
         if same_adv:
             self.stats['omited'] += 1
             stale_before = timezone.now() - datetime.timedelta(hours=20)
             if same_adv.date_of_update < stale_before:
                 same_adv.date_of_update = timezone.now()
                 same_adv.save()
                 self.stats['date_of_update'] += 1
             continue
         advert.save()
         # Image addresses with the "?t=..." suffix removed.
         photo_paths = [
             re.sub(r'\?t=\S+', '', src.text()) for src in g.doc.select(
                 '//td[@class="tBrd1p"][@align="center"][@valign="middle"]/a/img/@src'
             )
         ]
         for photo_path in photo_paths:
             q = Grab()
             photo_link = '%s%s' % (DOMEN, photo_path)
             sleep(0.2)
             try:
                 q.go(photo_link)
             except GrabNetworkError:
                 # A photo that cannot be downloaded is skipped instead
                 # of aborting the task with an IndexError.
                 continue
             if q.response.code != 200 or not re.match(
                     'image/', q.response.headers['Content-Type']):
                 continue
             final_url = q.config['url']
             file_name = '%s.%s' % (hashlib.md5(final_url).hexdigest(),
                                    RE_EXTENTION.search(final_url).group())
             Photo(advert_id=advert.id).photo.save(
                 file_name, ContentFile(q.response.body))
         if extra_object is not None:
             extra_object.advert = advert
             extra_object.save()
         self.stats['saved'] += 1