Ejemplo n.º 1
0
def read_ad_details(ad_id):
    ad = EstateAd.objects.get(pk=ad_id)

    # Now download ad
    if ad.link is None: return

    print "-- Detail: " + ad.link
    ad_html = get_site(ad.link)
    tree = pq(ad_html)

    gallery_links = tree.find('#galerija a')
    for link in gallery_links:
        try:
            image = AdPicture(picture_url=link.attrib["href"])
            image.ad = ad
            image.save()
        except KeyError:  # Missing href
            continue

    ad.description = tree.find('.web-opis').text()

    try:
        ad.administrative_unit = tree.find('.more_info').text().split(
            ' | ')[3].lstrip('Upravna enota:').strip()
    except IndexError:
        pass

    if ad.administrative_unit is None:
        ad.administrative_unit = ""

    try:
        ad.county = tree.find('.more_info').text().split(' | ')[4].lstrip(
            u'Ob\u010dina:').strip()
    except IndexError:
        pass

    if ad.county is None:
        ad.county = ""

    ad.raw_detail_html = ad_html

    ad.save()
Ejemplo n.º 2
0
def read_ad_details(ad_id):
    ad = EstateAd.objects.get(pk=ad_id)

    # Now download ad
    if ad.link is None: return

    print "-- Detail: " + ad.link
    ad_html = get_site(ad.link)
    tree = etree.fromstring(ad_html, etree.HTMLParser())

    gallery_links = tree.xpath('//div[@id="galerija"]/a')
    for link in gallery_links:
        try:
            image = AdPicture(picture_url=link.attrib["href"])
            image.ad = ad
            image.save()
        except KeyError:    # Missing href
            continue

    ad.description = "\n".join(tree.xpath('//div[@class="web-opis"]//text()'))

    try:
        ad.administrative_unit = tree.xpath('//div[@class="main-data"]/table/tr')[3].getchildren()[1].text
    except IndexError:
        pass

    if ad.administrative_unit is None:
        ad.administrative_unit = ""

    try:
        ad.county = tree.xpath('//div[@class="main-data"]/table/tr')[4].getchildren()[1].text
    except IndexError:
        pass

    if ad.county is None:
        ad.county = ""

    ad.raw_detail_html = ad_html

    ad.save()
Ejemplo n.º 3
0
def read_ad_details(ad_id):
    ad = EstateAd.objects.get(pk=ad_id)

    # Now download ad
    if ad.link is None: return

    print "-- Detail: " + ad.link
    ad_html = get_site(ad.link)
    tree = pq(ad_html)

    gallery_links = tree.find('#galerija a')
    for link in gallery_links:
        try:
            image = AdPicture(picture_url=link.attrib["href"])
            image.ad = ad
            image.save()
        except KeyError:    # Missing href
            continue

    ad.description = tree.find('.web-opis').text()

    try:
        ad.administrative_unit = tree.find('.more_info').text().split(' | ')[3].lstrip('Upravna enota:').strip()
    except IndexError:
        pass

    if ad.administrative_unit is None:
        ad.administrative_unit = ""

    try:
        ad.county = tree.find('.more_info').text().split(' | ')[4].lstrip(u'Ob\u010dina:').strip()
    except IndexError:
        pass

    if ad.county is None:
        ad.county = ""

    ad.raw_detail_html = ad_html

    ad.save()
Ejemplo n.º 4
0
    def handle(self, *args, **options):

        parse_queue = Queue()

        for region_num, region_name in REGIONS:
            print " == " + region_name + " == "
            parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num))

            while not parse_queue.empty():
                url = parse_queue.get()
                print "Parsing " + url
                site_html = get_site(url)
                tree = pq(site_html)

                raw_ads = tree('.oglas_container')

                for raw_ad in raw_ads:
                    doc = pq(raw_ad)
                    ad_id = raw_ad.attrib["id"]
                    if EstateAd.objects.filter(ad_id=ad_id).exists():
                        continue

                    ad = EstateAd()
                    ad.region = region_num
                    ad.publish_date = timezone.now()    # We're parsing last 24 hours so set publish date to now
                    ad.ad_id = ad_id
                    ad.title = doc.find('h2 a .title').text()
                    ad.link = BASE_URL + doc.find('h2 a')[0].attrib["href"]

                    data = doc.find('.main-data span')

                    raw_data = {}
                    for data_span in data:
                        name = data_span.attrib["class"]
                        value = data_span.text
                        raw_data[name] = value

                    raw_data['posr'] = doc.find('.posr').remove('.new-label').text()

                    raw_attributes = doc.find('.atributi span')
                    for raw_attribute in raw_attributes:
                        name = raw_attribute.text[:raw_attribute.text.find(':')].lower()
                        value = raw_attribute.find("strong").text
                        raw_data[name] = value

                    ad.raw_data = json.dumps(raw_data)
                    ad.type, ad.building_type = self.parse_type(raw_data)
                    ad.size_m2 = self.parse_size(raw_data)
                    ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2)

                    ad.year_built = self.parse_year(raw_data)
                    ad.floor = raw_data.get("nadstropje", "")

                    ad.short_description = doc.find('.kratek')[0].text
                    ad.author_name = doc.find('.povezave div')[0].attrib["title"]
                    ad.raw_html = etree.tostring(raw_ad)

                    try:
                        ad.save()
                    except IntegrityError as e:
                        print e
                        print "Ad with id %s already exists in database!" % (ad.ad_id, )

                    _tasks.read_ad_details(ad.pk)

                # Grab next page link
                try:
                    next_page_link = BASE_URL + tree.find('#pagination .next')[0].attrib["href"]
                    parse_queue.put(next_page_link)
                except:
                    pass
Ejemplo n.º 5
0
    def handle(self, *args, **options):

        parse_queue = Queue()

        for region_num, region_name in REGIONS:
            print " == " + region_name + " == "
            parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num))

            while not parse_queue.empty():
                url = parse_queue.get()
                print "Parsing " + url
                site_html = get_site(url)
                tree = etree.fromstring(site_html, etree.HTMLParser())

                raw_ads = tree.xpath('//body/div/div/div[@id="content"]//div[@class="oglas_container"]')
                for raw_ad in raw_ads:
                    ad_id = raw_ad.attrib["id"]
                    if EstateAd.objects.filter(ad_id=ad_id).exists():
                        continue

                    ad = EstateAd()
                    ad.region = region_num
                    ad.publish_date = timezone.now()    # We're parsing last 24 hours so set publish date to now
                    ad.ad_id = ad_id
                    ad.title = raw_ad.xpath('div[@class="teksti_container"]/h2/a')[0].text
                    ad.link = BASE_URL + raw_ad.xpath('div[@class="teksti_container"]/h2/a')[0].attrib["href"]

                    data = raw_ad.xpath('div[@class="teksti_container"]/div[@class="main-data"]/span')

                    raw_data = {}
                    for data_span in data:
                        name = data_span.attrib["class"]
                        value = data_span.text
                        raw_data[name] = value

                    raw_attributes = raw_ad.xpath('div[@class="teksti_container"]/div[@class="atributi"]/span')
                    for raw_attribute in raw_attributes:
                        name = raw_attribute.text[:raw_attribute.text.find(':')].lower()
                        value = raw_attribute.find("strong").text
                        raw_data[name] = value

                    ad.raw_data = json.dumps(raw_data)
                    ad.type, ad.building_type = self.parse_type(raw_data)
                    ad.size_m2 = self.parse_size(raw_data)
                    ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2)

                    ad.year_built = self.parse_year(raw_data)
                    ad.floor = raw_data.get("nadstropje", "")

                    ad.short_description = raw_ad.xpath('div[@class="teksti_container"]/div[@class="kratek"]')[0].text
                    ad.author_name = raw_ad.xpath('div[@class="teksti_container"]/div[@class="povezave"]/div')[0].attrib["title"]
                    ad.raw_html = etree.tostring(raw_ad)

                    try:
                        ad.save()
                    except IntegrityError as e:
                        print e
                        print "Ad with id %s already exists in database!" % (ad.ad_id, )

                    _tasks.read_ad_details(ad.pk)

                # Grab next page link
                try:
                    next_page_link = BASE_URL + tree.xpath('//div[@id="pagination" and @class="fr"]/ul/li/a[@class="next"]')[0].attrib["href"]
                    parse_queue.put(next_page_link)
                except:
                    pass
Ejemplo n.º 6
0
    def handle(self, *args, **options):
        """Crawl last-24h listings for every region and store new ads.

        Seeds a work queue with the first results page per region, then
        follows "next page" links until the queue drains.  Each unseen
        ad is parsed, saved, and handed to read_ad_details for the
        detail-page fields.
        """
        parse_queue = Queue()

        for region_num, region_name in REGIONS:
            print(" == " + region_name + " == ")
            parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num))

            while not parse_queue.empty():
                url = parse_queue.get()
                print("Parsing " + url)
                site_html = get_site(url)
                tree = pq(site_html)

                raw_ads = tree('.oglas_container')

                for raw_ad in raw_ads:
                    doc = pq(raw_ad)
                    ad_id = raw_ad.attrib["id"]
                    # Skip ads we already have in the database.
                    if EstateAd.objects.filter(ad_id=ad_id).exists():
                        continue

                    ad = EstateAd()
                    ad.region = region_num
                    ad.publish_date = timezone.now()    # We're parsing last 24 hours so set publish date to now
                    ad.ad_id = ad_id
                    ad.title = doc.find('h2 a .title').text()
                    ad.link = BASE_URL + doc.find('h2 a')[0].attrib["href"]

                    data = doc.find('.main-data span')

                    # Collect class -> text pairs verbatim for raw_data.
                    raw_data = {}
                    for data_span in data:
                        name = data_span.attrib["class"]
                        value = data_span.text
                        raw_data[name] = value

                    raw_data['posr'] = doc.find('.posr').remove('.new-label').text()

                    # Attribute spans look like "Name: <strong>value</strong>".
                    raw_attributes = doc.find('.atributi span')
                    for raw_attribute in raw_attributes:
                        name = raw_attribute.text[:raw_attribute.text.find(':')].lower()
                        strong_container = raw_attribute.find("strong")
                        if strong_container is not None:
                            # Reuse the element already found instead of
                            # running find("strong") a second time.
                            raw_data[name] = strong_container.text

                    ad.raw_data = json.dumps(raw_data)
                    ad.type, ad.building_type = self.parse_type(raw_data)
                    ad.size_m2 = self.parse_size(raw_data)
                    ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2)

                    ad.year_built = self.parse_year(raw_data)
                    ad.floor = raw_data.get("nadstropje", "")

                    ad.short_description = doc.find('.kratek')[0].text
                    ad.author_name = doc.find('.povezave div')[0].attrib["title"]
                    ad.raw_html = etree.tostring(raw_ad)

                    try:
                        ad.save()
                    except IntegrityError as e:
                        print(e)
                        print("Ad with id %s already exists in database!" % (ad.ad_id, ))

                    _tasks.read_ad_details(ad.pk)

                # Grab next page link; a missing link means the last page.
                # Narrowed from a bare except: only "no element" / "no href"
                # are expected here, anything else should surface.
                try:
                    next_page_link = BASE_URL + tree.find('#pagination .next')[0].attrib["href"]
                    parse_queue.put(next_page_link)
                except (IndexError, KeyError):
                    pass