def fetch(self):
        """Scrape the paginated result list and collect one LandOffer per listing.

        Page 1 is requested from ``self.base_url + '.html'``; page N (N > 1)
        from ``self.base_url + self.next_pages + str(N) + '.html'``. Every
        listing container found on a page is parsed with regular expressions
        for its link, expose id, title, address, price and area. A field
        that cannot be found keeps the placeholder string ``'NA'``.

        Each parsed listing is appended to ``self.results`` as a
        ``LandOffer`` stamped with today's date and ``was_updated=False``.
        Nothing is returned.

        Paging stops when ``self.max_n_of_pages_to_fetch`` pages have been
        requested, when a request raises ``HTTPError``, or when the page
        body contains the site's "page could not be loaded" message.
        """
        # Raw single-quoted patterns: "\W" / "\d" inside a normal string
        # literal are invalid escape sequences and trigger warnings.
        container_re = re.compile(
            r'<div class="grid resultlist-container">.+?'
            r'<div class="grid-item one-whole palm-hide">',
            re.DOTALL)
        title_re = re.compile(
            r'<div class="grid-item one-whole">\W+'
            r'<a class="resultlist-title" href="(.+)">(.+)</a>')
        # Dots are escaped so they only match a literal '.' in the host name.
        id_re = re.compile(r'www\.immobilienscout24\.de/expose/(\d+)')
        address_re = re.compile(
            r'<div class="resultlist-address">\W+<a href=".+">\W+(.+)\W+</a>')
        price_re = re.compile(
            r'<div class="grid-item palm-one-third lap-one-third desk-one-third">\W+'
            r'<span class="resultlist-value">(.+)</span>')
        area_re = re.compile(
            r'<div class="grid-item palm-one-third lap-one-third desk-one-third '
            r'properties-padding">\W+'
            r'<span class="resultlist-value">\W*(.+?)\W*<span>')
        error_page_re = re.compile(
            "Die Seite konnte leider nicht aufgerufen werden", re.IGNORECASE)

        # "+ 1" so that exactly max_n_of_pages_to_fetch pages are requested;
        # range() excludes its upper bound.
        for page_num in range(1, self.max_n_of_pages_to_fetch + 1):
            if page_num == 1:
                url = self.base_url + '.html'
            else:
                url = self.base_url + self.next_pages + str(page_num) + '.html'

            try:
                # The context manager closes the HTTP response even if
                # reading or decoding fails.
                with urllib.request.urlopen(url) as resource:
                    print(url)
                    html = resource.read().decode('utf-8')
            except HTTPError:
                # On page not found error we exit the for loop. It means we
                # retrieved enough pages for today. This error most likely
                # will show up in other cases; for the time being we break
                # out of the loop.
                break

            # search(), not match(): the message sits somewhere inside the
            # document, and match() only looks at the very start of it.
            if error_page_re.search(html):
                break

            for listing in container_re.findall(html):
                link = 'NA'
                id_from_immo24 = 'NA'
                title = 'NA'
                address = 'NA'
                price = 'NA'
                area = 'NA'

                title_match = title_re.search(listing)
                if title_match:
                    # Drop the leading "//" of a protocol-relative link.
                    link = re.sub(r'^//', '', title_match.group(1))
                    title = title_match.group(2)
                    id_match = id_re.search(link)
                    if id_match:
                        id_from_immo24 = id_match.group(1)

                address_match = address_re.search(listing)
                if address_match:
                    address = address_match.group(1)

                price_match = price_re.search(listing)
                if price_match:
                    price = price_match.group(1).replace(' &euro;', '')

                area_match = area_re.search(listing)
                if area_match:
                    area = area_match.group(1)

                self.results.append(
                    LandOffer(self.get_fetcher_name(), id_from_immo24,
                              link, title, address, price, area,
                              time.strftime("%Y-%m-%d"), False))
Beispiel #2
0
 def fetch(self):
     """Append one hard-coded LandOffer to ``self.results``."""
     sink = self.results
     fetcher_name = self.get_fetcher_name()
     sink.append(LandOffer(fetcher_name, "yeruu", "zzxsss", "10/4/2016"))
Beispiel #3
0
 def fetch(self):
     """Append one hard-coded LandOffer to ``self.results``."""
     collected = self.results
     origin = self.get_fetcher_name()
     collected.append(LandOffer(origin, "asdfadfa", "fadsfa", "30/4/2016"))
Beispiel #4
0
def as_offer(dct):
    """Build a LandOffer from a mapping holding the nine offer fields.

    The values are passed positionally in the order listed below; a
    missing key raises ``KeyError``.
    """
    field_order = (
        'source',
        'id_as_from_source',
        'link',
        'title',
        'location',
        'price',
        'area',
        'date_retrieved',
        'was_updated',
    )
    return LandOffer(*(dct[key] for key in field_order))
Beispiel #5
0
#!/usr/bin/env python
import json
import time
from landprices.offer import LandOffer
from io import StringIO


def as_offer(dct):
    """Turn a decoded JSON object into a LandOffer.

    Passed as ``object_hook`` to ``json.load`` further down in this script.
    The nine values are looked up by key and handed to ``LandOffer``
    positionally; a missing key raises ``KeyError``.
    """
    keys_in_constructor_order = [
        'source', 'id_as_from_source', 'link',
        'title', 'location', 'price', 'area',
        'date_retrieved', 'was_updated',
    ]
    return LandOffer(*[dct[name] for name in keys_in_constructor_order])


# Round-trip demo: build two offers, dump them to pretty-printed JSON, read
# the JSON back into LandOffer objects via as_offer, and print each one as a
# tab-separated line.
offers = []
o = LandOffer("source", "id_from_source", "link", "title", "location", "price",
              "area", time.strftime("%Y-%m-%d"))
offers.append(o)
o1 = LandOffer("source1", "id_from_source1", "link1", "title1", "location1",
               "price1", "area1", time.strftime("%Y-%m-%d"))
offers.append(o1)

# Serialise the instance attribute dicts, not the objects themselves.
offers_dict = [offer.__dict__ for offer in offers]
pretty_json_str = json.dumps(offers_dict, sort_keys=True, indent=4)
print(pretty_json_str)

io = StringIO(pretty_json_str)
read_back_offers = json.load(io, object_hook=as_offer)
for o in read_back_offers:
    # Every field goes through str() before joining: concatenating a
    # non-string value (was_updated is compared with `is True` elsewhere in
    # this project) onto a str raises TypeError.
    print("\t".join(str(field) for field in (
        o.source, o.id_as_from_source, o.link, o.title, o.location,
        o.price, o.area, o.date_retrieved, o.was_updated)))
 def fetch(self):
     """Append one hard-coded LandOffer to ``self.results``."""
     bucket = self.results
     name = self.get_fetcher_name()
     bucket.append(LandOffer(name, "gasgf", "fffff", "29/4/2016"))
from landprices.offer import LandOffer

# --- Case 1: two empty lists -------------------------------------------------
old_offers = []
new_offers = []
merged_offers = LandOffer.merge_offers(old_offers, new_offers)
assert len(merged_offers) == 0

# --- Case 2: only old offers, nothing new ------------------------------------
old = LandOffer(
    "source", "id_from_source", "link", "title", "location",
    "price", "area", "date_retrieved", False)
old1 = LandOffer(
    "source1", "id_from_source1", "link1", "title1", "location1",
    "price1", "area1", "date_retrieved1", False)
old2 = LandOffer(
    "source2", "id_from_source2", "link2", "title2", "location2",
    "price2", "area2", "date_retrieved2", False)
old3 = LandOffer(
    "source3", "id_from_source3", "link3", "title3", "location3",
    "price3", "area3", "date_retrieved3", False)
old_offers = [old, old1, old2, old3]
merged_offers = LandOffer.merge_offers(old_offers, new_offers)
assert len(merged_offers) == len(old_offers)

# --- Case 3: one unchanged, one changed and one brand-new offer --------------
# `new` carries the same values as `old`; `new1` shares source and id with
# `old1` but has a different price; `new4` has no counterpart in old_offers.
new = LandOffer(
    "source", "id_from_source", "link", "title", "location",
    "price", "area", "date_retrieved", False)
new1 = LandOffer(
    "source1", "id_from_source1", "link1", "title1", "location1",
    "updated_price1", "area1", "date_retrieved1", False)
new4 = LandOffer(
    "source4", "id_from_source4", "link4", "title4", "location4",
    "price4", "area4", "date_retrieved4", False)
new_offers = [new, new1, new4]
merged_offers = LandOffer.merge_offers(old_offers, new_offers)
assert new1 in merged_offers
assert new4 in merged_offers
assert old1 not in merged_offers

# The merged entry with new1's source and id must be flagged as updated.
for offer in merged_offers:
    if offer.source != new1.source:
        continue
    if offer.id_as_from_source != new1.id_as_from_source:
        continue
    assert offer.was_updated is True

# Exactly one offer (new4) was added on top of the old ones.
assert len(merged_offers) == len(old_offers) + 1

for o in merged_offers:
    print(o.to_tabbed_string())