def fetch(self):
    """Scrape result pages and append one ``LandOffer`` per listing to ``self.results``.

    Page 1 is requested from ``self.base_url + '.html'``; every later page N
    from ``self.base_url + self.next_pages + str(N) + '.html'``. Each URL is
    printed once it has been opened successfully.

    Fetching stops early when a request raises ``HTTPError`` or when the
    downloaded page contains the site's "page could not be loaded" message.
    Listing fields that cannot be extracted keep the placeholder ``'NA'``.

    Returns:
        None. Results are accumulated in ``self.results``.
    """
    # NOTE(review): the upper bound is exclusive, so at most
    # max_n_of_pages_to_fetch - 1 pages are requested -- confirm this is intended.
    for page_num in range(1, self.max_n_of_pages_to_fetch):
        if page_num == 1:
            url = self.base_url + '.html'
        else:
            url = self.base_url + self.next_pages + str(page_num) + '.html'

        try:
            # The context manager closes the HTTP response even if reading fails.
            with urllib.request.urlopen(url) as resource:
                print(url)
                html = resource.read().decode('utf-8')
        except HTTPError:
            # On page not found error we exit the for loop. It means we
            # retrieved enough pages for today. This error most likely will
            # show up in other cases, for the time being we break out of the loop.
            break

        # re.search, not re.match: the message sits somewhere inside the HTML
        # document, never at its very first character.
        if re.search("Die Seite konnte leider nicht aufgerufen werden",
                     html, re.IGNORECASE):
            break

        matches = re.findall(
            r'<div class="grid resultlist-container">.+?'
            r'<div class="grid-item one-whole palm-hide">',
            html, re.DOTALL)

        for match in matches:
            link = 'NA'
            id_from_immo24 = 'NA'
            title = 'NA'
            address = 'NA'
            price = 'NA'
            area = 'NA'

            title_match = re.search(
                r'<div class="grid-item one-whole">\W+'
                r'<a class="resultlist-title" href="(.+)">(.+)</a>',
                match)
            if title_match:
                # Links are protocol-relative ("//www..."); drop the leading slashes.
                link = re.sub("^//", "", title_match.group(1))
                title = title_match.group(2)
                id_match = re.search(
                    r'www\.immobilienscout24\.de/expose/(\d+)', link)
                if id_match:
                    id_from_immo24 = id_match.group(1)

            address_match = re.search(
                r'<div class="resultlist-address">\W+<a href=".+">\W+(.+)\W+</a>',
                match)
            if address_match:
                address = address_match.group(1)

            price_match = re.search(
                r'<div class="grid-item palm-one-third lap-one-third desk-one-third">\W+'
                r'<span class="resultlist-value">(.+)</span>',
                match)
            if price_match:
                price = price_match.group(1).replace(" €", "")

            area_match = re.search(
                r'<div class="grid-item palm-one-third lap-one-third desk-one-third properties-padding">\W+'
                r'<span class="resultlist-value">\W*(.+?)\W*<span>',
                match)
            if area_match:
                area = area_match.group(1)

            offer = LandOffer(self.get_fetcher_name(), id_from_immo24, link,
                              title, address, price, area,
                              time.strftime("%Y-%m-%d"), False)
            self.results.append(offer)
def fetch(self):
    """Append a single hard-coded placeholder offer to ``self.results``."""
    # NOTE(review): LandOffer is given only four positional arguments here --
    # confirm this still matches the constructor's signature.
    collected = self.results
    fetcher_name = self.get_fetcher_name()
    collected.append(LandOffer(fetcher_name, "yeruu", "zzxsss", "10/4/2016"))
def fetch(self):
    """Append a single hard-coded placeholder offer to ``self.results``."""
    # NOTE(review): LandOffer is given only four positional arguments here --
    # confirm this still matches the constructor's signature.
    collected = self.results
    fetcher_name = self.get_fetcher_name()
    collected.append(LandOffer(fetcher_name, "asdfadfa", "fadsfa", "30/4/2016"))
def as_offer(dct):
    """Build a ``LandOffer`` from a mapping that holds one entry per offer field.

    The values are read by key and passed to ``LandOffer`` positionally, in
    the order listed in ``field_order``. A missing key propagates the lookup
    error raised by ``dct`` (``KeyError`` for a plain dict).
    """
    field_order = (
        'source',
        'id_as_from_source',
        'link',
        'title',
        'location',
        'price',
        'area',
        'date_retrieved',
        'was_updated',
    )
    return LandOffer(*[dct[key] for key in field_order])
#!/usr/bin/env python
"""Round-trip two sample LandOffer objects through JSON and print them.

The offers are dumped as pretty-printed JSON, printed, parsed back into
LandOffer objects via an ``object_hook`` and printed as tab-separated rows.
"""
import json
import time
from io import StringIO

from landprices.offer import LandOffer


def as_offer(dct):
    """Rebuild a ``LandOffer`` from the dict form of one serialized offer.

    Used below as the ``object_hook`` of ``json.load``; the keys read here
    must match the attribute names found in ``LandOffer.__dict__``.
    """
    return LandOffer(dct['source'], dct['id_as_from_source'], dct['link'],
                     dct['title'], dct['location'], dct['price'], dct['area'],
                     dct['date_retrieved'], dct['was_updated'])


offers = []
o = LandOffer("source", "id_from_source", "link", "title", "location",
              "price", "area", time.strftime("%Y-%m-%d"))
offers.append(o)
o1 = LandOffer("source1", "id_from_source1", "link1", "title1", "location1",
               "price1", "area1", time.strftime("%Y-%m-%d"))
offers.append(o1)

# Serialize through each instance's attribute dict.
offers_dict = [offer.__dict__ for offer in offers]

pretty_json_str = json.dumps(offers_dict, sort_keys=True, indent=4)
print(pretty_json_str)

# Named json_stream rather than io so the stdlib module name is not shadowed.
json_stream = StringIO(pretty_json_str)
read_back_offers = json.load(json_stream, object_hook=as_offer)

for offer in read_back_offers:
    fields = (offer.source, offer.id_as_from_source, offer.link, offer.title,
              offer.location, offer.price, offer.area, offer.date_retrieved,
              offer.was_updated)
    # was_updated is not set explicitly above, so it carries LandOffer's
    # default -- presumably a bool (confirm). Converting every field with
    # str() keeps the row from raising TypeError on non-string values.
    print("\t".join(str(field) for field in fields))
def fetch(self):
    """Append a single hard-coded placeholder offer to ``self.results``."""
    # NOTE(review): LandOffer is given only four positional arguments here --
    # confirm this still matches the constructor's signature.
    collected = self.results
    fetcher_name = self.get_fetcher_name()
    collected.append(LandOffer(fetcher_name, "gasgf", "fffff", "29/4/2016"))
from landprices.offer import LandOffer

# --- Case 1: merging two empty lists yields an empty list. ------------------
old_offers = []
new_offers = []
merged_offers = LandOffer.merge_offers(old_offers, new_offers)
assert len(merged_offers) == 0

# --- Case 2: with no new offers, every old offer is kept. -------------------
old = LandOffer(
    "source", "id_from_source", "link", "title", "location",
    "price", "area", "date_retrieved", False)
old1 = LandOffer(
    "source1", "id_from_source1", "link1", "title1", "location1",
    "price1", "area1", "date_retrieved1", False)
old2 = LandOffer(
    "source2", "id_from_source2", "link2", "title2", "location2",
    "price2", "area2", "date_retrieved2", False)
old3 = LandOffer(
    "source3", "id_from_source3", "link3", "title3", "location3",
    "price3", "area3", "date_retrieved3", False)
old_offers = [old, old1, old2, old3]
merged_offers = LandOffer.merge_offers(old_offers, new_offers)
assert len(merged_offers) == len(old_offers)

# --- Case 3: one unchanged, one changed and one brand-new offer. ------------
# `new` repeats `old` verbatim, `new1` differs from `old1` only in its price,
# and `new4` has no counterpart among the old offers.
new = LandOffer(
    "source", "id_from_source", "link", "title", "location",
    "price", "area", "date_retrieved", False)
new1 = LandOffer(
    "source1", "id_from_source1", "link1", "title1", "location1",
    "updated_price1", "area1", "date_retrieved1", False)
new4 = LandOffer(
    "source4", "id_from_source4", "link4", "title4", "location4",
    "price4", "area4", "date_retrieved4", False)
new_offers = [new, new1, new4]
merged_offers = LandOffer.merge_offers(old_offers, new_offers)

assert new1 in merged_offers
assert new4 in merged_offers
assert old1 not in merged_offers

# Any merged offer sharing new1's (source, id) pair must be flagged as updated.
for offer in merged_offers:
    if offer.source != new1.source:
        continue
    if offer.id_as_from_source != new1.id_as_from_source:
        continue
    assert offer.was_updated is True

# Only new4 adds an entry on top of the four old offers.
assert len(merged_offers) == len(old_offers) + 1

for o in merged_offers:
    row = o.to_tabbed_string()
    print(row)