# NOTE: Python 2 code; it relies on itertools.ifilter, a tuple-unpacking
# lambda, and the pywin32 COM bridge to Internet Explorer.
from datetime import datetime
from itertools import ifilter
from json import loads
from re import findall, search
from time import sleep

from win32com.client import Dispatch  # pywin32; drives the 'htmlfile' COM object

import settings
# Log, Yad2Client, ApartmentDatabase, PageParser and haversine_distance are
# project-local helpers assumed to be importable alongside this module.


class Yad2Crawler(object):
    """Polls the Yad2 mobile API for apartment listings, filters them against
    the configured locations and criteria, and notifies about new matches."""

    def __init__(self):
        self.log = Log()
        self.client = Yad2Client()
        self.notifier = settings.notifier(**settings.notifier_settings)
        self.db = ApartmentDatabase('yad2.db')
        # Seed a dummy PRID cookie; the real value is computed on demand.
        self.client.add_cookie('PRID', 'xx')
        self.apartment_type = ['Private']
        if not settings.crawl_filter['noRealEstate']:
            self.apartment_type.append('Trade')

    def get_prid(self, html):
        """Run the site's genPid() anti-bot script inside IE's 'htmlfile' COM
        object and read the resulting PRID out of an injected <meta> tag."""
        hf = Dispatch('htmlfile')
        hf.writeln(html + "\n<script>document.write(\"<meta name='PRID' content='\" +genPid()+ \"'>\")</script>")
        prid = next(ifilter(lambda m: m.name == 'PRID', hf.getElementsByTagName('meta')), None)
        return prid.content if prid else None

    def get_prlst(self, html):
        """Decode the String.fromCharCode(..) sequences embedded in the page."""
        prlst = ''
        for m in findall(r'String\.fromCharCode\(\d*\)', html):
            match = search(r'\d+', m)
            if match:
                prlst += chr(int(match.group()))
        return prlst

    def get_page(self, page=1):
        """Fetch one page of search results from the mobile API."""
        url = "http://m.yad2.co.il/API/MadorResults.php"

        def transform_web_to_web_values(val):
            # The API expects booleans as 0/1 query parameters.
            if isinstance(val, bool):
                return 1 if val else 0
            return val

        fix_parameters = {
            k: transform_web_to_web_values(v)
            for k, v in settings.crawl_parameters.items()
            if v is not None
        }
        fix_parameters["Page"] = page
        return self.client.get_url(url, args=fix_parameters)

    def crawl_apartments(self, results):
        """Filter a page of ads and store/notify any new matches."""
        for apr in results:
            if apr['Type'] != 'Ad':
                continue
            if not all(key in apr for key in ('latitude', 'longitude', 'RecordID', 'URL',
                                              'img', 'Line1', 'Line2', 'Line3', 'Line4')):
                continue
            latitude = apr['latitude']
            longitude = apr['longitude']
            record_id = apr['RecordID']
            url = apr['URL']
            img = apr['img']
            address = apr['Line1']
            description = apr['Line2']
            price = apr['Line3']
            date = apr['Line4']
            self.log.debug(".. Checking %s", record_id)
            if settings.crawl_filter['onlyWithPhoto'] and "missingAdPic.jpg" in img:
                self.log.debug(".. Filtering for missing img")
                continue
            # Match the ad against the configured (lat, lon, radius, name) areas.
            area = next(
                ifilter(lambda (lat, lon, r, name): haversine_distance(
                    (latitude, longitude), (lat, lon)) <= r, settings.LOCATIONS),
                None)
            if not area:
                self.log.debug(".. Filtering for no matching area")
                continue
            area_name = area[3]
            if (datetime.now() - datetime.strptime(date, "%d-%m-%Y")).days > settings.crawl_filter['maxAge']:
                self.log.debug(".. Filtering for old update date")
                continue
            if self.db.id_exists(record_id):
                self.log.debug(".. Already exists in database")
                self.db.update_last_seen(record_id)
                continue
            self.log.info(".. Found new match %s at %s", record_id, area_name)
            self.notify_apartment(url, address, area_name)
            self.db.add_new(record_id, area_name, address, description, price, url)
            self.log.debug(".. Added to database")
            self.log.debug(".. OK")

    def notify_apartment(self, url, description, area):
        self.log.debug(".. Sending notification")
        data = self.get_apartment_page(url)
        self.notifier.send_notification(url, description, area, data)

    def get_apartment_page(self, url):
        """Fetch an ad page, retrying until the anti-bot challenge is passed."""
        errors = True
        prid = None
        self.log.debug(".. Getting page %s", url)
        while errors:
            html = self.client.get_url(url)
            if "Please activate javascript to view this site" in html:
                self.log.debug(".... Using IE to calculate PRID")
                prid = self.get_prid(html)
            elif "bot, spider, crawler" in html:
                self.log.debug(".... Clearing cookies")
                self.client.clear_cookies()
            else:
                prlst = self.get_prlst(html)
                prid = prlst if prlst != '' else None
                errors = False
            if prid:
                self.log.debug(".... Setting PRID=%s", prid)
                self.client.add_cookie('PRID', prid)
        return self.create_apartment_body(html, url)

    def create_apartment_body(self, html, url):
        pp = PageParser(html)
        return pp.create_apartment_page(url)

    def crawl(self):
        """Main loop: scan all result pages, then sleep and repeat."""
        while True:
            try:
                self.log.info("Starting scan")
                page = 1
                more = True
                while more:
                    self.log.info("Requesting page #%d", page)
                    data = self.get_page(page)
                    results = loads(data)
                    for apartment_type in filter(
                            lambda t: t in results and 'Results' in results[t],
                            self.apartment_type):
                        self.log.info(".. Checking %s apartments...", apartment_type)
                        self.crawl_apartments(results[apartment_type]['Results'])
                    more = results['MoreResults'] == 1
                    page += 1
                self.log.info("Scan ended, going to sleep (%d min)",
                              settings.ITERATION_SLEEP_SEC / 60)
                sleep(settings.ITERATION_SLEEP_SEC)
            except RuntimeError as e:
                self.log.error(e)
                break
            except Exception as e:
                self.log.error(e)
        self.notifier.finalize()
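
# --- Usage sketch (illustrative, not part of the original class) -------------
# A minimal sketch of how this crawler would be wired up, assuming a
# settings.py shaped like the names referenced above. All concrete values
# below are assumptions, not the project's real configuration:
#
#   notifier = MailNotifier                  # any class with send_notification() / finalize()
#   notifier_settings = {'user': '...', 'password': '...'}
#   crawl_parameters = {'CatID': 2, 'SubCatID': 2, 'AreaID': 1, 'PriceType': 0}
#   crawl_filter = {'noRealEstate': False, 'onlyWithPhoto': True, 'maxAge': 14}
#   LOCATIONS = [(32.08, 34.78, 2.0, 'Tel Aviv')]  # (lat, lon, radius, name)
#   ITERATION_SLEEP_SEC = 1800
if __name__ == '__main__':
    crawler = Yad2Crawler()
    crawler.crawl()  # runs until a RuntimeError breaks the loop, then finalizes the notifier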