def main(argv):
    """Search every city/region/property-type combination and queue stale listings.

    For each ``(mls, price)`` pair returned by ``realtylink.search``,
    ``needs_update`` decides whether the listing has to be (re)fetched:

    * if so, the MLS number is written to ``mls_queue``;
    * otherwise the returned record gets a fresh ``last_seen`` timestamp
      (and a ``first_seen`` one if it has none yet) and is saved.

    The function sleeps 15 seconds after each search and finally logs how
    many entries were queued.

    ``argv`` is accepted but not used by this function.
    """
    queued = 0
    property_types = (realtylink.TOWNHOUSE, realtylink.APARTMENT, realtylink.HOUSE)

    # Debugging tip carried over from the original: narrow the loops below to
    # a single region / property type, e.g. ``(22,)`` and ``(2,)``.
    for city_name, city_id in realtylink.cities.items():
        for region in realtylink.regions[city_name]:
            for property_type in property_types:
                log.info("Searching %s - %s for %s" % (city_name, region, property_type))
                listings = realtylink.search(
                    property_type=property_type,
                    city=city_id,
                    areas=[region],
                )

                for mls, price in listings:
                    # NOTE(review): this value is never used below, and
                    # needs_update() receives the raw price. The call is kept
                    # because it may raise on a malformed price -- confirm
                    # whether needs_update() was meant to get this instead.
                    normalized_price = pad_price(realtylink.fix_price(price))

                    stale, record = needs_update(mls, price)
                    if not stale:
                        # Still listed and unchanged: just refresh the sighting times.
                        record["last_seen"] = aws.get_iso_timestamp()
                        if "first_seen" not in record:
                            record["first_seen"] = aws.get_iso_timestamp()
                        record.save()
                        continue

                    log.info("Queuing %s" % mls)
                    message = mls_queue.new_message(mls)
                    mls_queue.write(message)
                    queued += 1

                # Pause between searches (presumably to throttle requests
                # against the remote site -- confirm).
                time.sleep(15)

    log.info("Added %s entries to the parse queue" % queued)
def main(argv):
    """Run the parser loop: turn queued listing pages into SimpleDB items.

    Loops forever reading ``parse_queue``. Each message body is JSON with
    the keys ``mls``, ``date`` and ``key``; ``key`` names the object in
    ``bucket`` (S3) that holds the listing HTML. For each message:

    1. If ``aws.mls_exists`` reports that this MLS number already has an
       entry for ``date``, the message is redundant: it is deleted and
       skipped.
    2. Otherwise the HTML is fetched and parsed with
       ``realtylink.Listing``, the matching item in ``mls_domain`` is
       looked up (or created), its attributes are refreshed, the
       ``(price, date)`` pair is appended to ``prices``, the item is saved
       and the message is deleted.

    When the queue is empty the loop backs off via ``aws.Sleeper``.

    ``argv`` is accepted but not used by this function. The function never
    returns normally.
    """
    log.info("Starting parser")
    sleeper = aws.Sleeper(5)

    # Loop indefinitely, waiting for messages.
    while True:
        m = parse_queue.read(visibility_timeout=10)
        if m is None:
            log.info("Sleeping")
            sleeper.sleep()
            continue

        sleeper.reset()
        message_data = simplejson.loads(m.get_body())
        mls = message_data["mls"]
        date = message_data["date"]
        log.info("Processing %s with timestamp %s", mls, date)

        if aws.mls_exists(mls_domain, mls, date):
            log.info("already exists")
            # Remove the redundant message. Without this it becomes visible
            # again once the 10 second visibility timeout expires and is
            # re-read and re-skipped for as long as the queue retains it.
            parse_queue.delete_message(m)
            continue

        # Grab the data to parse out of S3.
        listing_key = bucket.get_key(message_data["key"])
        listing_html = listing_key.get_contents_as_string()

        # Parse it.
        listing = realtylink.Listing(mls, listing_html)

        # TODO: Make this more efficient by using the result from above
        listing_item = aws.mls_exists(mls_domain, mls)
        if not listing_item:
            # First time this MLS number is seen: create its SimpleDB item.
            listing_item = mls_domain.new_item(hash(mls))

        listing_item["mls"] = listing.mls
        # Description is truncated to 1023 characters (presumably to stay
        # within the SimpleDB attribute size limit -- confirm).
        listing_item["description"] = listing.description[:1023]
        listing_item["area"] = listing.area
        listing_item["type"] = listing.type
        listing_item["bedrooms"] = listing.bedrooms
        listing_item["bathrooms"] = listing.bathrooms
        listing_item["age"] = listing.age
        listing_item["maintenance_fee"] = listing.maintenance_fee
        listing_item["features"] = listing.features
        listing_item["address"] = listing.address
        listing_item["region"] = listing.region
        listing_item["city"] = listing.city
        listing_item["unit"] = listing.unit
        listing_item["last_seen"] = aws.get_iso_timestamp()
        if "first_seen" not in listing_item:
            listing_item["first_seen"] = aws.get_iso_timestamp()
        listing_item.add_value("prices", (listing.price, date))
        log.debug(listing_item)

        # Don't save it or delete the message while debugging
        listing_item.save()
        parse_queue.delete_message(m)
def get_key(mls):
    """Return a storage key of the form ``<mls>/<timestamp>.html``.

    The timestamp is whatever ``aws.get_iso_timestamp()`` returns at call
    time, so repeated calls for the same ``mls`` may yield different keys.
    """
    return "%s/%s.html" % (mls, aws.get_iso_timestamp())