def _populate_listing_item(listing_item, listing, date):
    """Copy the parsed listing's attributes onto the SimpleDB item.

    listing_item -- item to update in place (existing or newly created)
    listing      -- parsed listing whose attributes are copied
    date         -- timestamp string recorded alongside the listing price
    """
    listing_item["mls"] = listing.mls
    # Description is cut to 1023 characters.
    # NOTE(review): presumably to stay under SimpleDB's attribute value
    # size limit -- confirm.
    listing_item["description"] = listing.description[:1023]
    for field in ("area", "type", "bedrooms", "bathrooms", "age",
                  "maintenance_fee", "features", "address", "region",
                  "city", "unit"):
        listing_item[field] = getattr(listing, field)

    listing_item["last_seen"] = aws.get_iso_timestamp()
    # Only stamp first_seen once, so it survives later updates of the item.
    if "first_seen" not in listing_item:
        listing_item["first_seen"] = aws.get_iso_timestamp()
    listing_item.add_value("prices", (listing.price, date))


def main(argv):
    """Poll the parse queue forever and store parsed listings.

    For every queue message: load the JSON body, skip listings that are
    already recorded for that date, otherwise fetch the listing HTML from
    the bucket, parse it, create or update the matching item in the MLS
    domain, and delete the message. When the queue is empty the loop backs
    off via the sleeper.

    argv -- command-line arguments; accepted but not used here.

    This function never returns.
    """
    log.info("Starting parser")
    sleeper = aws.Sleeper(5)

    # Loop indefinitely, waiting for messages.
    while True:
        m = parse_queue.read(visibility_timeout=10)
        if m is None:
            log.info("Sleeping")
            sleeper.sleep()
            continue

        sleeper.reset()
        message_data = simplejson.loads(m.get_body())
        mls = message_data["mls"]
        date = message_data["date"]
        log.info("Processing %s with timestamp %s", mls, date)

        if aws.mls_exists(mls_domain, mls, date):
            log.info("already exists")
            # Nothing left to do for this message, so remove it. Without
            # the delete it would become visible again once the visibility
            # timeout expires and be re-read over and over.
            parse_queue.delete_message(m)
            continue

        # Grab the data to parse out of S3 and parse it.
        listing_key = bucket.get_key(message_data["key"])
        listing_html = listing_key.get_contents_as_string()
        listing = realtylink.Listing(mls, listing_html)

        # TODO: Make this more efficient by using the result from above
        listing_item = aws.mls_exists(mls_domain, mls)
        if not listing_item:
            # No item for this MLS number yet: create one in SimpleDB.
            # NOTE(review): the item name comes from hash(), which is not
            # guaranteed stable across interpreters/platforms -- kept as is
            # so existing items keep their names.
            listing_item = mls_domain.new_item(hash(mls))

        _populate_listing_item(listing_item, listing, date)
        log.debug(listing_item)

        # Persist first; the message is only removed after a successful save.
        listing_item.save()
        parse_queue.delete_message(m)
def main(argv):
    """Queue a parse request for every bucket entry not yet recorded.

    Each key in the bucket is split on "/" into an MLS number and a file
    name; the date is the part of the file name before the first ".".
    Entries for which aws.mls_exists() reports nothing get a JSON message
    (mls, key, bucket, date) written to the parse queue.

    argv -- command-line arguments; accepted but not used here.
    """
    for entry in bucket.list():
        # Keys must contain exactly one "/", otherwise the unpacking raises.
        mls, file_name = entry.key.split("/")
        date = file_name.split(".")[0]
        print("%s %s" % (mls, date))

        if aws.mls_exists(mls_domain, mls, date):
            continue

        print("Need to parse %s" % mls)
        payload = {"mls":mls, "key":entry.name, "bucket":bucket.name, "date":date}
        request_msg = parse_queue.new_message(simplejson.dumps(payload))
        parse_queue.write(request_msg)