Example 1
def main(argv):
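    """Crawl every city/region/property-type combination on realtylink,
    queue MLS numbers that need re-parsing, and refresh the seen timestamps
    of listings that are already up to date."""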
    add_count = 0
    
    for city_name, city_id in realtylink.cities.items():
        for region in realtylink.regions[city_name]:
            for property_type in (realtylink.TOWNHOUSE, realtylink.APARTMENT, realtylink.HOUSE):
        # Debugging alternative to the two loops above: a single region and type
        # for region in (22,):
        #     for property_type in (2,):
                log.info("Searching %s - %s for %s" % (city_name, region, property_type))
                
                results = realtylink.search(property_type=property_type,
                                            city=city_id,
                                            areas=[region])
                for mls, price in results:
                    # Normalize the listing price before comparing it with the stored value
                    normalized_price = pad_price(realtylink.fix_price(price))
                    update, result = needs_update(mls, normalized_price)
                    if update:
                        log.info("Queuing %s" % mls)
                        m = mls_queue.new_message(mls)
                        mls_queue.write(m)
                        add_count += 1
                    else:
                        result["last_seen"] = aws.get_iso_timestamp()
                        if "first_seen" not in result:
                            result["first_seen"] = aws.get_iso_timestamp()
                        result.save()
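                # Throttle: pause between successive searches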
                time.sleep(15)
    
    log.info("Added %s entries to the parse queue" % add_count)
Example 2
def main(argv):
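    """Worker loop: poll parse_queue for messages, fetch each listing's HTML
    from S3, parse it with realtylink.Listing, and write the result to SimpleDB."""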
    log.info("Starting parser")
    sleeper = aws.Sleeper(5)
    # Loop indefinitely, waiting for messages
    # If a message is available, grab the data to parse out of S3
    while True:
        m = parse_queue.read(visibility_timeout=10)
        if m is not None:
            sleeper.reset()
            message_data = simplejson.loads(m.get_body())
                
            log.info("Processing %s with timestamp %s", message_data["mls"], message_data["date"])
            if aws.mls_exists(mls_domain, message_data["mls"], message_data["date"]):
                log.info("already exists")
                # Drop the message so it is not redelivered once the visibility timeout expires
                parse_queue.delete_message(m)
                continue
            listing_key = bucket.get_key(message_data["key"])
            listing_html = listing_key.get_contents_as_string()
            # Parse it
            listing = realtylink.Listing(message_data["mls"], listing_html)
            
            # TODO: Make this more efficient by using the result from above
            listing_item = aws.mls_exists(mls_domain, message_data["mls"])
            if not listing_item:
                # And insert it into SimpleDB
                listing_item = mls_domain.new_item(hash(message_data["mls"]))
                listing_item["mls"] = listing.mls
                listing_item["description"] = listing.description[:1023]
                listing_item["area"] = listing.area
                listing_item["type"] = listing.type
                listing_item["bedrooms"] = listing.bedrooms
                listing_item["bathrooms"] = listing.bathrooms
                listing_item["age"] = listing.age
                listing_item["maintenance_fee"] = listing.maintenance_fee
                listing_item["features"] = listing.features
                listing_item["address"] = listing.address
                listing_item["region"] = listing.region
                listing_item["city"] = listing.city
                listing_item["unit"] = listing.unit
                listing_item["last_seen"] = aws.get_iso_timestamp()
                if "first_seen" not in listing_item:
                    listing_item["first_seen"] = aws.get_iso_timestamp()
            listing_item.add_value("prices", (listing.price, message_data["date"]))
            log.debug(listing_item)
            # Persist the item, then remove the processed message from the queue
            listing_item.save()
            
            parse_queue.delete_message(m)
        else:
            log.info("Sleeping")
            sleeper.sleep()
Example 3
def get_key(mls):
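    """Return the S3 key under which a listing snapshot is stored: "<mls>/<ISO timestamp>.html"."""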
    timestamp = aws.get_iso_timestamp()
    return "%s/%s.html" % (mls, timestamp)