def scrape_area(area):
    """
    Pull the newest Craigslist housing results for one area and store the unseen ones.

    Each result whose id is not stored yet and which carries a "where" string
    is added to ``session`` as a ``Listing`` and committed.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty "bart"
        or "area" value.
    """
    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters={
            'max_price': settings.MAX_PRICE,
            "min_price": settings.MIN_PRICE,
            'bedrooms': settings.BEDROOMS,
            'min_ft2': settings.MIN_FT2,
            "bathrooms": settings.BATHROOMS,
        },
    )
    postings = housing.get_results(sort_by='newest', geotagged=True, limit=20)
    matches = []

    while True:
        try:
            post = next(postings)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        # Results already stored under this Craigslist id are not processed again.
        if session.query(Listing).filter_by(cl_id=post["id"]).first() is not None:
            continue

        # Without a "where" string the result is dropped.
        if post["where"] is None:
            continue

        geotag = post["geotag"]
        if geotag is None:
            lat = lon = 0
            post["area"] = ""
            post["bart"] = ""
        else:
            lat, lon = geotag[0], geotag[1]
            # Merge in whatever area / point-of-interest data the helper returns.
            post.update(find_points_of_interest(geotag, post["where"]))

        # The price falls back to 0 whenever it cannot be converted.
        try:
            price = float(post["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=post["url"],
            created=parse(post["datetime"]),
            lat=lat,
            lon=lon,
            name=post["name"],
            price=price,
            location=post["where"],
            cl_id=post["id"],
            area=post["area"],
            bart_stop=post["bart"],
        )
        # Persist right away so the same result is not picked up again.
        session.add(record)
        session.commit()

        has_bart = len(post["bart"]) > 0
        if has_bart or len(post["area"]) > 0:
            matches.append(post)

    return matches
def scrape_area(area):
    """
    Pull the newest Craigslist housing results for one area and store the unseen ones.

    Each result whose id is not stored yet and which carries a "where" string
    is added to ``session`` as a ``Listing`` and committed.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty "ttc"
        or "area" value.
    """
    price_filters = {"max_price": settings.MAX_PRICE, "min_price": settings.MIN_PRICE}
    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=price_filters,
    )
    postings = housing.get_results(sort_by="newest", geotagged=True, limit=20)
    matches = []

    while True:
        try:
            post = next(postings)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        # Skip results that are already stored under this Craigslist id.
        stored = session.query(Listing).filter_by(cl_id=post["id"]).first()
        if stored is not None:
            continue

        # A result with no "where" string is dropped.
        if post["where"] is None:
            continue

        geotag = post["geotag"]
        if geotag is None:
            lat = lon = 0
            post["area"] = ""
            post["ttc"] = ""
        else:
            lat, lon = geotag[0], geotag[1]
            # Merge in whatever area / point-of-interest data the helper returns.
            post.update(find_points_of_interest(geotag, post["where"]))

        # The price falls back to 0 whenever it cannot be converted.
        try:
            price = float(post["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=post["url"],
            created=parse(post["datetime"]),
            lat=lat,
            lon=lon,
            name=post["name"],
            price=price,
            location=post["where"],
            cl_id=post["id"],
            area=post["area"],
            ttc_stop=post["ttc"],
        )
        # Persist right away so the same result is not picked up again.
        session.add(record)
        session.commit()

        has_ttc = len(post["ttc"]) > 0
        if has_ttc or len(post["area"]) > 0:
            matches.append(post)

    return matches
def scrape_area(area):
    """
    Pull the newest Craigslist housing results for one area and store the unseen ones.

    Each result whose id is not stored yet and which carries a "where" string
    is added to ``session`` as a ``Listing`` and committed.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty "lrt" value.
    """
    price_filters = {'max_price': settings.MAX_PRICE, "min_price": settings.MIN_PRICE}
    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=price_filters,
    )
    postings = housing.get_results(sort_by='newest', geotagged=True, limit=20)
    matches = []

    while True:
        try:
            post = next(postings)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        # Skip results that are already stored under this Craigslist id.
        stored = session.query(Listing).filter_by(cl_id=post["id"]).first()
        if stored is not None:
            continue

        # A result with no "where" string is dropped.
        if post["where"] is None:
            continue

        geotag = post["geotag"]
        if geotag is None:
            lat = lon = 0
            post["area"] = ""
            post["lrt"] = ""
        else:
            lat, lon = geotag[0], geotag[1]
            # Merge in whatever area / point-of-interest data the helper returns.
            post.update(find_points_of_interest(geotag, post["where"]))

        # The price falls back to 0 whenever it cannot be converted.
        try:
            price = float(post["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=post["url"],
            created=parse(post["datetime"]),
            lat=lat,
            lon=lon,
            name=post["name"],
            price=price,
            location=post["where"],
            cl_id=post["id"],
            area=post["area"],
            lrt_stop=post["lrt"],
        )
        # Persist right away so the same result is not picked up again.
        session.add(record)
        session.commit()

        # Only results with a non-empty "lrt" value are handed back.
        if len(post["lrt"]) > 0:
            matches.append(post)

    return matches
def scrape_area(area):
    """
    Scrape the newest Craigslist housing results for an area, write each unseen
    one to the Google sheet and store it as a ``Listing``.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of summary dicts, one per newly stored result that either
        has a non-empty "area" value or a latitude of 0 (no geotag).
    """
    # Get the Google sheet object.
    sheet = google_sheets.open_sheet()

    cl_h = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=settings.FILTERS,
    )

    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=1000)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            print('exception')
            continue

        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        # NOTE(review): the source had lost its indentation; the "Skipping"
        # message is taken to belong to already-stored listings -- confirm.
        if listing is not None:
            print("Skipping %s..." % result['name'])
            continue

        # Results without a "where" string are kept; it is stored as-is.
        lat = 0
        lon = 0
        if result["geotag"] is not None:
            # Assign the coordinates.
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Annotate the result with information about the area it's in and
            # points of interest near it.
            geo_data = find_points_of_interest(result["geotag"], result["where"])
            result.update(geo_data)
        else:
            # Placeholder values for everything the geo lookup would have set.
            result["area"] = ""
            result["google_stop"] = ""
            result["google_dist"] = ""
            result["fb_stop"] = ""
            result["fb_dist"] = ""
            result["fb_walktime"] = "Unknown"
            result["google_walktime"] = "Unknown"
            result["adi_drivetime"] = "Unknown"
            result["address"] = "Unknown"

        # Try parsing the price; it stays 0 when that fails.
        price = 0
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            pass

        # Include the result if it is within our area or if there is no
        # location information.
        should_include = len(result["area"]) > 0 or lat == 0

        # Create the listing object.
        listing = Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            should_include=should_include,
        )

        # Flat summary of the result; this is what goes to the sheet and what
        # is returned to the caller.
        result_to_return = {
            'link': result["url"],
            'created': parse(result["datetime"]),
            'lat': lat,
            'lon': lon,
            'name': result["name"],
            'price': price,
            'location': result["where"],
            'cl_id': result["id"],
            'tagged_location': result["area"],
            'should_include': should_include,
            'available_date': "-",
            'bedrooms': 0,
            'bathrooms': 0,
            'amenities': "-",
            'sq_ft': 0,
            'google_stop': result['google_stop'],
            'google_dist': result['google_dist'],
            'fb_stop': result['fb_stop'],
            'fb_dist': result['fb_dist'],
            'fb_walktime': result['fb_walktime'],
            'google_walktime': result['google_walktime'],
            'adi_drivetime': result['adi_drivetime'],
            'address': result['address'],
        }

        print("Adding %s..." % result['name'])
        # Every new result is written to the sheet, included or not.
        google_sheets.add_new_record(sheet, result_to_return)

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        if should_include:
            results.append(result_to_return)

    return results
def scrape_area(area):
    """
    Pull the newest Craigslist housing results for one area and store the unseen ones.

    Geotagged results are additionally checked against a walking-duration
    limit; results above the limit are dropped without being stored.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty "bart"
        or "area" value.
    """
    search_filters = {'max_price': settings.MAX_PRICE, 'min_price': settings.MIN_PRICE}

    # Optional filters are only added when the corresponding setting is truthy.
    if settings.MIN_BEDROOMS:
        print('filtering by bedrooms:', settings.MIN_BEDROOMS)
        search_filters['bedrooms'] = settings.MIN_BEDROOMS
    if settings.NEIGHBORHOOD_CODE:
        print('filtering by neighborhood:', settings.NEIGHBORHOOD_CODE)
        search_filters['neighborhood_code'] = settings.NEIGHBORHOOD_CODE

    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=search_filters,
    )
    gmap = GoogleMapsClient()
    postings = housing.get_results(sort_by='newest', geotagged=True, limit=50)
    matches = []

    while True:
        try:
            post = next(postings)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        # Skip results that are already stored under this Craigslist id.
        stored = session.query(Listing).filter_by(cl_id=post["id"]).first()
        if stored is not None:
            continue

        # A result with no "where" string is dropped.
        if post["where"] is None:
            continue

        geotag = post["geotag"]
        if geotag is None:
            lat = lon = 0
            post["area"] = ""
            post["bart"] = ""
        else:
            lat, lon = geotag[0], geotag[1]
            # Merge in whatever area / point-of-interest data the helper returns.
            post.update(find_points_of_interest(geotag, post["where"]))

            # Walking distances are measured from the result's coordinates.
            origin = {'lat': post["geotag"][0], 'lng': post["geotag"][1]}
            shuttle_min = gmap.min_walking_dist(origin, settings.SHUTTLE_STOPS)
            if shuttle_min['duration']['value'] > settings.MAX_WALKING_TIME:
                # Above the configured maximum: drop the result without storing it.
                continue
            business_min = gmap.min_walking_dist(origin, settings.BUSINESSES)
            shuttle_text = gmap.pretty(shuttle_min)
            business_text = gmap.pretty(business_min)
            post["bart_dist"] = 'apple: {}, business: {}'.format(shuttle_text, business_text)

        # The price falls back to 0 whenever it cannot be converted.
        try:
            price = float(post["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=post["url"],
            created=parse(post["datetime"]),
            lat=lat,
            lon=lon,
            name=post["name"],
            price=price,
            location=post["where"],
            cl_id=post["id"],
            area=post["area"],
            bart_stop=post["bart"],
        )
        # Persist right away so the same result is not picked up again.
        session.add(record)
        session.commit()

        has_bart = len(post["bart"]) > 0
        if has_bart or len(post["area"]) > 0:
            matches.append(post)

    return matches
def scrape(site, area, category, min_price, max_price):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest
    listings.

    Every result whose id is not stored yet and which has a "where" string is
    annotated via ``find_points_of_interest``, added to ``session`` as a
    ``Listing`` and committed.

    :param site: Forwarded as the ``site`` argument of ``CraigslistHousing``.
    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :param category: Forwarded as the ``category`` argument of
        ``CraigslistHousing``.
    :param min_price: Value of the 'min_price' search filter.
    :param max_price: Value of the 'max_price' search filter.
    :return: A list of the newly stored results that pass the shuttle walk
        time, neighborhood, image and ``desirable`` checks.
    """
    results = []

    cl_h = CraigslistHousing(
        site=site,
        area=area,
        category=category,
        filters={'min_price': min_price, 'max_price': max_price},
    )
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=20)

    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        listing = session.query(Listing).filter_by(cl_id=result['id']).first()

        # Don't store the listing if it already exists.
        if listing is not None:
            continue

        if result['where'] is None:
            # If there is no string identifying which neighborhood the
            # result is from, skip it.
            continue

        # Annotate the result with information about the area it's in and
        # points of interest near it.
        # NOTE(review): this runs even when the geotag is None -- confirm the
        # helper copes with that.
        result.update(
            find_points_of_interest(result['geotag'], result['where'])
        )

        lat = 0
        lon = 0
        if result['geotag'] is not None:
            # Assign the coordinates.
            lat = result['geotag'][0]
            lon = result['geotag'][1]

        # Try parsing the price; it stays 0 when that fails.  A missing price
        # (None) makes ``.replace`` raise AttributeError, so that is caught
        # too instead of aborting the whole scrape.
        price = 0
        try:
            price = float(result['price'].replace('$', ''))
        except (AttributeError, TypeError, ValueError):
            pass

        # Create the listing object.
        listing = Listing(
            link=result['url'],
            created=parse(result['datetime']),
            geotag=str(result['geotag']),
            lat=lat,
            lon=lon,
            name=result['name'],
            price=price,
            location=result['where'],
            cl_id=result['id'],
            neighborhood=result['neighborhood'],
            transit_stop=result['transit_stop'],
            shuttle_stop=result['shuttle_stop'],
        )

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        # Return the result if it's near a shuttle stop and in a
        # desired neighborhood. Adjust requirements to your liking.
        if (result['shuttle_walk_time'] < settings.MAX_SHUTTLE_WALK_TIME
                and len(result['neighborhood']) > 0
                and result['has_image']
                and desirable(result['url'])):
            results.append(result)

    return results
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.

    Every result whose id is not stored yet and which has a "where" string is
    added to ``session`` as a ``Listing`` (including walk score data) and
    committed.

    :param area: Forwarded as the ``area`` argument of ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty "bart"
        or "area" value.
    """
    cl_h = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters={
            'max_price': settings.MAX_PRICE,
            "min_price": settings.MIN_PRICE,
        },
    )

    results = []
    # geotagged=False: geotags are fetched below, only for unseen results.
    gen = cl_h.get_results(sort_by='newest', geotagged=False, limit=35)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Any other error raised while fetching the next result is ignored.
            continue

        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is not None:
            continue

        # From here on we only deal with listings that we haven't checked in
        # the past.  The geotag is fetched now rather than up front for all
        # listings because it requires an additional GET, which is
        # unnecessary if we have already checked the listing.
        if result['has_map']:
            cl_h.geotag_result(result)

        if result["where"] is None:
            # If there is no string identifying which neighborhood the result
            # is from, skip it.
            continue

        lat = 0
        lon = 0
        # Defaults used when there is no geotag to look anything up with.
        result["bart_dist"] = 999
        result["walkscore"] = 0
        result["ws_link"] = "Not found!"
        if result["geotag"] is not None:
            # Assign the coordinates.
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Annotate the result with information about the area it's in and
            # points of interest near it.
            geo_data = find_points_of_interest(result["geotag"], result["where"])
            result.update(geo_data)
            result["walkscore"], result['ws_link'] = get_walk_score(
                result["geotag"])
        else:
            result["area"] = ""
            result["bart"] = ""

        # Try parsing the price; it stays 0 when that fails.
        price = 0
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            pass

        # Store an empty string instead of None for a missing image URL.
        if result["img_url"] is None:
            result["img_url"] = ""

        # Create the listing object.
        listing = Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            bart_stop=result["bart"],
            walkscore=result["walkscore"],
            ws_link=result["ws_link"],
            bart_dist=result["bart_dist"],
            img_url=result["img_url"],
        )

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        # Return the result if it's near a bart station, or if it is in an
        # area we defined.
        if len(result["bart"]) > 0 or len(result["area"]) > 0:
            # Single-argument print calls behave the same on Python 2 and 3.
            print(result["name"])
            print(result["walkscore"])
            print(result["ws_link"])
            results.append(result)

    return results
def scrape_area(area): """ Scrapes craigslist for a certain geographic area, and finds the latest listings. :param area: :return: A list of results. """ cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area, category=settings.CRAIGSLIST_HOUSING_SECTION, filters={'max_price': settings.MAX_PRICE, 'min_price': settings.MIN_PRICE}) results = [] gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=30) while True: try: result = next(gen) except StopIteration: break except Exception, e: print str(e) continue #listing = session.query(Listing).filter_by(cl_id=result["id"]).first() # Check for listing listing = listing_collections.find_one({"id": result["id"]}) # Don't store the listing if it already exists. if listing is None: if result["where"] is None: # If there is no string identifying which neighborhood the result is from, skip it. continue # Check for multiple bedrooms if result["name"]: _name = result["name"].lower() match = re.search(r'2 bed.*', _name) if not match: continue lat = 0 lon = 0 if result["geotag"] is not None: # Assign the coordinates. lat = result["geotag"][0] lon = result["geotag"][1] # Annotate the result with information about the area it's in and points of interest near it. geo_data = find_points_of_interest(result["geotag"], result["where"]) result.update(geo_data) elif result["where"]: coords_box = get_coordinates(result["where"]) geo_data = find_points_of_interest(coords_box, result["where"]) result.update(geo_data) else: result["area"] = "" # Try parsing the price. price = 0 try: price = float(result["price"].replace("$", "")) except Exception: pass listing = { "link": result["url"], "created": parse(result["datetime"]), "lat": lat, "lon": lon, "name": result["name"], "price": price, "location": result["where"], "cl_id": result["id"], "area": result["area"] } # Insert into the collection... 
_listing_id = listing_collections.insert_one(listing) # If area is found return the results if len(result["area"]) > 0: results.append(result)