def process_craigslist():
    """Pull housing data from craigslist and dump in-bounds matches to JSON.

    Results are bucketed by the named bounding areas in
    ``SearchConfig.BOUNDING_AREAS`` and written to ``SearchConfig.OUTFILE``.
    """
    from craigslist import CraigslistHousing

    cl_housing = CraigslistHousing(site=SearchConfig.SITE,
                                   area=SearchConfig.AREA,
                                   category=SearchConfig.CATEGORY,
                                   filters=SearchConfig.FILTERS)
    results = cl_housing.get_results(sort_by='newest', geotagged=True,
                                     limit=20)

    valid_results = {}
    for result in results:
        geotag = result["geotag"]
        for location, coords in SearchConfig.BOUNDING_AREAS.items():
            # setdefault replaces the manual "if location not in dict" check.
            bucket = valid_results.setdefault(location, [])
            if geotag and in_box(geotag, coords):
                bucket.append(result)
                break

    with open(SearchConfig.OUTFILE, 'w') as outfile:
        json.dump(valid_results, outfile, indent=2)
def main():
    """Poll Craigslist for today's SF listings, store new ones, and notify.

    Skips listings already in the database and listings in blacklisted
    neighborhoods (the latter are stored but not notified).
    """
    # Get the data from Craigslist.
    housing = CraigslistHousing(site='sfbay', area='sfc', category='apa',
                                filters={
                                    'posted_today': True,
                                    'min_price': settings.min_price,
                                    'max_price': settings.max_price,
                                    'min_bedrooms': settings.min_bedrooms
                                })
    log.info('Retrieving listings')
    for result in housing.get_results(sort_by='newest', geotagged=True):
        # Build a `listing` dict with only the fields we care about.
        listing = {}
        listing['craigslist_id'] = result['id']
        listing['craigslist_url'] = result['url']
        listing['posted_on'] = result['datetime']
        listing['description'] = result['name']
        # Price always has a leading '$', so strip the first character.
        listing['price'] = int(result['price'][1:])
        # 'where' is sometimes null.
        listing['neighborhood'] = str.lower(result['where']) if result['where'] else ''
        listing['num_bedrooms'] = result['bedrooms']
        listing['sqft'] = result['area']
        listing['latitude'] = result['geotag'][0]
        listing['longitude'] = result['geotag'][1]

        # Decide if we want to notify about this listing.
        # https://stackoverflow.com/questions/2783969/compare-string-with-all-values-in-array
        if any(x in listing['neighborhood'] for x in settings.neighborhood_blacklist):
            notify = False
        else:
            notify = True

        # Check if the listing is a duplicate; if so we assume we've already
        # processed it and skip it.
        if database.get_record(listing['craigslist_id']):
            log.info('Found duplicate record with ID {}, skipping'.format(
                listing['craigslist_id']))
            continue
        else:
            # BUG FIX: the original formatted the literal string
            # 'craigslist_id' instead of the listing's actual id.
            log.info('{} looks like a new listing, processing'.format(
                listing['craigslist_id']))
            # Get the map image from Mapbox here (not earlier) to limit the
            # number of API requests made to Mapbox.
            listing['map_image'] = get_map(listing['latitude'],
                                           listing['longitude'])
            database.insert_record(listing)
            if notify is True:
                send_notification(listing)
                database.mark_as_notified(listing['craigslist_id'])
def scrape_area(area):
    """
    Scrapes craigslist for newest listings in area.

    :param area: craigslist sub-area code to search.
    :return: A list of results that pass the commute check.
    """
    cl = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area,
                           category=settings.CRAIGSLIST_HOUSING_SUBSECTION,
                           filters={'max_price': settings.MAX_PRICE,
                                    "min_price": settings.MIN_PRICE})
    results = []
    gen = cl.get_results(sort_by='newest', geotagged=True, limit=50)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # A single bad result shouldn't kill the whole scrape.
            continue
        if result["geotag"] is None:
            continue
        # NOTE: the original parsed a `price` local here but never used it;
        # that dead code has been removed.
        if check_commutes(result):
            results.append(result)
    return results
def housing(citi_code, category_code):
    """Scrape today's listings for a city/category and append CSV rows.

    For each listing, fetch the posting page, extract the description, and
    pull phone numbers and emails out of it with regexes. Rows are written
    to the module-level `write_to_file` handle.
    """
    cl_h = CraigslistHousing(site=citi_code, category=category_code,
                             filters={'posted_today': True})
    for result in cl_h.get_results(sort_by='newest', geotagged=True):
        # Commas are stripped from every field because the output is CSV.
        post_id = str(result["id"]).replace(",", "")
        name = str(result["name"]).replace(",", "")
        url = str(result["url"]).replace(",", "")
        date_time = str(result["datetime"]).replace(",", "")
        last_update = str(result["last_updated"]).replace(",", "")
        price = str(result["price"]).replace(",", "")
        location = str(result["where"]).replace(",", "")
        geolocation = str(result["geotag"]).replace(",", " and ")
        page = requests.get(result["url"])
        time.sleep(2)  # be polite to craigslist between page fetches
        soup = BeautifulSoup(page.text, "html.parser")
        body = soup.find('section', {'id': 'postingbody'})
        # BUG FIX: the original used .strip("QR Code Link to This Post "),
        # which strips any of those *characters* from both ends and could
        # eat real description text; remove the phrase explicitly instead.
        discription = (body.text.replace("\n", " ").replace(",", "")
                       .replace("QR Code Link to This Post", "").strip())
        # Raw strings so the regex backslash escapes are not mangled.
        phone_matches = re.findall(
            r"(\d{3}[-\.\s]\d{3}[-\.\s]\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]\d{4}|\d{3}[-\.\s]\d{4})",
            discription)
        PhoneNumber = ''
        for phone in phone_matches:
            PhoneNumber += phone + "/"
        emails = re.findall(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                            discription)
        Emails = ''
        for email in emails:
            Emails += email + "/"
        to_write = (post_id + "," + name + "," + PhoneNumber.replace(",", "") +
                    "," + Emails.replace(",", "") + "," + discription + "," +
                    url + "," + date_time + "," + last_update + "," + price +
                    "," + location + "," + geolocation + "\n")
        write_to_file.write(to_write)
        print(to_write)
    write_to_file.close()
def collect_clist_data():
    '''Collect today's SF craigslist listings and return a cleaned DataFrame.'''
    cl_h = CraigslistHousing(site='sfbay', area='sfc',
                             filters={
                                 'min_price': 1000,
                                 'max_price': 6000,
                                 'search_distance': 4,
                                 'zip_code': 94115,
                                 'posted_today': True
                             })
    i = 0
    dfs = []
    logger.info('parsing results')
    for result in cl_h.get_results(sort_by='newest', geotagged=True,
                                   include_details=True):
        if i % 50 == 0:
            logger.info('get results for row ' + str(i))
        # Turn the result dict into a one-row DataFrame (keys as columns).
        temp = pd.DataFrame(list(result.items())).T
        cols = list(temp.iloc[0])
        temp.columns = cols
        temp = temp.iloc[-1]
        temp = pd.DataFrame(temp).T
        dfs.append(temp)
        i = i + 1
    # BUG FIX: after the loop `i` already equals the number of listings;
    # the original logged i + 1 and over-counted by one.
    logger.info(str(i) + ' listings collected')
    df = pd.concat(dfs, sort=False)
    df['script_timestamp'] = dt.datetime.now()
    ndf = search_cl.clean_clist_df(df)
    return ndf
def scrape_craigslist(max_price=10000, min_price=1000, limit=None,
                      site='sfbay', area='sfc'):
    """Returns a list of craigslist postings that were posted in the past day
    filtering for the given Craigslist area. Available areas in the Bay Area
    include 'sfc' (the city), 'sby' (South Bay), 'eby' (East Bay),
    'pen' (Peninsula), 'nby' (North Bay), 'scz' (Santa Cruz)"""
    cl = CraigslistHousing(site=site, area=area, category='apa',
                           filters={
                               'max_price': max_price,
                               'min_price': min_price,
                               'private_room': True,
                               'posted_today': True
                           })
    listings = []
    for result in cl.get_results(sort_by='newest', geotagged=True,
                                 limit=limit):
        bedrooms = int(result['bedrooms']) if result['bedrooms'] is not None else 0
        # NOTE: the original assigned `location = result['bedrooms']` -- the
        # wrong field, and never used; that dead code has been removed.
        # Filter for only studios, 1 bedrooms, or 2 bedrooms.
        if bedrooms > 2:
            continue
        listings.append(ApartmentListing.from_dict(result))
    return listings
def lambda_handler(event, context):
    """Scrape the 5 newest NYC listings and upload them to S3 as JSON."""
    # Environment variable for number of posts to pull.
    number_of_posts = os.environ.get("number_of_posts")
    # Instantiate the Craigslist scraper and materialize the results.
    scraper = CraigslistHousing(site='newyork', area=None, category='aap')
    listings = list(scraper.get_results(sort_by='newest', geotagged=True,
                                        limit=5))
    # Serialize and ship to S3 under a timestamped key.
    payload = json.dumps(listings)
    timestamp = str(datetime.today())
    s3 = boto3.client('s3')
    response = s3.put_object(Bucket='lazyapartment', Body=payload,
                             Key='rawdata/{}.json'.format(timestamp))
def ScrapeNYC(area, limit, directory):
    """Scrape NYC listings for `area` and append them to today's CSV file.

    Changes the working directory to `directory`; the CSV is named after
    today's date (MM_DD_YYYY.csv).
    """
    os.chdir(directory)
    scraper = CraigslistHousing(site='newyork', area=area)
    stream = scraper.get_results(sort_by='newest', geotagged=True, limit=limit)
    rows = []
    while True:
        try:
            rows.append(next(stream))
        except StopIteration:
            break
        except Exception:
            # Skip individual results that fail to parse.
            continue
    frame = pd.DataFrame(rows).dropna(how='any')
    frame = frame[['id', 'datetime', 'geotag', 'price']]
    date = time.strftime("%m_%d_%Y")
    # Append without a header if today's file exists; otherwise create it.
    if os.path.isfile(date + '.csv'):
        frame.to_csv(date + '.csv', mode='a', header=False, index=False)
    else:
        frame.to_csv(date + '.csv', index=False)
def search_and_write_to_csv():
    """Search listings near each subway station and write results to CSV.

    Iterates the green and yellow TTC lines station by station, querying
    craigslist within a fixed radius of each station's postal code.
    """
    # Station -> postal code; west to east starting Dundas West.
    green_line = {
        "DUNDAS WEST": "M6P 1W7",
        "DUFFERIN": "M6H 4E6",
        "CHRISTIE": "M6G 3B1",
        "BAY": "M5R 3N7"
    }
    # North east to south to north west.
    yellow_line = {
        "LAWRENCE": "M4N 1S1",
        "EGLINTON": "M4S 2B8",
        "DAVISVILLE": "M4S 1Z2",
        "ST CLAIR": "M4T 1J8",
        "SUMMERHILL": "M4T 1W2",
        "ROSEDALE": "M4W 1T1",
        "BLOOR-YONGE": "M4W 1A8",
        "WELLESLEY": "M4Y 1G3",
        "COLLEGE": "M5B 1L2",
        "DUNDAS": "M5G 1Z3",
        "QUEEN": "M5C 2X9",
        "KING": "M5H 1A1",
        "UNION": "M5J 1E6",
        "ST ANDREW": "M5H 3T4",
        "OSGOODE": "M5H 3E5",
        "ST PATRICK": "M5G 1V1",
        "QUEENS PARK": "M5G 1X7",
        "MUSEUM": "M5S 2C5",
        "ST GEORGE": "M5R 2L8",
        "SPADINA": "M5R 2T6",
        "DUPONT": "M5R 1V7",
        "ST CLAIR WEST": "M5P 3N3"
    }
    for line in [green_line, yellow_line]:
        print("Processing", line)
        for station, zip_code in line.items():
            search_distance = 1.5
            max_price = 2500
            searcher = CraigslistHousing(site='toronto', area='tor',
                                         category='apa',
                                         filters={
                                             'zip_code': zip_code,
                                             'search_distance': search_distance,
                                             'posted_today': True,
                                             'has_image': True,
                                             'max_price': max_price
                                         })
            listings = searcher.get_results(sort_by='newest', geotagged=True)
            write_results_of_search_to_csv(listings, station)
def get_apt_results(self, zip_code='01923', radius=20, max_price=1600,
                    sub_category=None, overwrite=False):
    """Fetch apartment ads around `zip_code` and merge them into the CSV cache.

    If a cache file already exists (and overwrite is False), only ads newer
    than the cached latest date are collected -- results are sorted by date,
    so collection stops at the first already-seen ad.
    """
    cl = CraigslistHousing(site=self.site.lower(),
                           category=sub_category + '/aap',
                           filters={
                               'zip_code': zip_code,
                               'search_distance': radius,
                               'min_price': 500,
                               'max_price': max_price
                           })
    results = cl.get_results()
    # If data file already exists, only update it with new data
    # (by grabbing the latest cached date).
    fname = 'Apartments_' + self.site + 'Craigslist.csv'
    if not overwrite and os.path.isfile(".\\Data\\" + fname):
        with open(".\\Data\\" + fname) as f:
            self.last_update = f.readlines()[1].split(',')[2]
        print("Grabbing data after " + self.last_update)
    ads_info = []
    for result in results:
        print(len(ads_info))  # some indication of progress
        # dict.get replaces the original's bare-except attribute helper.
        ad_info = {}
        ad_info['Title'] = result.get('name', '')
        ad_info['Area'] = result.get('area', '')
        ad_info['Bedrooms'] = result.get('bedrooms', '')
        ad_info['Link'] = result.get('url', '')
        ad_info['Price'] = result.get('price', '')
        ad_info['Location'] = result.get('geotag', '')
        ad_info['Date'] = result.get('datetime', '')
        if self.last_update:
            # If we already have the data, don't grab it again -- stop the
            # process, since results are sorted by date.
            if dt.strptime(ad_info['Date'], "%Y-%m-%d %H:%M") <= dt.strptime(
                    self.last_update, "%Y-%m-%d %H:%M:%S"):
                break
        ads_info.append(ad_info)
    # Save data to csv file.
    if len(ads_info) > 0:
        if os.path.isfile(".\\Data\\" + fname) and not overwrite:
            temp_df = pd.read_csv(".\\Data\\" + fname)
            temp_df = temp_df.append(ads_info)
            write_to_csv(temp_df, fname)
        else:
            write_to_csv(ads_info, fname)
def post(self, request, *args, **kwargs):
    """Return the 5 newest apartment listings for the requested city
    under the requested max price, as a JSON array.
    """
    data = request.data
    city = data['city']
    max_price = data['max_price']
    city = city.lower()
    max_price = int(max_price)
    cl = CraigslistHousing(site=city, category='apa',
                           filters={'max_price': max_price})
    results = cl.get_results(sort_by='newest', geotagged=True, limit=5)
    # BUG FIX: get_results returns a generator, and JsonResponse raises
    # TypeError for non-dict data unless safe=False -- materialize the
    # results into a list and allow a list payload.
    return JsonResponse(list(results), safe=False)
def get_craigslist():
    """Print recent Bend craigslist rentals, filtering out outlying towns.

    Updates the module-level craigslist_found / craigslist_matched counters.
    """
    global craigslist_found, craigslist_matched
    print("\n")
    print("CRAIGSLIST RESULTS")
    print("================================")
    # Towns far enough from Bend that we never want their listings.
    blacklist = [
        "prineville", "la pine", "redmond", "john day", "chemult",
        "crescent lake"
    ]
    cl_h = CraigslistHousing(site='bend', category='apa',
                             filters={'max_price': max_price})
    for result in cl_h.get_results(sort_by='newest', geotagged=True, limit=15):
        craigslist_found += 1
        show = True
        # The original duplicated this whole loop for "where" and "name";
        # check both fields with the same blacklist pass.
        for field in ("where", "name"):
            value = result.get(field)
            # value can be None, hence the isinstance guard (the original
            # swallowed the resulting AttributeError with a bare except).
            if isinstance(value, str) and any(
                    town in value.lower() for town in blacklist):
                show = False
                break
        if show:
            craigslist_matched += 1
            keys = [
                "datetime", "price", "name", "where", "bedrooms", "area", "url"
            ]
            for key in keys:
                if key in result:
                    print("{}: {}".format(key.upper(), result[key]))
            print("\n")
    print("__________________________________")
    print("Listings Found: {}".format(craigslist_found))
    print("Listings Matched: {}".format(craigslist_matched))
def extract(site, category, today=False):
    """Scrape listings (with per-post details) and cache them to cache.json."""
    scraper = CraigslistHousing(
        site=site,
        category=category,
        filters=dict(posted_today=today, has_image=True, bundle_duplicates=True),
    )
    stream = scraper.get_results(sort_by="newest", geotagged=True,
                                 include_details=True)
    # tqdm gives a progress bar while the generator is drained.
    cached = [listing for listing in tqdm(stream)]
    with open("cache.json", "w") as f:
        json.dump(cached, f)
def add_rooms(loc):
    """Scrape San Diego room listings and persist new ones as RentalRoom rows.

    :param loc: dict with at least 'state' and 'city' keys, copied onto each
        stored row. (Schema assumed from usage below -- TODO confirm.)
    """
    cl_rooms = CraigslistHousing(site="sandiego",
                                 filters={
                                     'private_room': True,
                                     'min_price': 25,
                                     'max_price': 3500
                                 })
    cl_rooms.set_logger(DEBUG)
    rooms = cl_rooms.get_results(limit=CL_RESULTS, geotagged=True,
                                 include_details=True)
    for i, room in enumerate(rooms):
        # Progress indicator every 100 rooms.
        if i % 100 == 0:
            print('{}th room'.format(i))
        rental_room = RentalRoom()
        # Skip listings we've already stored (matched by craigslist id).
        db_room = session.query(RentalRoom).filter(
            RentalRoom.cl_id == room['id']).first()
        if db_room is not None:
            print("found existing place")
            continue
        else:
            rental_room.cl_id = room['id']
            rental_room.repost_of_id = room['repost_of']
            rental_room.url = room['url']
            rental_room.date_updated = datetime.strptime(
                room['last_updated'], CL_DATE_FORMAT)
            # Price arrives as e.g. "$1,200"; strip '$' and thousands commas.
            rental_room.price = int(room['price'].replace('$', '').replace(',', ''))
            rental_room.state = loc['state']
            rental_room.metro = loc['city']
            # 'area' is square footage with an "ft2" suffix when present.
            if room.get('area'):
                rental_room.sqft = room['area'].replace('ft2', '')
            rental_room.named_location = room['where']
            # Listings without coordinates are skipped entirely.
            if room.get('geotag'):
                rental_room.coords = str(room['geotag'][0]) + ',' + str(
                    room['geotag'][1])
            else:
                continue
            rental_room.housing_type = room['house_type']
            rental_room.laundry_type = room['laundry_type']
            rental_room.parking_type = room['parking_type']
            rental_room.furnished = room['furnished']
            rental_room.cats_allowed = room['cats_ok']
            rental_room.dogs_allowed = room['dogs_ok']
            rental_room.title = room['name']
            rental_room.details = room['body']
            session.add(rental_room)
    # NOTE(review): single commit after the loop -- all new rooms are written
    # in one transaction; confirm this matches the original's intent.
    session.commit()
def scrape():
    """Scrape new SF listings, resolve each to an area, store and post to Slack.

    A listing's area is determined first by bounding-box match on its geotag,
    then by a neighborhood-name search in its location string. Unmatched or
    already-seen listings are skipped.
    """
    sc = SlackClient(private.SLACK_TOKEN)
    cl = CraigslistHousing(site='sfbay', area='sfc', category='apa',
                           filters={'max_price': settings.MAX_PRICE})
    results = cl.get_results(sort_by='newest', geotagged=True, limit=20)
    for result in results:
        # Check if the listing is already posted (guard clauses replace the
        # original's deep nesting).
        listing = session.query(Listing).filter_by(cl_id=result['id']).first()
        if listing is not None:
            continue
        # If there is no string identifying which neighborhood the result is
        # from, skip it.
        if result["where"] is None:
            continue
        area_found = False
        area = ""
        geotag = result["geotag"]
        # Check the coordinates against our bounding boxes.
        if geotag is not None:
            for a, coords in settings.BOXES.items():
                if in_box(geotag, coords):
                    area = a
                    area_found = True
        location = result["where"]
        # Fall back to a neighborhood-name search in the location string.
        # (`not area_found` replaces the original `area_found == False`.)
        if not area_found and location is not None:
            for hood in settings.NEIGHBORHOODS:
                if hood in location.lower():
                    area = hood
                    area_found = True
        if not area_found:
            continue
        # Create and save the listing so we don't grab it again.
        new_listing = Listing(
            link=result["url"],
            cl_id=result["id"]
        )
        session.add(new_listing)
        session.commit()
        # Post to the slack channel.
        desc = "{0} | {1} | {2} | {3}".format(area, result["price"],
                                              result["name"], result["url"])
        sc.api_call(
            "chat.postMessage",
            channel=settings.SLACK_CHANNEL,
            text=desc,
            username="******",
            icon_emoji=":robot_face:"
        )
def scrape_craigslist_housing():
    """Scrape the 20 newest listings, persist new ones, and return them.

    :return: list of newly created Listing rows (already committed).
    """
    listings = []
    craigslist_housing = CraigslistHousing(
        site=CONFIG.SITE,
        area=CONFIG.AREA,
        category=CONFIG.CATEGORY,
        filters=CONFIG.FILTERS,
    )
    results = craigslist_housing.get_results(sort_by="newest", geotagged=True,
                                             limit=20)
    for result in results:
        logging.info(f'{time.ctime()}: Processing cl_id={result["id"]}')
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        if listing:
            logging.info(f"{time.ctime()}: cl_id={result['id']} Already in db")
            continue
        # BUG FIX: the original used result.get("geotag", (None, None)),
        # which still returns None (and fails to unpack) when the key is
        # present with a None value; `or` covers both cases.
        lat, lon = result.get("geotag") or (None, None)
        listing = Listing(
            cl_id=result["id"],
            cl_site=CONFIG.SITE,
            cl_area=CONFIG.AREA,
            cl_category=CONFIG.CATEGORY,
            url=result["url"],
            name=result["name"],
            price=to_numeric(result.get("price", "").replace("$", ""), float),
            area=to_numeric(
                str(result.get("area", "")).replace("ft2", ""), float),
            bedrooms=result["bedrooms"],
            location=result["where"],
            geotag=f"({lat},{lon})",
            lat=to_numeric(lat, float),
            lon=to_numeric(lon, float),
            has_image=result["has_image"],
            has_map=result["has_map"],
            created=parse(result["datetime"]),
        )
        logging.info(f"{time.ctime()}: Saving cl_id={listing.cl_id}")
        session.add(listing)
        session.commit()
        listings.append(listing)
    return listings
def search_craigslist_for_houses(zipcode, openlinks, email, printheader):
    """Search Philadelphia craigslist near `zipcode` and report new results.

    :param zipcode: zip code to center the search on.
    :param openlinks: open each new result in the web browser when True.
    :param email: send the collected results by email when True.
    :param printheader: print a section header to the console when True.
    """
    # Declare some initial variables.
    msg = []
    # BUG FIX: `header` was only assigned when printheader was True, but it
    # is used below as the email subject -- define it unconditionally.
    header = str(zipcode) + ' Apartments/Houses'
    # Pull data based on parameters.
    house_1 = CraigslistHousing(site='philadelphia', category='housing',
                                filters={
                                    'zip_code': zipcode,
                                    'search_distance': 1,
                                    'min_bedrooms': 2,
                                    'min_price': 0,
                                    'max_price': 2700,
                                    'min_ft2': 1200,
                                    'cats_ok': True,
                                    'query': 'parking'
                                })
    # Print title if desired.
    if printheader == True:
        print(header)
    # Loop through the results.
    for x in house_1.get_results():
        # Only return results that are not in the already_checked list.
        if (x['url'] in already_checked) == False:
            line = 'Posted: {} Price: {} Link: {}'.format(
                x['datetime'], x['price'], x['url'])
            msg.append(line)
            email_msg.append(line)
            # Temporary holding of URLs from the search results to use as a
            # filter against the already_checked list.
            temp.append(x['url'])
            # Open urls in webbrowser if desired.
            if openlinks == True:
                webbrowser.open(x['url'])
    # Send email with search results if desired and search results exist.
    if email == True and len(msg) > 0:
        send_email('\n'.join(msg), '*****@*****.**', ['*****@*****.**'], header)
    # Print out the search results to the console.
    for _ in msg:
        print(_)
def craigs_list_api_call():
    """Return Toronto apartment listings, newest first, duplicates bundled.

    Equivalent search URL: https://toronto.craigslist.org/search/tor/apa?
    """
    scraper = CraigslistHousing(site='toronto', area='tor', category='apa',
                                filters={'bundle_duplicates': 1})
    # geotagged=True adds (lat, lng) under the 'geotag' key, which makes the
    # scrape a little slower per result.
    craiglist_housing = list(
        scraper.get_results(sort_by='newest', geotagged=True))
    print("Finished craigs_list_api_call")
    return craiglist_housing
def getHousingPosts(self, limit=None):
    """Fetch private-room listings and append them to self.recs.

    :param limit: max number of results; defaults to self.limit.
    """
    site = self.site
    # `is None` replaces the original `limit == None` comparison.
    if limit is None:
        limit = self.limit
    cl_h = CraigslistHousing(site=site, category='roo',
                             filters={'max_price': 1200, 'private_room': True})
    for result in cl_h.get_results(sort_by='newest', limit=limit,
                                   geotagged=True):
        # Skip results craigslist couldn't geocode.
        if 'geotag' not in result:
            print("***** Missing geotag")
            continue
        rec = dict(result)
        rec['recType'] = 'housing'
        self.recs.append(rec)
    if self.autoSave:
        self.save()
def get_data(site="newyork", area=None, category="abo", limit=25,
             geotagged=True):
    """ Scrape craigslist and return the listings as a pandas DataFrame,
    indexed by listing id, with an 'area' column of site+area. """
    scraper = CraigslistHousing(site=site, area=area, category=category)
    listings = scraper.get_results(sort_by='newest', limit=limit,
                                   geotagged=geotagged)
    frame = pd.DataFrame(listings)
    frame.index = frame["id"]
    frame["area"] = site + area if area else site
    return frame
def scrape_housing():
    """Scrape listings, drop ones containing excluded terms, store the rest,
    and post each good new listing to Slack.
    """
    # Scrape Craigslist for listings.
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area='nvn',
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'min_price': settings.MIN_PRICE_RENT,
                                      'max_price': settings.MAX_PRICE_RENT})
    results = []
    for result in cl_h.get_results(sort_by='newest', geotagged=True,
                                   limit=settings.LIMIT,
                                   include_details=True):
        results.append(result)
    # Filter scraped results for excluded terms (for/else: the else branch
    # runs only when no excluded term triggered the break).
    good_listings = []
    x = 0
    for result in results:
        for term in private.EXCLUDED_TERMS:
            if term in result['body'].lower():
                break
        else:
            listing = session.query(Listing).filter_by(
                cl_id=result["id"]).first()
            # Don't store the listing if it already exists.
            if listing is None:
                good_listings.append(result)
                # BUG FIX: prices like "$1,200" made float() raise; strip the
                # '$' and thousands separators before parsing.
                raw_price = float(
                    result['price'].replace('$', '').replace(',', ''))
                listing = Listing(
                    cl_id=result['id'],
                    link=result['url'],
                    created=parse(result['datetime']),
                    name=result['name'],
                    price=f"${format(raw_price, ',.0f')} CAD",
                    location=result['where'],
                    sqft=result['area'],
                    body=result['body']
                )
                x += 1
                # Save the listing so we don't grab it again.
                session.add(listing)
                session.commit()
    # BUG FIX: `x` counts listings WITHOUT excluded terms; the original
    # message said the opposite.
    print(f'{time.ctime()}: Found {x} new listings without excluded terms.')
    # Create slack client and post each result.
    sc = SlackClient(settings.SLACK_TOKEN)
    for listing in good_listings:
        post_listing_to_slack(sc, listing)
def _build(self, site, area, category=None, filters=None):
    """Construct and return the CraigslistHousing search object."""
    return CraigslistHousing(site=site, area=area, category=category,
                             filters=filters)
def lambda_handler(event, context):
    """Scrape the newest NYC listings and upload the first one to S3 as JSON."""
    # Connect to craigslist.
    cl = CraigslistHousing(site='newyork', area=None, category='aap')
    # Pull data from Craigslist.
    results = cl.get_results(sort_by='newest', geotagged=True, limit=200)
    resultsList = [result for result in results]
    # BUG FIX: resultsList[0] raised IndexError when the scrape came back
    # empty; bail out instead.
    if not resultsList:
        return None
    # Convert data to json (only the first/newest result is exported).
    data = json.dumps(resultsList[0])
    # Get the current datetime for the file name.
    now = str(datetime.today())
    # Export the data.
    client = boto3.client('s3')
    response = client.put_object(Bucket='lazyapartment', Body=data,
                                 Key='rawdata/{}.json'.format(now))
def getListings(self, category='apa'):
    """
    Fetches the listings from craigslist using the settings defined in
    settings.py and the given category string (see craiglist categories).
    The craigslist site is hardcoded to sandiego but easy to refactor later.

    :param category:
    :return: list of craigslist postings matching the criterias
    """
    search_filters = {
        'max_price': settings.MAX_PRICE,
        'search_distance': settings.MILES_RADIUS,
        'zip_code': settings.ZIP_CODE,
    }
    clh = CraigslistHousing(site=settings.CRAIGSLIST_SITE,
                            category=category,
                            filters=search_filters)
    # Only need to fetch the last 20 for now.
    # TODO get more postings on first run?
    return clh.get_results(sort_by='newest', geotagged=True, limit=1)
def scrape_for_apartments():
    """Scrape new apartment listings, resolve each to a configured area,
    store matches, and text the first few via Twilio.
    """
    # Get results from craigslist.
    cl_h = CraigslistHousing(site=settings.CL_SITE, area=settings.CL_AREA,
                             category=settings.CL_CATEGORY,
                             filters={'bundle_duplicates': True,
                                      'posted_today': settings.POSTED_TODAY,
                                      'min_bedrooms': settings.MIN_NUM_BEDROOMS,
                                      'max_bedrooms': settings.MAX_NUM_BEDROOMS,
                                      'max_price': settings.MAX_PRICE,
                                      'min_price': settings.MIN_PRICE,
                                      'laundry': settings.LAUNDRY_OPTIONS})
    # Counter limits the amount of results that can be sent at one time.
    counter = 0
    for result in cl_h.get_results(sort_by='newest', geotagged=True):
        # Skip listings we've already processed.
        if check_for_record(result):
            continue
        counter += 1
        geotag = result["geotag"]
        # Resolve the listing to a configured area via its coordinates.
        # (Leftover debug `print(result)` inside this loop has been removed.)
        area = ""
        for a, coords in settings.AREAS.items():
            if geotag is not None and in_area(geotag, coords):
                area = a
        # Couldn't find from geotag: string-search the listing location.
        if area == "":
            for hood in settings.NEIGHBORHOODS:
                if result["where"] is not None and hood in result["where"].lower():
                    area = hood
        if area != '' and counter < 10:
            store_in_db(result)
            client = Client(settings.ACCOUNT_SID, settings.AUTH_TOKEN)
            text = "{} per month in {}.\n {}".format(result['price'],
                                                     result['where'],
                                                     result["url"])
            message = client.messages.create(
                messaging_service_sid=settings.MS_SID,
                body=text,
                to=settings.TARGET_PHONE_NUMBER)
def __init__(self, config):
    """
    Set instance attributes and create CraigslistHousing object

    Args:
        config (class): contains configuration data
    """
    self.config = config
    housing_filters = {
        'has_image': True,
        'search_distance': config.SEARCH_DISTANCE,
        'zip_code': config.ZIP_CODE,
        'max_price': config.MAX_PRICE,
        'min_bedrooms': config.MIN_BEDS,
        'laundry': ['w/d in unit'],
    }
    self.cl_h = CraigslistHousing(
        site=config.CRAIGSLIST_SITE,
        area=config.CRAIGSLIST_AREA,
        category=config.CRAIGSLIST_HOUSING_SECTION,
        filters=housing_filters)
def query_data(housing_category, geotag):
    """Apply housing filters, build a craigslist.CraigslistHousing object,
    and mine it for data.

    Returns None when craigslist cannot be reached.
    """
    search_filters = utils.search_filters()
    try:
        scraper = CraigslistHousing(
            category=housing_category,
            filters=search_filters
        )
        return mine_data(scraper, housing_category, geotag)
    except requests.exceptions.ConnectionError:
        return None
def scrape_craigslist(max_price=10000, min_price=1000, limit=None):
    """Return today's SF private-room listings with at most 2 bedrooms.

    :param max_price: upper price bound for the search.
    :param min_price: lower price bound for the search.
    :param limit: max number of results to fetch (None = no limit).
    :return: list of ApartmentListing objects.
    """
    cl = CraigslistHousing(site='sfbay', area='sfc', category='apa',
                           filters={
                               'max_price': max_price,
                               'min_price': min_price,
                               'private_room': True,
                               'posted_today': True
                           })
    listings = []
    for result in cl.get_results(sort_by='newest', geotagged=True,
                                 limit=limit):
        bedrooms = int(result['bedrooms']) if result['bedrooms'] is not None else 0
        # NOTE: the original assigned `location = result['bedrooms']` -- the
        # wrong field, and never used; that dead code has been removed.
        # Filter for only studios, 1 bedrooms, or 2 bedrooms.
        if bedrooms > 2:
            continue
        listings.append(ApartmentListing.from_dict(result))
    return listings
def get_rental_comps_craigslist(self, address, city, zipcode, limit, bd, ba,
                                sqft):
    """Estimate rent for a property by averaging nearby comparable listings.

    Searches craigslist apartments near `zipcode` with the same bed/bath
    counts and sqft within +/-300, keeps listings within `limit` distance of
    the geocoded address, and returns the mean of their prices.
    Returns None for missing inputs, any scraping error, or fewer than two
    comparables.
    """
    if not address or not city or not zipcode or not limit:
        return None
    rents = []
    geocode = get_geocode_from_address(address)
    if not geocode or not bd or not ba or not sqft:
        return None
    try:
        cl_h = CraigslistHousing(
            site=city.lower(),
            category="apa",
            filters={
                'zip_code': zipcode,
                # `limit` doubles as both search radius and distance cutoff.
                'search_distance': limit,
                'min_bedrooms': bd,
                'max_bedrooms': bd,
                "min_bathrooms": ba,
                "max_bathrooms": ba,
                "min_ft2": max(0, sqft - 300),
                "max_ft2": sqft + 300,
                'housing_type': ['apartment', 'condo', 'house', 'townhouse']
            })
        for result in cl_h.get_results(geotagged=True):
            dist = get_distance_bw_geocodes(geocode, result["geotag"])
            if dist < limit:
                # Keep only the digits of the price string (e.g. "$1,200").
                rents.append(int(re.sub("[^0-9]", "", result["price"])))
    # NOTE(review): deliberate best-effort -- any scraping/parsing error
    # yields None rather than propagating.
    except:
        return None
    # Require at least two comps before averaging.
    if len(rents) > 1:
        return sum(rents) / float(len(rents))
    return None
def main_features(site, area, category, sort_by, limit, geotagged):
    """Scrape craigslist and return the main listing fields as a DataFrame.

    Missing geotags become (0.0, 0.0); the leading '$' and any thousands
    separators are stripped from prices, which are returned numeric.
    """
    # Use the craigslist package to run the search.
    cl = CraigslistHousing(site=site, area=area, category=category)
    results = cl.get_results(sort_by=sort_by, geotagged=geotagged, limit=limit)
    df = {
        'id': [],
        'repost_of': [],
        'name': [],
        'url': [],
        'datetime': [],
        'last_updated': [],
        'price': [],
        'where_': [],
        'has_image': [],
        'latitude': [],
        'longitude': []
    }
    for result in results:
        df['id'].append(result['id'])
        df['repost_of'].append(result['repost_of'])
        df['name'].append(result['name'])
        df['url'].append(result['url'])
        df['datetime'].append(result['datetime'])
        df['last_updated'].append(result['last_updated'])
        # Drop the leading '$' from the price string.
        df['price'].append(result['price'][1:])
        df['where_'].append(result['where'])
        df['has_image'].append(result['has_image'])
        # `is None` replaces the original `== None` comparison.
        if result['geotag'] is None:
            df['latitude'].append(0.0)
            df['longitude'].append(0.0)
        else:
            df['latitude'].append(result['geotag'][0])
            df['longitude'].append(result['geotag'][1])
    df = pd.DataFrame(df)
    df['price'] = pd.to_numeric(df['price'].str.replace(',', ''))
    return (df)
def query_housing_data(state, reg, sub_reg, housing_cat, geotag):
    """Apply housing filters, instantiate craigslist.CraigslistHousing with
    the appropriate site/area, and mine it for data."""
    search_filters = get_static_file.search_filters()
    # Build the constructor and mining kwargs once; add the sub-region only
    # when one was supplied.
    scraper_kwargs = {'site': reg, 'category': housing_cat,
                      'filters': search_filters}
    mine_kwargs = {}
    if sub_reg:
        scraper_kwargs['area'] = sub_reg
        mine_kwargs['sub_reg'] = sub_reg
    housing_object = CraigslistHousing(**scraper_kwargs)
    return mine_housing_data(housing_object, state, reg, housing_cat, geotag,
                             **mine_kwargs)
def scrape_area(area):
    """
    Scrapes craigslist for certain geographic area, and finds latest listings
    :param area:
    :return: A list of results that are near a TTC stop or in a known area.
    """
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area,
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={"max_price": settings.MAX_PRICE,
                                      "min_price": settings.MIN_PRICE})
    results = []
    gen = cl_h.get_results(sort_by="newest", geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # A single bad result shouldn't abort the whole scrape.
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        # Don't store the listing if it already exists.
        if listing is None:
            if result["where"] is None:
                # If there is no string identifying which neighborhood the
                # result is from, skip it.
                continue
            lat = 0
            lon = 0
            if result["geotag"] is not None:
                # Assign coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]
                # Annotate the result with info about the area it's in and
                # points of interest near it.
                geo_data = find_points_of_interest(result["geotag"],
                                                   result["where"])
                result.update(geo_data)
            else:
                # No coordinates: no area/TTC annotation possible.
                result["area"] = ""
                result["ttc"] = ""
            # Try parsing the price (e.g. "$1500"); fall back to 0.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass
            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                lat=lat,
                lon=lon,
                name=result["name"],
                price=price,
                location=result["where"],
                cl_id=result["id"],
                area=result["area"],
                ttc_stop=result["ttc"]
            )
            # Save listing so we don't grab it again.
            session.add(listing)
            session.commit()
            # Return the result if it's near a TTC station or in a defined
            # area.
            if len(result["ttc"]) > 0 or len(result["area"]) > 0:
                results.append(result)
    return results
def scrape(site, area, category, min_price, max_price):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest
    listings.
    :param site:
    :param area:
    :param category:
    :param min_price:
    :param max_price:
    :return: A list of results.
    """
    results = []
    cl_h = CraigslistHousing(
        site=site,
        area=area,
        category=category,
        filters={'min_price': min_price, 'max_price': max_price}
    )
    gen = cl_h.get_results(
        sort_by='newest',
        geotagged=True,
        limit=20
    )
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # A single bad result shouldn't abort the whole scrape.
            continue
        listing = session.query(Listing).filter_by(cl_id=result['id']).first()
        # Don't store the listing if it already exists.
        if listing is None:
            if result['where'] is None:
                # If there is no string identifying which neighborhood the
                # result is from, skip it.
                continue
            # Annotate the result with information about the area it's in and
            # points of interest near it.
            # NOTE(review): unlike the sibling scrape_area functions, this is
            # called even when result['geotag'] is None -- presumably
            # find_points_of_interest tolerates a None geotag; confirm.
            result.update(
                find_points_of_interest(result['geotag'], result['where'])
            )
            lat = 0
            lon = 0
            if result['geotag'] is not None:
                # Assign the coordinates.
                lat = result['geotag'][0]
                lon = result['geotag'][1]
            # Try parsing the price (e.g. "$1500"); fall back to 0.
            price = 0
            try:
                price = float(result['price'].replace('$', ''))
            except (TypeError, ValueError):
                pass
            # Create the listing object.
            listing = Listing(
                link=result['url'],
                created=parse(result['datetime']),
                geotag=str(result['geotag']),
                lat=lat,
                lon=lon,
                name=result['name'],
                price=price,
                location=result['where'],
                cl_id=result['id'],
                neighborhood=result['neighborhood'],
                transit_stop=result['transit_stop'],
                shuttle_stop=result['shuttle_stop']
            )
            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()
            # Return the result if it's near a shuttle stop and in a
            # desired neighborhood. Adjust requirements to your liking.
            if (result['shuttle_walk_time'] < settings.MAX_SHUTTLE_WALK_TIME
                    and len(result['neighborhood']) > 0
                    and result['has_image']
                    and desirable(result['url'])):
                results.append(result)
    return results
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest
    listings.
    :param area:
    :return: A list of results that are near an LRT station.
    """
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area,
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'max_price': settings.MAX_PRICE,
                                      "min_price": settings.MIN_PRICE})
    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # A single bad result shouldn't abort the whole scrape.
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        # Don't store the listing if it already exists.
        if listing is None:
            if result["where"] is None:
                # If there is no string identifying which neighborhood the
                # result is from, skip it.
                continue
            lat = 0
            lon = 0
            if result["geotag"] is not None:
                # Assign the coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]
                # Annotate the result with information about the area it's in
                # and points of interest near it.
                geo_data = find_points_of_interest(result["geotag"],
                                                   result["where"])
                result.update(geo_data)
            else:
                # No coordinates: no area/LRT annotation possible.
                result["area"] = ""
                result["lrt"] = ""
            # Try parsing the price (e.g. "$1500"); fall back to 0.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass
            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                lat=lat,
                lon=lon,
                name=result["name"],
                price=price,
                location=result["where"],
                cl_id=result["id"],
                area=result["area"],
                lrt_stop=result["lrt"]
            )
            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()
            # Return the result if it's near an LRT station.
            if len(result["lrt"]) > 0:
                results.append(result)
    return results