Exemple #1
0
def process_craigslist():
    """Pull housing listings from craigslist and write the geo-filtered
    results to ``SearchConfig.OUTFILE`` as JSON.

    Each listing is placed in the bucket of the first bounding area (from
    ``SearchConfig.BOUNDING_AREAS``) that contains its geotag; listings
    without a geotag, or outside every area, are dropped.
    """
    from craigslist import CraigslistHousing

    cl_housing = CraigslistHousing(site=SearchConfig.SITE,
                                   area=SearchConfig.AREA,
                                   category=SearchConfig.CATEGORY,
                                   filters=SearchConfig.FILTERS)

    results = cl_housing.get_results(sort_by='newest',
                                     geotagged=True,
                                     limit=20)

    valid_results = {}

    for result in results:
        geotag = result["geotag"]

        for location, coords in SearchConfig.BOUNDING_AREAS.items():
            # setdefault replaces the manual "key not present" check; a
            # location seen before any match still gets an empty list.
            bucket = valid_results.setdefault(location, [])

            if geotag and in_box(geotag, coords):
                bucket.append(result)
                break  # first matching area wins

    with open(SearchConfig.OUTFILE, 'w') as outfile:
        json.dump(valid_results, outfile, indent=2)
Exemple #2
0
def ScrapeNYC(area, limit, directory):
    """Scrape up to ``limit`` geotagged NYC housing listings for ``area``
    and append them to an ``MM_DD_YYYY.csv`` file inside ``directory``.

    Rows with any missing field are dropped; only id/datetime/geotag/price
    columns are kept.
    """
    os.chdir(directory)
    cl = CraigslistHousing(site='newyork', area=area)
    gen = cl.get_results(sort_by='newest', geotagged=True, limit=limit)

    rows = []
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Best-effort: skip a listing that fails to fetch/parse.
            continue
        rows.append(result)

    # Bug fix: selecting columns from an empty DataFrame raises KeyError,
    # so bail out early when the scrape returned nothing.
    if not rows:
        return

    df_NAN = pd.DataFrame(rows).dropna(how='any')
    df_NAN = df_NAN[['id', 'datetime', 'geotag', 'price']]

    fname = time.strftime("%m_%d_%Y") + '.csv'

    if os.path.isfile(fname):
        df_NAN.to_csv(fname, mode='a', header=False, index=False)
    else:
        df_NAN.to_csv(fname, index=False)
Exemple #3
0
def main():
    """Poll craigslist for today's SF listings, store new ones, and send a
    notification for listings outside the neighborhood blacklist."""
    # get the data from Craigslist
    housing = CraigslistHousing(site='sfbay',
                                area='sfc',
                                category='apa',
                                filters={
                                    'posted_today': True,
                                    'min_price': settings.min_price,
                                    'max_price': settings.max_price,
                                    'min_bedrooms': settings.min_bedrooms
                                })

    log.info('Retrieving listings')
    for result in housing.get_results(sort_by='newest', geotagged=True):
        # result example: {'id': '6902060582', 'name': ..., 'url': ...,
        # 'datetime': '2019-05-31 21:44', 'price': '$2950',
        # 'where': 'inner richmond', 'geotag': (lat, lon), 'bedrooms': '1', ...}

        # Bug fix: geotag can be None even with geotagged=True, which made
        # the original indexing raise TypeError.
        geotag = result['geotag'] or (None, None)

        # Build a `listing` dict with only the fields we care about.
        # Bug fix: price has a leading '$' and may contain thousands
        # separators (e.g. '$2,950'), so strip both before int().
        listing = {
            'craigslist_id': result['id'],
            'craigslist_url': result['url'],
            'posted_on': result['datetime'],
            'description': result['name'],
            'price': int(result['price'].lstrip('$').replace(',', '')),
            'neighborhood': result['where'].lower() if result['where'] else '',
            'num_bedrooms': result['bedrooms'],
            'sqft': result['area'],
            'latitude': geotag[0],
            'longitude': geotag[1],
        }

        # Notify unless the neighborhood matches the blacklist.
        notify = not any(x in listing['neighborhood']
                         for x in settings.neighborhood_blacklist)

        # Skip duplicates: we assume a stored record was already processed.
        if database.get_record(listing['craigslist_id']):
            log.info('Found duplicate record with ID {}, skipping'.format(
                listing['craigslist_id']))
            continue

        # Bug fix: the original logged the literal string 'craigslist_id'
        # instead of the listing's actual id.
        log.info('{} looks like a new listing, processing'.format(
            listing['craigslist_id']))

        # Fetch the map image here (not earlier) to limit Mapbox API calls.
        listing['map_image'] = get_map(listing['latitude'],
                                       listing['longitude'])

        database.insert_record(listing)
        if notify:
            send_notification(listing)
            database.mark_as_notified(listing['craigslist_id'])
Exemple #4
0
def scrape_craigslist(max_price=10000,
                      min_price=1000,
                      limit=None,
                      site='sfbay',
                      area='sfc'):
    """Returns a list of craigslist postings that were posted in the past day
    filtering for the given Craigslist area. Available areas in the Bay Area
    include 'sfc' (the city), 'sby' (South Bay), 'eby' (East Bay), 'pen'
    (Peninsula), 'nby' (North Bay), 'scz' (Santa Cruz)."""
    cl = CraigslistHousing(site=site,
                           area=area,
                           category='apa',
                           filters={
                               'max_price': max_price,
                               'min_price': min_price,
                               'private_room': True,
                               'posted_today': True
                           })
    listings = []
    for result in cl.get_results(sort_by='newest', geotagged=True,
                                 limit=limit):
        # A missing bedroom count is treated as a studio (0 bedrooms).
        # Bug fix: removed the dead `location = result['bedrooms']` line,
        # which was both unused and mislabeled.
        bedrooms = int(
            result['bedrooms']) if result['bedrooms'] is not None else 0
        # Keep only studios, 1-bedrooms, and 2-bedrooms.
        if bedrooms > 2:
            continue
        listings.append(ApartmentListing.from_dict(result))
    return listings
Exemple #5
0
def scrape_area(area):
    """
    Scrapes craigslist for newest listings in area.

    :param area: craigslist area code to search within the configured site.
    :return: A list of geotagged results with an acceptable commute.
    """
    cl = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area,
                           category=settings.CRAIGSLIST_HOUSING_SUBSECTION,
                           filters={'max_price': settings.MAX_PRICE,
                                    "min_price": settings.MIN_PRICE})

    results = []
    gen = cl.get_results(sort_by='newest', geotagged=True, limit=50)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Best-effort: skip listings that fail to fetch/parse.
            continue

        # Only geotagged listings are usable downstream.
        if result["geotag"] is None:
            continue

        # NOTE(review): the original parsed the price here into a local that
        # was never used; that dead code was removed.
        if check_commutes(result):
            results.append(result)

    return results
def collect_clist_data():
    '''Scrape today's SF craigslist listings (with details) and return the
    cleaned pandas DataFrame produced by ``search_cl.clean_clist_df``.'''
    cl_h = CraigslistHousing(site='sfbay',
                             area='sfc',
                             filters={
                                 'min_price': 1000,
                                 'max_price': 6000,
                                 'search_distance': 4,
                                 'zip_code': 94115,
                                 'posted_today': True
                             })

    count = 0
    dfs = []
    logger.info('parsing results')
    for result in cl_h.get_results(sort_by='newest',
                                   geotagged=True,
                                   include_details=True):
        if count % 50 == 0:
            logger.info('get results for row ' + str(count))
        # One single-row frame per listing; dict keys become the columns.
        # (Replaces the original transpose/iloc dance, which built the
        # same one-row frame in five steps.)
        dfs.append(pd.DataFrame([result]))
        count += 1

    # Bug fix: the original logged count + 1, overstating the total by one.
    logger.info(str(count) + ' listings collected')
    df = pd.concat(dfs, sort=False)
    df['script_timestamp'] = dt.datetime.now()

    return search_cl.clean_clist_df(df)
def lambda_handler(event, context):
    """AWS Lambda entry point: scrape the newest NYC listings and upload them
    as a timestamped JSON file to the ``lazyapartment`` S3 bucket."""

    # Number of posts to pull, configurable via the environment (default 5).
    number_of_posts = os.environ.get("number_of_posts")
    # Bug fix: the original read the env var but then hard-coded limit=5.
    limit = int(number_of_posts) if number_of_posts else 5

    # Instantiate our Craigslist scraper
    cl = CraigslistHousing(site='newyork',
                           area=None,
                           category='aap')

    # Pull data from Craigslist and put into a list
    results = cl.get_results(sort_by='newest', geotagged=True, limit=limit)
    resultsList = list(results)

    # Convert data to json
    data = json.dumps(resultsList)

    # Get the current datetime for the file name
    now = str(datetime.today())

    # Export the data to S3
    client = boto3.client('s3')
    response = client.put_object(Bucket='lazyapartment',
                                 Body=data,
                                 Key='rawdata/{}.json'.format(now))
    def housing(citi_code, category_code):
        """Scrape today's listings for ``citi_code``/``category_code`` and
        append one CSV row per listing to the global ``write_to_file``,
        extracting phone numbers and emails from each posting body.

        Commas are scrubbed from every field so the output stays valid CSV.
        """
        cl_h = CraigslistHousing(site=citi_code, category=category_code,
                                 filters={'posted_today': True})

        # Contact-extraction patterns, compiled once outside the loop.
        # Bug fix: raw strings — the originals relied on invalid escape
        # sequences like "\d" in plain strings.
        phone_re = re.compile(
            r"(\d{3}[-\.\s]\d{3}[-\.\s]\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]\d{4}|\d{3}[-\.\s]\d{4})")
        email_re = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")

        for result in cl_h.get_results(sort_by='newest', geotagged=True):
            listing_id = str(result["id"]).replace(",", "")  # don't shadow builtin id
            name = str(result["name"]).replace(",", "")
            url = str(result["url"]).replace(",", "")
            date_time = str(result["datetime"]).replace(",", "")
            last_update = str(result["last_updated"]).replace(",", "")
            price = str(result["price"]).replace(",", "")
            location = str(result["where"]).replace(",", "")
            geolocation = str(result["geotag"]).replace(",", " and ")

            response = requests.get(result["url"])
            time.sleep(2)  # throttle the per-listing detail requests
            soup = BeautifulSoup(response.text, "html.parser")
            posting_body = soup.find('section', {'id': 'postingbody'})
            description = posting_body.text.replace("\n", " ").replace(",", "").strip()
            # Bug fix: the original used str.strip with a long string, which
            # strips a *character set* (mangling descriptions that begin or
            # end with any of those letters); remove the literal boilerplate
            # prefix instead.
            prefix = "QR Code Link to This Post"
            if description.startswith(prefix):
                description = description[len(prefix):].strip()

            # Join matches with a trailing "/" per entry, as the original did.
            phone_numbers = "".join(p + "/" for p in phone_re.findall(description))
            emails = "".join(e + "/" for e in email_re.findall(description))

            to_write = ",".join([listing_id, name,
                                 phone_numbers.replace(",", ""),
                                 emails.replace(",", ""),
                                 description, url, date_time, last_update,
                                 price, location, geolocation]) + "\n"
            write_to_file.write(to_write)
            print(to_write)
        write_to_file.close()
def search_and_write_to_csv():
    """Search craigslist around each subway station's postal code and write
    the results to CSV, one search per station.

    The station lists default to the postal codes of the underground
    stations on the green and yellow lines.
    """
    # West to east starting Dundas West.
    green_line = {
        "DUNDAS WEST": "M6P 1W7",
        "DUFFERIN": "M6H 4E6",
        "CHRISTIE": "M6G 3B1",
        "BAY": "M5R 3N7"
    }

    # North east to south to north west.
    yellow_line = {
        "LAWRENCE": "M4N 1S1",
        "EGLINTON": "M4S 2B8",
        "DAVISVILLE": "M4S 1Z2",
        "ST CLAIR": "M4T 1J8",
        "SUMMERHILL": "M4T 1W2",
        "ROSEDALE": "M4W 1T1",
        "BLOOR-YONGE": "M4W 1A8",
        "WELLESLEY": "M4Y 1G3",
        "COLLEGE": "M5B 1L2",
        "DUNDAS": "M5G 1Z3",
        "QUEEN": "M5C 2X9",
        "KING": "M5H 1A1",
        "UNION": "M5J 1E6",
        "ST ANDREW": "M5H 3T4",
        "OSGOODE": "M5H 3E5",
        "ST PATRICK": "M5G 1V1",
        "QUEENS PARK": "M5G 1X7",
        "MUSEUM": "M5S 2C5",
        "ST GEORGE": "M5R 2L8",
        "SPADINA": "M5R 2T6",
        "DUPONT": "M5R 1V7",
        "ST CLAIR WEST": "M5P 3N3"
    }

    # Search parameters shared by every station query.
    search_distance = 1.5
    max_price = 2500

    for subway_line in [green_line, yellow_line]:
        print("Processing", subway_line)
        for station, zip_code in subway_line.items():
            cl_h = CraigslistHousing(site='toronto',
                                     area='tor',
                                     category='apa',
                                     filters={
                                         'zip_code': zip_code,
                                         'search_distance': search_distance,
                                         'posted_today': True,
                                         'has_image': True,
                                         'max_price': max_price
                                     })
            listings = cl_h.get_results(sort_by='newest', geotagged=True)
            write_results_of_search_to_csv(listings, station)
    def get_apt_results(self,
                        zip_code='01923',
                        radius=20,
                        max_price=1600,
                        sub_category=None,
                        overwrite=False):
        """Scrape apartment ads around ``zip_code`` and persist them to
        ``Apartments_<site>Craigslist.csv`` under ``.\\Data``.

        When the CSV already exists and ``overwrite`` is False, only ads newer
        than the file's most recent entry are appended.
        """
        # Bug fix: with the default sub_category=None the original evaluated
        # None + '/aap' and raised TypeError.
        category = sub_category + '/aap' if sub_category else 'aap'
        cl = CraigslistHousing(site=self.site.lower(),
                               category=category,
                               filters={
                                   'zip_code': zip_code,
                                   'search_distance': radius,
                                   'min_price': 500,
                                   'max_price': max_price
                               })
        results = cl.get_results()

        # If data file already exists, only update it with new data
        # (grab the latest stored date so we can stop early below).
        fname = 'Apartments_' + self.site + 'Craigslist.csv'
        if not overwrite and os.path.isfile(".\\Data\\" + fname):
            with open(".\\Data\\" + fname) as f:
                self.last_update = f.readlines()[1].split(',')[2]
                print("Grabbing data after " + self.last_update)

        def get_attr(ad, attr):
            # Missing attributes become empty strings.
            # Bug fix: narrowed the original bare except to KeyError.
            try:
                return ad[attr]
            except KeyError:
                return ''

        ads_info = []
        for result in results:
            print(len(ads_info))  # Some indication of progress
            ad_info = {
                'Title': get_attr(result, 'name'),
                'Area': get_attr(result, 'area'),
                'Bedrooms': get_attr(result, 'bedrooms'),
                'Link': get_attr(result, 'url'),
                'Price': get_attr(result, 'price'),
                'Location': get_attr(result, 'geotag'),
                'Date': get_attr(result, 'datetime'),
            }

            # NOTE(review): assumes self.last_update is initialised elsewhere
            # (e.g. __init__) when no CSV exists — confirm.
            if self.last_update:
                if dt.strptime(ad_info['Date'],
                               "%Y-%m-%d %H:%M") <= dt.strptime(
                                   self.last_update, "%Y-%m-%d %H:%M:%S"):
                    # Results are sorted by date, so everything past this
                    # point is already stored — stop.
                    break

            ads_info.append(ad_info)

        # Save data to csv file
        if len(ads_info) > 0:
            if os.path.isfile(".\\Data\\" + fname) and not overwrite:
                temp_df = pd.read_csv(".\\Data\\" + fname)
                temp_df = temp_df.append(ads_info)
                write_to_csv(temp_df, fname)
            else:
                write_to_csv(ads_info, fname)
    def post(self, request, *args, **kwargs):
        """Return the 5 newest listings for the requested city below
        ``max_price`` as a JSON array."""
        data = request.data
        city = data['city'].lower()
        max_price = int(data['max_price'])
        cl = CraigslistHousing(site=city,
                               category='apa',
                               filters={'max_price': max_price})
        # Bug fix: get_results returns a generator, which JsonResponse cannot
        # serialize; materialize it, and pass safe=False because the payload
        # is a list, not a dict.
        results = list(cl.get_results(sort_by='newest', geotagged=True,
                                      limit=5))
        return JsonResponse(results, safe=False)
def get_craigslist():
    """Print the newest Bend craigslist rentals, hiding listings whose
    location or title mentions an excluded town; updates the global
    found/matched counters."""
    global craigslist_found, craigslist_matched

    # Towns to exclude, checked against both 'where' and 'name'.
    # (Deduplicated from the two identical inline copies in the original.)
    excluded = [
        "prineville", "la pine", "redmond", "john day", "chemult",
        "crescent lake"
    ]

    print("\n")
    print("CRAIGSLIST RESULTS")
    print("================================")

    cl_h = CraigslistHousing(site='bend',
                             category='apa',
                             filters={'max_price': max_price})
    for result in cl_h.get_results(sort_by='newest', geotagged=True, limit=15):
        craigslist_found += 1
        show = True
        for field in ("where", "name"):
            if field in result:
                try:
                    text = result[field].lower()
                except AttributeError:
                    # The field may be None.
                    continue
                if any(loc in text for loc in excluded):
                    show = False
                    break

        if show:
            craigslist_matched += 1
            keys = [
                "datetime", "price", "name", "where", "bedrooms", "area", "url"
            ]
            for key in keys:
                if key in result:
                    print("{}: {}".format(key.upper(), result[key]))

        # The separator is printed for every listing, matched or not
        # (preserved from the original).
        print("\n")
        print("__________________________________")

    print("Listings Found: {}".format(craigslist_found))
    print("Listings Matched: {}".format(craigslist_matched))
Exemple #13
0
def scrape():
    """Scrape the 20 newest SF apartment listings, persist unseen ones whose
    area can be determined, and announce them in Slack."""
    sc = SlackClient(private.SLACK_TOKEN)
    cl = CraigslistHousing(site='sfbay', area='sfc', category='apa',
                           filters={'max_price': settings.MAX_PRICE})

    results = cl.get_results(sort_by='newest', geotagged=True, limit=20)
    for result in results:
        # Skip listings we've already posted.
        listing = session.query(Listing).filter_by(cl_id=result['id']).first()
        if listing is not None:
            continue

        # If there is no string identifying which neighborhood the result is
        # from, skip it.
        location = result["where"]
        if location is None:
            continue

        # Try the bounding boxes first (the last matching box wins, as in
        # the original); fall back to a substring match on the 'where' text.
        area_found = False
        area = ""
        geotag = result["geotag"]
        if geotag is not None:
            for box_name, coords in settings.BOXES.items():
                if in_box(geotag, coords):
                    area = box_name
                    area_found = True

        # Idiom fix: `not area_found` instead of `area_found == False`.
        if not area_found:
            for hood in settings.NEIGHBORHOODS:
                if hood in location.lower():
                    area = hood
                    area_found = True

        if not area_found:
            continue

        # Save the listing so we don't grab it again.
        new_listing = Listing(
            link=result["url"],
            cl_id=result["id"]
        )
        session.add(new_listing)
        session.commit()

        # Post to slack channel
        desc = "{0} | {1} | {2} | {3}".format(area, result["price"],
                                              result["name"], result["url"])
        sc.api_call(
            "chat.postMessage", channel=settings.SLACK_CHANNEL, text=desc,
            username="******", icon_emoji=":robot_face:"
        )
Exemple #14
0
def add_rooms(loc):
    """Scrape San Diego private-room listings and persist any that are not
    already in the database; ``loc`` supplies the state/city stored on each
    record. Listings without coordinates are skipped."""
    cl_rooms = CraigslistHousing(site="sandiego",
                                 filters={
                                     'private_room': True,
                                     'min_price': 25,
                                     'max_price': 3500
                                 })
    cl_rooms.set_logger(DEBUG)
    listings = cl_rooms.get_results(limit=CL_RESULTS,
                                    geotagged=True,
                                    include_details=True)
    for count, listing in enumerate(listings):
        if count % 100 == 0:
            print('{}th room'.format(count))
        record = RentalRoom()

        # Skip listings we have already stored.
        existing = session.query(RentalRoom).filter(
            RentalRoom.cl_id == listing['id']).first()
        if existing is not None:
            print("found existing place")
            continue

        record.cl_id = listing['id']
        record.repost_of_id = listing['repost_of']
        record.url = listing['url']
        record.date_updated = datetime.strptime(listing['last_updated'],
                                                CL_DATE_FORMAT)
        # Price arrives like '$1,200' — strip symbol and separators.
        record.price = int(listing['price'].replace('$', '').replace(',', ''))
        record.state = loc['state']
        record.metro = loc['city']
        if listing.get('area'):
            record.sqft = listing['area'].replace('ft2', '')
        record.named_location = listing['where']
        # No coordinates — not usable; drop the listing entirely.
        if not listing.get('geotag'):
            continue
        record.coords = str(listing['geotag'][0]) + ',' + str(
            listing['geotag'][1])
        record.housing_type = listing['house_type']
        record.laundry_type = listing['laundry_type']
        record.parking_type = listing['parking_type']
        record.furnished = listing['furnished']
        record.cats_allowed = listing['cats_ok']
        record.dogs_allowed = listing['dogs_ok']
        record.title = listing['name']
        record.details = listing['body']

        session.add(record)
        session.commit()
Exemple #15
0
def extract(site, category, today=False):
    """Scrape image-bearing, duplicate-bundled listings (details included)
    for ``site``/``category`` and cache them to ``cache.json``.

    :param today: when True, restrict the search to today's postings.
    """
    cl_h = CraigslistHousing(
        site=site,
        category=category,
        filters=dict(posted_today=today,
                     has_image=True,
                     bundle_duplicates=True),
    )
    results = cl_h.get_results(sort_by="newest",
                               geotagged=True,
                               include_details=True)
    # Idiom fix: list() materializes the generator directly; the original
    # identity comprehension [i for i in tqdm(results)] was a manual copy.
    # tqdm just renders progress while the generator is consumed.
    results = list(tqdm(results))
    with open("cache.json", "w") as f:
        json.dump(results, f)
Exemple #16
0
def scrape_craigslist_housing():
    """Scrape the 20 newest listings for the configured site/area/category,
    persist any not already in the database, and return the new Listing
    objects."""
    listings = []
    craigslist_housing = CraigslistHousing(
        site=CONFIG.SITE,
        area=CONFIG.AREA,
        category=CONFIG.CATEGORY,
        filters=CONFIG.FILTERS,
    )

    results = craigslist_housing.get_results(sort_by="newest",
                                             geotagged=True,
                                             limit=20)

    for result in results:
        logging.info(f'{time.ctime()}: Processing cl_id={result["id"]}')
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        if listing:
            logging.info(f"{time.ctime()}: cl_id={result['id']} Already in db")
            continue

        # Bug fix: .get's default only covers a *missing* key, but the geotag
        # key can be present with a None value, which made the tuple unpack
        # raise TypeError. `or` covers both cases.
        lat, lon = result.get("geotag") or (None, None)

        listing = Listing(
            cl_id=result["id"],
            cl_site=CONFIG.SITE,
            cl_area=CONFIG.AREA,
            cl_category=CONFIG.CATEGORY,
            url=result["url"],
            name=result["name"],
            price=to_numeric(result.get("price", "").replace("$", ""), float),
            area=to_numeric(
                str(result.get("area", "")).replace("ft2", ""), float),
            bedrooms=result["bedrooms"],
            location=result["where"],
            geotag=f"({lat},{lon})",
            lat=to_numeric(lat, float),
            lon=to_numeric(lon, float),
            has_image=result["has_image"],
            has_map=result["has_map"],
            created=parse(result["datetime"]),
        )

        logging.info(f"{time.ctime()}: Saving cl_id={listing.cl_id}")
        session.add(listing)
        session.commit()
        listings.append(listing)
    return listings
Exemple #17
0
def search_craigslist_for_houses(zipcode, openlinks, email, printheader):
    """Search Philadelphia craigslist near ``zipcode`` for 2+ bedroom,
    <=$2700, >=1200 ft2, cat-friendly places matching 'parking'; print the
    new results and optionally open and/or email them.

    :param openlinks: open each new listing in the web browser.
    :param email: email the results when any were found.
    :param printheader: print a title line before the results.
    """
    msg = []
    # Bug fix: header was only assigned when printheader was True, so calling
    # with email=True and printheader=False raised UnboundLocalError below.
    header = str(zipcode) + ' Apartments/Houses'

    # Pull data based on parameters
    house_1 = CraigslistHousing(site='philadelphia',
                                category='housing',
                                filters={
                                    'zip_code': zipcode,
                                    'search_distance': 1,
                                    'min_bedrooms': 2,
                                    'min_price': 0,
                                    'max_price': 2700,
                                    'min_ft2': 1200,
                                    'cats_ok': True,
                                    'query': 'parking'
                                })
    # Print title if desired (idiom fix: truthiness instead of == True).
    if printheader:
        print(header)

    # Loop through the results, skipping anything already seen.
    for x in house_1.get_results():
        if x['url'] not in already_checked:
            line = 'Posted: {} Price: {} Link: {}'.format(
                x['datetime'], x['price'], x['url'])
            msg.append(line)
            email_msg.append(line)
            # Hold the URL so later searches can filter against
            # the already_checked list.
            temp.append(x['url'])

            # Open urls in webbrowser if desired
            if openlinks:
                webbrowser.open(x['url'])

    # Send email with search results if desired and search results exist
    if email and msg:
        send_email('\n'.join(msg), '*****@*****.**',
                   ['*****@*****.**'], header)

    # Print out the search results to the console
    for line in msg:
        print(line)
Exemple #18
0
def craigs_list_api_call():
    """Return every newest-first Toronto (tor) apartment listing, geotagged.

    Equivalent search URL: https://toronto.craigslist.org/search/tor/apa?
    """
    cl_tor_housing = CraigslistHousing(site='toronto',
                                       area='tor',
                                       category='apa',
                                       filters={'bundle_duplicates': 1})

    # geotagged=True adds (lat, lng) under the 'geotag' key; this makes the
    # scrape a little slower per listing.
    craiglist_housing = list(
        cl_tor_housing.get_results(sort_by='newest', geotagged=True))
    print("Finished craigs_list_api_call")
    return craiglist_housing
Exemple #19
0
def get_data(site="newyork",
             area=None,
             category="abo",
             limit=25,
             geotagged=True):
    """Scrape the newest listings and return them as a pandas DataFrame,
    indexed by listing id, with an 'area' column tagging the search scope."""
    cl = CraigslistHousing(site=site, area=area, category=category)
    listings = cl.get_results(sort_by='newest',
                              limit=limit,
                              geotagged=geotagged)

    frame = pd.DataFrame(listings)
    # Index by listing id (the column itself is kept).
    frame.index = frame["id"]

    # Tag rows with site plus area when one was given, otherwise just site.
    frame["area"] = site + area if area else site
    return frame
 def getHousingPosts(self, limit=None):
     """Fetch up to ``limit`` (default ``self.limit``) private-room listings
     under $1200 for ``self.site`` and append them to ``self.recs`` as
     'housing' records; autosaves when ``self.autoSave`` is set."""
     site = self.site
     # Bug fix: identity comparison for None (was `limit == None`).
     if limit is None:
         limit = self.limit
     cl_h = CraigslistHousing(site=site, category='roo',
                              filters={'max_price': 1200, 'private_room': True})
     for result in cl_h.get_results(sort_by='newest', limit=limit, geotagged=True):
         # Bug fix: the Python-2-only `print "..."` statement is a syntax
         # error on Python 3; single-argument print(...) works on both.
         if 'geotag' not in result:
             print("***** Missing geotag")
             continue
         rec = dict(result)
         rec['recType'] = 'housing'
         self.recs.append(rec)
     if self.autoSave:
         self.save()
Exemple #21
0
def scrape_housing():
    """Scrape the newest North Vancouver rentals, drop listings containing
    excluded terms, store new ones, and post the keepers to Slack."""
    # Scrape Craigslist for listings.
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area='nvn',
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'min_price': settings.MIN_PRICE_RENT,
                                      'max_price': settings.MAX_PRICE_RENT})

    results = list(cl_h.get_results(sort_by='newest', geotagged=True,
                                    limit=settings.LIMIT,
                                    include_details=True))

    # Filter scraped results for excluded terms.
    good_listings = []
    new_count = 0
    for result in results:
        if any(term in result['body'].lower()
               for term in private.EXCLUDED_TERMS):
            continue

        listing = session.query(Listing).filter_by(
            cl_id=result["id"]).first()
        # Don't store the listing if it already exists.
        if listing is None:
            good_listings.append(result)
            listing = Listing(
                cl_id=result['id'],
                link=result['url'],
                created=parse(result['datetime']),
                name=result['name'],
                # Bug fix: prices like '$2,950' broke float(); strip the '$'
                # and thousands separators before converting.
                price=f"${format(float(result['price'].lstrip('$').replace(',', '')), ',.0f')} CAD",
                location=result['where'],
                sqft=result['area'],
                body=result['body']
            )
            new_count += 1

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()
    # Bug fix: the original message said the count was of listings that
    # "contained excluded terms", but it actually counts new listings that
    # passed the exclusion filter.
    print(f'{time.ctime()}: Found {new_count} new listings.')

    # Create slack client.
    sc = SlackClient(settings.SLACK_TOKEN)

    # Post each result to Slack.
    for listing in good_listings:
        post_listing_to_slack(sc, listing)
Exemple #22
0
    def getListings(self, category='apa'):
        """
        Fetches the listings from craigslist using the settings defined in
        settings.py and the given category string (see craigslist categories).
        The craigslist site comes from ``settings.CRAIGSLIST_SITE``.
        :param category: craigslist category code, e.g. 'apa'
        :return: generator of craigslist postings matching the criteria
        """
        search_filters = {
            'max_price': settings.MAX_PRICE,
            'search_distance': settings.MILES_RADIUS,
            'zip_code': settings.ZIP_CODE,
        }
        clh = CraigslistHousing(site=settings.CRAIGSLIST_SITE,
                                category=category,
                                filters=search_filters)

        # Only the single newest posting is needed for now.
        # TODO get more postings on first run?
        return clh.get_results(sort_by='newest', geotagged=True, limit=1)
def lambda_handler(event, context):
    """AWS Lambda entry point: scrape the newest NYC listings and upload the
    first result as a timestamped JSON file to the ``lazyapartment`` bucket."""
    # Connect to craigslist
    cl = CraigslistHousing(site='newyork', area=None, category='aap')

    # Pull data from Craigslist
    results = cl.get_results(sort_by='newest', geotagged=True, limit=200)
    resultsList = list(results)

    # Robustness fix: avoid IndexError when the scrape returns nothing.
    if not resultsList:
        return

    # Convert data to json.
    # NOTE(review): only the first of up to 200 results is exported — this
    # preserves the original behavior, but it may have been intended to dump
    # the whole list; confirm with the author.
    data = json.dumps(resultsList[0])

    # Get the current datetime for the file name
    now = str(datetime.today())

    # Export the data
    client = boto3.client('s3')
    response = client.put_object(Bucket='lazyapartment',
                                 Body=data,
                                 Key='rawdata/{}.json'.format(now))
Exemple #24
0
def scrape_for_apartments():
    """Scrape craigslist for apartment listings matching the filters in
    settings.py, store new matches in the DB and text them via Twilio.

    The counter is incremented for every unseen listing (matched or not)
    and texting stops once it reaches 10, capping the work per run.
    """
    # Query craigslist with the configured site/area/category and filters.
    cl_h = CraigslistHousing(site=settings.CL_SITE, area=settings.CL_AREA, category=settings.CL_CATEGORY,
                             filters={'bundle_duplicates': True,
                                      'posted_today': settings.POSTED_TODAY,
                                      'min_bedrooms': settings.MIN_NUM_BEDROOMS,
                                      'max_bedrooms': settings.MAX_NUM_BEDROOMS,
                                      # 'cats_ok': settings.CATS_OK,
                                      'max_price': settings.MAX_PRICE,
                                      'min_price': settings.MIN_PRICE,
                                      'laundry': settings.LAUNDRY_OPTIONS,
                                      # 'parking': settings.PARKING_OPTIONS,
                                      # 'housing_type': settings.HOUSING_TYPE,
                                      })
    # Create the Twilio client once instead of once per message.
    client = Client(settings.ACCOUNT_SID, settings.AUTH_TOKEN)
    # Counter limits the amount of results that can be sent at one time.
    counter = 0
    for result in cl_h.get_results(sort_by='newest', geotagged=True):
        # Skip listings we have already recorded.
        if check_for_record(result):
            continue
        counter += 1
        geotag = result["geotag"]
        # Try to resolve the listing's area from its geotag first.
        area = ""
        for a, coords in settings.AREAS.items():
            if geotag is not None and in_area(geotag, coords):
                area = a
        # Couldn't find from geotag: string-search the listing's "where"
        # field against the configured neighborhood names.
        if area == "":
            for hood in settings.NEIGHBORHOODS:
                if result["where"] is not None and hood in result["where"].lower():
                    area = hood
        # Store and notify, capped at 10 new listings per run.
        if area != '' and counter < 10:
            store_in_db(result)
            text = "{} per month in {}.\n {}".format(result['price'], result['where'], result["url"])
            message = client.messages.create(
                            messaging_service_sid=settings.MS_SID,
                            body=text,
                            to=settings.TARGET_PHONE_NUMBER)
Exemple #25
0
class LocationFinder:
    """Searches Cleveland craigslist apartment listings and collects the
    first few results whose posting page exposes a street address."""

    def __init__(self, price, zip_code, query):
        # Craigslist housing search limited by max price, zip code and a
        # free-text query.
        self.cl_h = CraigslistHousing(site='cleveland', area='', category='apa', filters={'max_price': price, 'zip_code': zip_code, 'query': query})

    def find_addresses(self):
        """Return up to 3 listings (newest first) that have a map address.

        Each entry is a dict with 'address', 'price', 'name', 'url' and a
        fake contact 'number'.
        """
        # Python 3: urllib.urlopen no longer exists; use urllib.request.
        from urllib.request import urlopen

        found = 0
        locations = []
        for result in self.cl_h.get_results(sort_by='newest', geotagged=True):
            page = urlopen(result['url']).read()
            soup = BeautifulSoup(page, "html.parser")
            address_tag = soup.find_all("div", class_="mapaddress")
            # Skip postings that do not publish a street address.
            if address_tag:
                found += 1
                print('--------------------')
                print(address_tag[0].text)
                print(result['price'])
                print(result['name'])
                print(result['url'])
                print('--------------------\n')
                locations.append({
                    'address': address_tag[0].text,
                    'price': result['price'],
                    'name': result['name'],
                    'url': result['url'],
                    'number': self.get_phone(),
                })
            if found == 3:
                break
        return locations

    def get_phone(self):
        """Return a fake phone number formatted ``(AAA)-FFF-LLLL``, or an
        empty string roughly half of the time."""
        exchange = randint(111, 950)
        line = randint(1001, 9899)
        area_code = randint(100, 999)
        num = "(" + str(area_code) + ")-" + str(exchange) + "-" + str(line)
        # Coin flip: simulate listings with no phone number available.
        if randint(0, 1) == 0:
            return ""
        return num
Exemple #26
0
def scrape_craigslist(max_price=10000, min_price=1000, limit=None):
    """Scrape today's private-room SF apartment postings from craigslist.

    :param max_price: maximum listing price filter.
    :param min_price: minimum listing price filter.
    :param limit: optional cap on the number of results fetched.
    :return: list of ApartmentListing objects with at most 2 bedrooms.
    """
    cl = CraigslistHousing(site='sfbay',
                           area='sfc',
                           category='apa',
                           filters={
                               'max_price': max_price,
                               'min_price': min_price,
                               'private_room': True,
                               'posted_today': True
                           })
    listings = []
    for result in cl.get_results(sort_by='newest', geotagged=True,
                                 limit=limit):
        # A missing bedroom count is treated as a studio (0 bedrooms).
        # (Removed dead assignment `location = result['bedrooms']` — it was
        # unused and read the wrong field.)
        bedrooms = int(
            result['bedrooms']) if result['bedrooms'] is not None else 0
        # filter for only studios or 1 bedrooms or 2 bedrooms
        if bedrooms > 2:
            continue
        listings.append(ApartmentListing.from_dict(result))
    return listings
Exemple #27
0
    def get_rental_comps_craigslist(self, address, city, zipcode, limit, bd,
                                    ba, sqft):
        """Estimate market rent for a property from nearby craigslist comps.

        Searches the city's craigslist housing section for listings with the
        same bed/bath counts and similar square footage (+/- 300 sqft)
        within ``limit`` miles, and averages their prices.

        :param address: street address of the subject property.
        :param city: city name, lowercased and used as the craigslist site.
        :param zipcode: zip code to center the search on.
        :param limit: search radius in miles (also the max comp distance).
        :param bd: bedroom count to match exactly.
        :param ba: bathroom count to match exactly.
        :param sqft: square footage of the subject property.
        :return: average comp rent as a float, or None when inputs are
            missing, geocoding fails, scraping errors out, or fewer than
            two comps are found.
        """
        # Guard: all lookup inputs are required.
        if not address or not city or not zipcode or not limit:
            return None

        rents = []
        geocode = get_geocode_from_address(address)

        if not geocode or not bd or not ba or not sqft:
            return None

        try:
            cl_h = CraigslistHousing(
                site=city.lower(),
                category="apa",
                filters={
                    'zip_code': zipcode,
                    'search_distance': limit,
                    'min_bedrooms': bd,
                    'max_bedrooms': bd,
                    "min_bathrooms": ba,
                    "max_bathrooms": ba,
                    "min_ft2": max(0, sqft - 300),
                    "max_ft2": sqft + 300,
                    'housing_type':
                    ['apartment', 'condo', 'house', 'townhouse']
                })

            for result in cl_h.get_results(geotagged=True):
                dist = get_distance_bw_geocodes(geocode, result["geotag"])

                if dist < limit:
                    # Strip non-digits from the price ("$1,500" -> 1500).
                    rents.append(int(re.sub("[^0-9]", "", result["price"])))
        except Exception:
            # Scraping/parsing is best-effort: any failure means "no data".
            # (Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate.)
            return None

        # Require at least two comps before averaging.
        if len(rents) > 1:
            return sum(rents) / float(len(rents))

        return None
Exemple #28
0
def main_features(site, area, category, sort_by, limit, geotagged):
    """Scrape craigslist housing listings into a pandas DataFrame.

    :param site: craigslist site name (e.g. 'sfbay').
    :param area: sub-area within the site.
    :param category: craigslist category code (e.g. 'apa').
    :param sort_by: sort order for get_results (e.g. 'newest').
    :param limit: maximum number of listings to fetch.
    :param geotagged: whether to resolve a geotag for each listing.
    :return: DataFrame with one row per listing; 'price' is numeric and
        listings without a geotag get latitude/longitude of 0.0.
    """
    # Use Craigslist package
    cl = CraigslistHousing(site=site, area=area, category=category)
    results = cl.get_results(sort_by=sort_by, geotagged=geotagged, limit=limit)

    # Column-oriented accumulator for the DataFrame.
    df = {
        'id': [],
        'repost_of': [],
        'name': [],
        'url': [],
        'datetime': [],
        'last_updated': [],
        'price': [],
        'where_': [],
        'has_image': [],
        'latitude': [],
        'longitude': []
    }

    for result in results:
        df['id'].append(result['id'])
        df['repost_of'].append(result['repost_of'])
        df['name'].append(result['name'])
        df['url'].append(result['url'])
        df['datetime'].append(result['datetime'])
        df['last_updated'].append(result['last_updated'])
        # Drop the leading '$' from the price string; thousands separators
        # are stripped after the DataFrame is built.
        df['price'].append(result['price'][1:])
        df['where_'].append(result['where'])
        df['has_image'].append(result['has_image'])
        # Use `is None` (identity), not `== None`, per PEP 8.
        if result['geotag'] is None:
            # No geotag: use (0.0, 0.0) as a sentinel location.
            df['latitude'].append(0.0)
            df['longitude'].append(0.0)
        else:
            df['latitude'].append(result['geotag'][0])
            df['longitude'].append(result['geotag'][1])

    df = pd.DataFrame(df)
    df['price'] = pd.to_numeric(df['price'].str.replace(',', ''))
    return df
 def fetch_craigslist_data(self):
     """Pull apartment listings from craigslist using the configured site,
     area, price cap and bedroom count, then run them through self.filter
     and return the filtered list."""
     housing = CraigslistHousing(site=self.config["craigslist_site"],
                                 area=self.config["craigslist_area"],
                                 category="apa",
                                 filters={
                                     "max_price": self.config["max_price"],
                                     "min_bedrooms": self.config["bed"],
                                     "max_bedrooms": self.config["bed"]
                                 })
     listings = housing.get_results(sort_by='newest', geotagged=True, limit=3000)
     bed_count = self.config["bed"]
     # Project each raw craigslist posting down to the fields we keep.
     apts = [
         {
             "loc": listing["geotag"],
             "name": listing["name"],
             "url": listing["url"],
             "price": listing["price"],
             "neigh": listing["where"],
             "bed": bed_count
         }
         for listing in listings
     ]
     return self.filter(apts)
    def getApartmentData(self):
        """Scrape the 5 newest craigslist listings for the configured
        site/area, enrich them with Mapquest and WalkScore data, and write
        the result to test.csv (indexed by listing id)."""

        # Connect to craigslist. (The original if/else constructed the exact
        # same object in both branches, so a single call suffices; area may
        # be None/empty for a site-wide search.)
        cl = CraigslistHousing(site=self.craigslistSite,
                               area=self.craigslistArea,
                               category='aap')

        # Pull data from Craigslist
        results = cl.get_results(sort_by='newest', geotagged=True, limit=5)
        resultsList = [result for result in results]
        df = pd.DataFrame(resultsList)

        # Split latitude and longitude
        df['latitude'] = df['geotag'].apply(lambda x: x[0]
                                            if type(x) == tuple else None)
        df['longitude'] = df['geotag'].apply(lambda x: x[1]
                                             if type(x) == tuple else None)

        # Clean up money: strip '$' and ',' literally. regex=False is
        # required — under pandas' old regex=True default, '$' is an
        # end-of-string anchor and the dollar sign was never removed.
        df['price'] = (df['price'].str.replace('$', '', regex=False)
                                  .str.replace(',', '', regex=False))

        # Enrich the data with Mapquest and Walkscore data
        df = enrichMapquestData(df)
        df = enrichWalkScore(df)

        # Remove duplicates
        df.drop_duplicates(keep='last', inplace=True)
        # df.drop('geotag', axis=1, inplace=True)

        # Set index
        df = df.set_index('id')

        # Export the data
        df.to_csv('test.csv')
 def find_housing(self, price='2500', location='', cat='hhh', private=True):
     """Search SF-bay craigslist housing and build Slack attachment payloads
     for listings not seen before.

     :param price: maximum monthly price filter.
     :param location: craigslist sub-area ('' searches the whole site).
     :param cat: craigslist category code (default 'hhh' = all housing).
     :param private: restrict results to private rooms.
     :return: list of up to 26 Slack message payload dicts.
     """
     rentals = CraigslistHousing(site='sfbay',
                                 area=location,
                                 category=cat,
                                 filters={
                                     'max_price': price,
                                     'private_room': private
                                 })
     houses = rentals.get_results(sort_by='newest', geotagged=True)
     count = 0
     responses = []
     for house in houses:
         res_map = {
             "name": house['name'] if house['name'] else '',
             "url": house['url'] if house['url'] else '',
             "price": house['price'] if house['price'] else '',
             "location": house['where'] if house['where'] else '',
         }
         # Extract the numeric listing id from the URL. (The original kept
         # the raw Python-3 `filter` object, so membership tests against the
         # int set never matched and int() on it raised TypeError.)
         rental_id = int(''.join(ch for ch in res_map['url'] if ch.isdigit()))
         if rental_id not in self.rental_ids:
             self.rental_ids.add(rental_id)
             bot_response = {
                 "attachments": [{
                     "fallback": "Craigslist SF",
                     "color": "#36a64f",
                     "title": res_map['name'],
                     "title_link": res_map['url'],
                     "text": res_map['price'],
                     "fields": [{
                         "title": res_map['location']
                     }],
                     "footer": "Craigslist"
                 }]
             }
             responses.append(bot_response)
             count += 1
             if count > 25: break
     return responses
Exemple #32
0
def scrape(site, area, category, min_price, max_price):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest
    listings.

    New listings are annotated with nearby points of interest, persisted via
    the module-level SQLAlchemy ``session``, and returned only when they pass
    the desirability criteria (shuttle walk time, neighborhood, image,
    ``desirable(url)``).

    :param site: craigslist site name (e.g. 'sfbay').
    :param area: sub-area within the site.
    :param category: craigslist category code.
    :param min_price: minimum price filter.
    :param max_price: maximum price filter.
    :return: A list of results.
    """

    results = []

    cl_h = CraigslistHousing(
        site=site,
        area=area,
        category=category,
        filters={'min_price': min_price, 'max_price': max_price}
    )

    # Fetch the 20 newest geotagged postings lazily.
    gen = cl_h.get_results(
        sort_by='newest',
        geotagged=True,
        limit=20
    )

    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Best-effort scraping: skip postings that fail to fetch/parse.
            continue

        listing = session.query(Listing).filter_by(cl_id=result['id']).first()

        # Don't store the listing if it already exists.
        if listing is None:
            if result['where'] is None:
                # If there is no string identifying which neighborhood the
                # result is from, skip it.
                continue

            # Annotate the result with information about the area it's in and
            # points of interest near it.
            # NOTE(review): presumably this adds the 'neighborhood',
            # 'transit_stop', 'shuttle_stop' and 'shuttle_walk_time' keys
            # read below — confirm against find_points_of_interest.
            result.update(
                find_points_of_interest(result['geotag'], result['where'])
            )

            lat = 0
            lon = 0
            if result['geotag'] is not None:
                # Assign the coordinates.
                lat = result['geotag'][0]
                lon = result['geotag'][1]

            # Try parsing the price.
            price = 0
            try:
                price = float(result['price'].replace('$', ''))
            except (TypeError, ValueError):
                pass

            # Create the listing object.
            listing = Listing(
                link=result['url'],
                created=parse(result['datetime']),
                geotag=str(result['geotag']),
                lat=lat,
                lon=lon,
                name=result['name'],
                price=price,
                location=result['where'],
                cl_id=result['id'],
                neighborhood=result['neighborhood'],
                transit_stop=result['transit_stop'],
                shuttle_stop=result['shuttle_stop']
            )

            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # Return the result if it's near a shuttle stop and in a
            # desired neighborhood. Adjust requirements to your liking.
            if (result['shuttle_walk_time'] < settings.MAX_SHUTTLE_WALK_TIME
                    and len(result['neighborhood']) > 0
                    and result['has_image']
                    and desirable(result['url'])):
                results.append(result)

    return results
Exemple #33
0
def scrape_area(area):
	"""
	Scrapes craigslist for certain geographic area, and finds latest listings.

	New listings are annotated with points of interest, saved via the
	module-level SQLAlchemy session, and returned when near a TTC stop or
	inside a configured area.
	:param area: craigslist sub-area within settings.CRAIGSLIST_SITE.
	:return: A list of results.
	"""
	cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area, category=settings.CRAIGSLIST_HOUSING_SECTION,
							filters={"max_price": settings.MAX_PRICE, "min_price": settings.MIN_PRICE})

	results = []
	#fetch the 20 newest geotagged postings lazily
	gen = cl_h.get_results(sort_by="newest", geotagged=True, limit=20)
	while True:
		try:
			result = next(gen)
		except StopIteration:
			break
		except Exception:
			#best-effort scraping: skip postings that fail to fetch/parse
			continue 
		listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

		#dont store listing if it already exists
		if listing is None:
			if result["where"] is None:
				#if there is no string identifying which neighborhood result is from skip it
				continue

			lat = 0
			lon = 0
			if result["geotag"] is not None:
				#assign coordinates
				lat = result["geotag"][0]
				lon = result["geotag"][1]

				#annotate result with info about area its in and points of interest near it
				geo_data = find_points_of_interest(result["geotag"], result["where"])
				result.update(geo_data)
			else:
				#no geotag: default annotations so the keys exist below
				result["area"] = ""
				result["ttc"] = ""

			#try parsing price
			price = 0
			try:
				price = float(result["price"].replace("$",""))
			except Exception:
				pass


			#create listing object
			listing = Listing(
				link=result["url"],
				created=parse(result["datetime"]),
				lat=lat,
				lon=lon,
				name=result["name"],
				price=price,
				location=result["where"],
				cl_id=result["id"],
				area=result["area"],
				ttc_stop=result["ttc"]
			)

			#save listing so we dont grab it again
			session.add(listing)
			session.commit()

			#return result if its near ttc station or if in area defined
			if len(result["ttc"]) > 0 or len(result["area"]) > 0:
				results.append(result)

	#BUGFIX: return moved out of the while loop — it previously ran at the
	#end of the first iteration, so only one posting was ever scanned
	#(compare the parallel scrape_area implementation in this file).
	return results
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.

    New listings are annotated with points of interest, persisted via the
    module-level SQLAlchemy ``session``, and returned when they are near an
    LRT stop.
    :param area: craigslist sub-area within settings.CRAIGSLIST_SITE.
    :return: A list of results.
    """
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area, category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'max_price': settings.MAX_PRICE, "min_price": settings.MIN_PRICE})

    results = []
    # Fetch the 20 newest geotagged postings lazily.
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Best-effort scraping: skip postings that fail to fetch/parse.
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is None:
            if result["where"] is None:
                # If there is no string identifying which neighborhood the result is from, skip it.
                continue

            lat = 0
            lon = 0
            if result["geotag"] is not None:
                # Assign the coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]

                # Annotate the result with information about the area it's in and points of interest near it.
                # NOTE(review): presumably geo_data supplies the "area" and
                # "lrt" keys read below — confirm in find_points_of_interest.
                geo_data = find_points_of_interest(result["geotag"], result["where"])
                result.update(geo_data)
            else:
                # No geotag: default annotations so the keys exist below.
                result["area"] = ""
                result["lrt"] = ""

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                lat=lat,
                lon=lon,
                name=result["name"],
                price=price,
                location=result["where"],
                cl_id=result["id"],
                area=result["area"],
                lrt_stop=result["lrt"]
            )

            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # Return the result if it's near a lrt station.
            if len(result["lrt"]) > 0:
                results.append(result)

    return results