Beispiel #1
0
def scrape_area(area):
    """
    Fetch the newest Craigslist housing posts for one area and record the unseen ones.

    Every post whose id is not yet in the ``Listing`` table is stored there, so
    it is only processed once across runs.

    :param area: Area value handed straight to ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty ``bart``
        or ``area`` value.
    """
    search_filters = {
        'max_price': settings.MAX_PRICE,
        "min_price": settings.MIN_PRICE,
        'bedrooms': settings.BEDROOMS,
        'min_ft2': settings.MIN_FT2,
        "bathrooms": settings.BATHROOMS,
    }
    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=search_filters,
    )

    matches = []
    posts = housing.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(posts)
        except StopIteration:
            break
        except Exception:
            # Any other failure while fetching a post is ignored; try the next one.
            continue

        # Posts already present in the database were handled on an earlier run.
        already_stored = session.query(Listing).filter_by(cl_id=result["id"]).first()
        if already_stored is not None:
            continue

        # A post without a neighborhood string is dropped without being stored.
        if result["where"] is None:
            continue

        geotag = result["geotag"]
        if geotag is None:
            # No coordinates: fall back to zeros and empty annotations.
            lat, lon = 0, 0
            result["area"] = ""
            result["bart"] = ""
        else:
            lat, lon = geotag[0], geotag[1]
            # Merge in whatever find_points_of_interest reports for this spot.
            result.update(find_points_of_interest(geotag, result["where"]))

        # Best-effort price parsing; anything unparsable is stored as 0.
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            bart_stop=result["bart"],
        )

        # Persist immediately so the same post is not picked up again.
        session.add(record)
        session.commit()

        # Only report posts tied to a bart stop or to one of the defined areas.
        near_bart = len(result["bart"]) > 0
        if near_bart or len(result["area"]) > 0:
            matches.append(result)

    return matches
Beispiel #2
0
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.

    Listings that are not yet in the database are saved so they are not
    processed again on a later run.

    :param area: Area value passed to ``CraigslistHousing``.
    :return: A list of the newly saved results that have a non-empty ``ttc``
        or ``area`` value. Always a list, possibly empty.
    """
    cl_h = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters={"max_price": settings.MAX_PRICE, "min_price": settings.MIN_PRICE},
    )

    results = []
    gen = cl_h.get_results(sort_by="newest", geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Skip a result that could not be fetched and try the next one.
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is not None:
            continue

        # If there is no string identifying which neighborhood the result is
        # from, skip it.
        if result["where"] is None:
            continue

        lat = 0
        lon = 0
        if result["geotag"] is not None:
            # Assign the coordinates.
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Annotate the result with information about the area it's in and
            # points of interest near it.
            geo_data = find_points_of_interest(result["geotag"], result["where"])
            result.update(geo_data)
        else:
            result["area"] = ""
            result["ttc"] = ""

        # Try parsing the price; an unparsable price is stored as 0.
        price = 0
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            pass

        # Create the listing object.
        listing = Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            ttc_stop=result["ttc"],
        )

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        # Keep the result if it's near a ttc station or in a defined area.
        if len(result["ttc"]) > 0 or len(result["area"]) > 0:
            results.append(result)

    # The return must sit outside the loop: inside it, the function would stop
    # after the first processed result and return None when the loop breaks.
    return results
def scrape_area(area):
    """
    Pull the latest Craigslist housing results for an area and store the new ones.

    :param area: Area value forwarded to ``CraigslistHousing``.
    :return: A list of the newly stored results whose ``lrt`` value is non-empty.
    """
    price_filters = {'max_price': settings.MAX_PRICE, "min_price": settings.MIN_PRICE}
    client = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=price_filters,
    )

    near_lrt = []
    stream = client.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(stream)
        except StopIteration:
            break
        except Exception:
            # A result that fails to load is skipped.
            continue

        # Anything already recorded in the database is left alone.
        if session.query(Listing).filter_by(cl_id=result["id"]).first() is not None:
            continue

        # Results lacking a neighborhood string are neither stored nor returned.
        if result["where"] is None:
            continue

        lat = lon = 0
        coords = result["geotag"]
        if coords is not None:
            lat = coords[0]
            lon = coords[1]
            # Add the annotations computed by find_points_of_interest.
            result.update(find_points_of_interest(coords, result["where"]))
        else:
            result["area"] = ""
            result["lrt"] = ""

        # The price falls back to 0 when it cannot be converted.
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            price = 0

        # Store the listing right away so it is not fetched again later.
        session.add(Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            lrt_stop=result["lrt"],
        ))
        session.commit()

        # Only results near an lrt station are handed back to the caller.
        if len(result["lrt"]) > 0:
            near_lrt.append(result)

    return near_lrt
Beispiel #4
0
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.

    Every listing that is not yet in the database is appended to the Google
    sheet and then saved, so it is not picked up again on a later run.

    :param area: Area value passed to ``CraigslistHousing``.
    :return: A list of dicts, one per new listing flagged ``should_include``.
    """
    # Get the google sheet object.
    sheet = google_sheets.open_sheet()

    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area, category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters=settings.FILTERS)

    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=1000)

    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            print('exception')
            continue

        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is not None:
            print("Skipping %s..."%result['name'])
            continue

        # NOTE: results without a "where" string are deliberately not skipped here.
        lat = 0
        lon = 0
        if result["geotag"] is not None:
            # Assign the coordinates.
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Annotate the result with information about the area it's in and points of interest near it.
            # NOTE(review): the stop/distance/walktime/address keys read below are
            # presumably supplied by find_points_of_interest - confirm.
            geo_data = find_points_of_interest(result["geotag"], result["where"])
            result.update(geo_data)
        else:
            # No coordinates: fill every annotation with a placeholder.
            result["area"] = ""
            result["google_stop"] = ""
            result["google_dist"] = ""
            result["fb_stop"] = ""
            result["fb_dist"] = ""
            result["fb_walktime"] = "Unknown"
            result["google_walktime"] = "Unknown"
            result["adi_drivetime"] = "Unknown"
            result["address"] = "Unknown"

        # Try parsing the price; an unparsable price is stored as 0.
        price = 0
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            pass

        # Include the result if it is within our area or if there is no location information.
        should_include = len(result["area"]) > 0 or lat == 0

        # Parse the timestamp once; it is used by both the row and the sheet record.
        created = parse(result["datetime"])

        # Create the listing object.
        listing = Listing(
            link=result["url"],
            created=created,
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            should_include=should_include,
        )

        # Record written to the sheet; details not available here use placeholders.
        result_to_return = {
            'link': result["url"],
            'created': created,
            'lat': lat,
            'lon': lon,
            'name': result["name"],
            'price': price,
            'location': result["where"],
            'cl_id': result["id"],
            'tagged_location': result["area"],
            'should_include': should_include,
            'available_date': "-",
            'bedrooms': 0,
            'bathrooms': 0,
            'amenities': "-",
            'sq_ft': 0,
            'google_stop': result['google_stop'],
            'google_dist': result['google_dist'],
            'fb_stop': result['fb_stop'],
            'fb_dist': result['fb_dist'],
            'fb_walktime': result['fb_walktime'],
            'google_walktime': result['google_walktime'],
            'adi_drivetime': result['adi_drivetime'],
            'address': result['address'],
        }

        print("Adding %s..."%result['name'])
        google_sheets.add_new_record(sheet, result_to_return)

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        if should_include:
            results.append(result_to_return)

    return results
Beispiel #5
0
def scrape_area(area):
    """
    Collect the newest Craigslist housing results for an area and store the unseen ones.

    Optional bedroom and neighborhood filters are added when the matching
    settings are truthy. A geotagged result is dropped without being stored
    when the walking duration reported for ``settings.SHUTTLE_STOPS`` exceeds
    ``settings.MAX_WALKING_TIME``.

    :param area: Area value forwarded to ``CraigslistHousing``.
    :return: A list of the newly stored results that have a non-empty ``bart``
        or ``area`` value.
    """
    search_filters = {'max_price': settings.MAX_PRICE, 'min_price': settings.MIN_PRICE}
    if settings.MIN_BEDROOMS:
        print('filtering by bedrooms:', settings.MIN_BEDROOMS)
        search_filters['bedrooms'] = settings.MIN_BEDROOMS
    if settings.NEIGHBORHOOD_CODE:
        print('filtering by neighborhood:', settings.NEIGHBORHOOD_CODE)
        search_filters['neighborhood_code'] = settings.NEIGHBORHOOD_CODE

    housing = CraigslistHousing(
        site=settings.CRAIGSLIST_SITE,
        area=area,
        category=settings.CRAIGSLIST_HOUSING_SECTION,
        filters=search_filters,
    )
    gmap = GoogleMapsClient()

    kept = []
    posts = housing.get_results(sort_by='newest', geotagged=True, limit=50)
    while True:
        try:
            result = next(posts)
        except StopIteration:
            break
        except Exception:
            # A post that fails to load is skipped.
            continue

        # Posts already in the database were handled on an earlier run.
        known = session.query(Listing).filter_by(cl_id=result["id"]).first()
        if known is not None:
            continue

        # Without a neighborhood string the post is ignored entirely.
        if result["where"] is None:
            continue

        if result["geotag"] is None:
            # No coordinates: zeros and empty annotations.
            lat = lon = 0
            result["area"] = ""
            result["bart"] = ""
        else:
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Merge in the annotations computed for this location.
            result.update(find_points_of_interest(result["geotag"], result["where"]))

            # Walking distances from the post to the configured destinations.
            origin = {'lat': result["geotag"][0], 'lng': result["geotag"][1]}
            nearest_shuttle = gmap.min_walking_dist(origin, settings.SHUTTLE_STOPS)
            if nearest_shuttle['duration']['value'] > settings.MAX_WALKING_TIME:
                # Too far from a shuttle stop: drop it before it is stored.
                continue
            nearest_business = gmap.min_walking_dist(origin, settings.BUSINESSES)
            result["bart_dist"] = 'apple: {}, business: {}'.format(
                gmap.pretty(nearest_shuttle),
                gmap.pretty(nearest_business),
            )

        # The price falls back to 0 when it cannot be converted.
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            price = 0

        record = Listing(
            link=result["url"],
            created=parse(result["datetime"]),
            lat=lat,
            lon=lon,
            name=result["name"],
            price=price,
            location=result["where"],
            cl_id=result["id"],
            area=result["area"],
            bart_stop=result["bart"],
        )

        # Persist now so the post is not fetched again.
        session.add(record)
        session.commit()

        # Report only posts tied to a bart stop or to a defined area.
        has_bart = len(result["bart"]) > 0
        if has_bart or len(result["area"]) > 0:
            kept.append(result)

    return kept
Beispiel #6
0
def scrape(site, area, category, min_price, max_price):
    """
    Fetch the newest Craigslist housing results and store the ones not seen before.

    :param site: Site value passed to ``CraigslistHousing``.
    :param area: Area value passed to ``CraigslistHousing``.
    :param category: Category value passed to ``CraigslistHousing``.
    :param min_price: Lower bound used for the ``min_price`` filter.
    :param max_price: Upper bound used for the ``max_price`` filter.
    :return: A list of the newly stored results that pass the shuttle walk
        time, neighborhood, image and ``desirable`` checks.
    """
    accepted = []

    price_range = {'min_price': min_price, 'max_price': max_price}
    housing = CraigslistHousing(site=site, area=area, category=category,
                                filters=price_range)
    posts = housing.get_results(sort_by='newest', geotagged=True, limit=20)

    while True:
        try:
            result = next(posts)
        except StopIteration:
            break
        except Exception:
            # A result that fails to load is skipped.
            continue

        # Results already stored in the database are not processed again.
        existing = session.query(Listing).filter_by(cl_id=result['id']).first()
        if existing is not None:
            continue

        # No neighborhood string: ignore the result.
        if result['where'] is None:
            continue

        # Annotate first; the coordinates are read from the result afterwards.
        annotations = find_points_of_interest(result['geotag'], result['where'])
        result.update(annotations)

        if result['geotag'] is None:
            lat, lon = 0, 0
        else:
            lat = result['geotag'][0]
            lon = result['geotag'][1]

        # Only conversion problems are tolerated; the price then stays 0.
        try:
            price = float(result['price'].replace('$', ''))
        except (TypeError, ValueError):
            price = 0

        record = Listing(
            link=result['url'],
            created=parse(result['datetime']),
            geotag=str(result['geotag']),
            lat=lat,
            lon=lon,
            name=result['name'],
            price=price,
            location=result['where'],
            cl_id=result['id'],
            neighborhood=result['neighborhood'],
            transit_stop=result['transit_stop'],
            shuttle_stop=result['shuttle_stop'],
        )

        # Persist now so the result is not fetched again.
        session.add(record)
        session.commit()

        # The checks below run in order and stop at the first failure, so
        # desirable() is only called for results that passed all the others.
        if not (result['shuttle_walk_time'] < settings.MAX_SHUTTLE_WALK_TIME):
            continue
        if not len(result['neighborhood']) > 0:
            continue
        if not result['has_image']:
            continue
        if desirable(result['url']):
            accepted.append(result)

    return accepted
Beispiel #7
0
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.

    Results are fetched without geotags; the geotag is requested only for
    listings that are not yet in the database and that report a map.

    :param area: Area value passed to ``CraigslistHousing``.
    :return: A list of the newly saved results that have a non-empty ``bart``
        or ``area`` value.
    """
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE,
                             area=area,
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={
                                 'max_price': settings.MAX_PRICE,
                                 "min_price": settings.MIN_PRICE
                             })

    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=False, limit=35)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            # Skip a result that could not be fetched and try the next one.
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is not None:
            continue

        # From here on we only handle listings that we haven't checked in the
        # past. The geotag is fetched now rather than up front for every
        # listing, because it requires an additional GET that is unnecessary
        # for listings we have already checked.
        if result['has_map']:
            cl_h.geotag_result(result)

        if result["where"] is None:
            # If there is no string identifying which neighborhood the result is from, skip it.
            continue

        lat = 0
        lon = 0
        # Defaults used when no geotag is available.
        result["bart_dist"] = 999
        result["walkscore"] = 0
        result["ws_link"] = "Not found!"

        if result["geotag"] is not None:
            # Assign the coordinates.
            lat = result["geotag"][0]
            lon = result["geotag"][1]

            # Annotate the result with information about the area it's in and points of interest near it.
            geo_data = find_points_of_interest(result["geotag"],
                                               result["where"])
            result.update(geo_data)

            result["walkscore"], result['ws_link'] = get_walk_score(
                result["geotag"])
        else:
            result["area"] = ""
            result["bart"] = ""

        # Try parsing the price; an unparsable price is stored as 0.
        price = 0
        try:
            price = float(result["price"].replace("$", ""))
        except Exception:
            pass

        if result["img_url"] is None:
            result["img_url"] = ""

        # Create the listing object.
        listing = Listing(link=result["url"],
                          created=parse(result["datetime"]),
                          lat=lat,
                          lon=lon,
                          name=result["name"],
                          price=price,
                          location=result["where"],
                          cl_id=result["id"],
                          area=result["area"],
                          bart_stop=result["bart"],
                          walkscore=result["walkscore"],
                          ws_link=result["ws_link"],
                          bart_dist=result["bart_dist"],
                          img_url=result["img_url"])

        # Save the listing so we don't grab it again.
        session.add(listing)
        session.commit()

        # Return the result if it's near a bart station, or if it is in an area we defined.
        if len(result["bart"]) > 0 or len(result["area"]) > 0:
            # Single-argument print(...) prints the same on Python 2 and 3.
            print(result["name"])
            print(result["walkscore"])
            print(result["ws_link"])
            results.append(result)

    return results
Beispiel #8
0
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.
    :param area:
    :return: A list of results.
    """
    cl_h = CraigslistHousing(site=settings.CRAIGSLIST_SITE, area=area, category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'max_price': settings.MAX_PRICE, 'min_price': settings.MIN_PRICE})

    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=30)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception, e:
            print str(e)
            continue
        #listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        # Check for listing
        listing = listing_collections.find_one({"id": result["id"]})

        # Don't store the listing if it already exists.
        if listing is None:
            if result["where"] is None:
                # If there is no string identifying which neighborhood the result is from, skip it.
                continue

            # Check for multiple bedrooms
            if result["name"]:
                _name = result["name"].lower()
                match = re.search(r'2 bed.*', _name)
                if not match:
                    continue

            lat = 0
            lon = 0
            if result["geotag"] is not None:
                # Assign the coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]

                # Annotate the result with information about the area it's in and points of interest near it.
                geo_data = find_points_of_interest(result["geotag"], result["where"])
                result.update(geo_data)
            elif result["where"]:
                coords_box = get_coordinates(result["where"])
                geo_data = find_points_of_interest(coords_box, result["where"])
                result.update(geo_data)
            else:
                result["area"] = ""

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            listing = {
                "link": result["url"],
                "created": parse(result["datetime"]),
                "lat": lat,
                "lon": lon,
                "name": result["name"],
                "price": price,
                "location": result["where"],
                "cl_id": result["id"],
                "area": result["area"]
            }

            # Insert into the collection...
            _listing_id = listing_collections.insert_one(listing)

            # If area is found return the results
            if len(result["area"]) > 0:
                results.append(result)