def query_craig(self):
    if self.verbose:
        print('# query_craig()')
    if self.veryverbose:
        print(self.info['filters'])
    cl_a = CraigslistForSale(site=self.info['site'],
                             area=self.info['area'],
                             category=self.info['category'],
                             filters=self.info['filters'])
    limit = 0
    for result in cl_a.get_results(sort_by='newest'):
        record_time = car_util.time_object(result['datetime'], '%Y-%m-%d %H:%M')
        clean_time = car_util.time_object(self.info['since-date'], '%Y-%m-%dT%H:%M:%SZ')
        if record_time > clean_time:
            xCar = car_info.car({'url': result['url']}).update_info()
            # print('x' + str(xCar.info))
            while xCar.error != '':
                print('Error ' + str(xCar.error) + ' ' + result['url'])
                if xCar.error == 408:  # Request Timeout: wait, then fetch again
                    print(' time problem, wait 3 and try again ')
                    time.sleep(3)
                    xCar = car_info.car({'url': result['url']}).update_info()
                if xCar.error == 404:  # Not Found: the post is gone, mark it sold
                    print(' not found!! ')
                    xCar.info['sold-date'] = datetime.datetime.utcnow()
                    xCar.save_me()
                    xCar.post_solr()
                    break
            time.sleep(1)
            limit += 1
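# car_util.time_object isn't shown in the snippet above; from its call sites it
# just parses a timestamp string with a given format. A minimal sketch of that
# assumption (hypothetical reconstruction, not the project's actual helper):
import datetime

def time_object(time_string, time_format):
    # Assumed to be a thin wrapper around datetime.strptime.
    return datetime.datetime.strptime(time_string, time_format)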
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.
    :param area:
    :return: A list of results.
    """
    cl_fs = CraigslistForSale(site=settings.CRAIGSLIST_SITE,
                              area=area,
                              category=settings.CRAIGSLIST_FORSALE_SECTION,
                              filters={'make': 'triumph'})
    results = []
    gen = cl_fs.get_results(sort_by='newest', limit=150)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is None:
            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                ## lat=lat,
                ## lon=lon,
                name=result["name"],
                price=price,
                ## location=result["where"],
                cl_id=result["id"],
                ## area=result["area"],
                ## bart_stop=result["bart"]
            )
            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            if len(result["name"]) > 0:
                results.append(result)
    return results
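# Listing, session, and parse come from elsewhere in these projects. A minimal
# sketch of what the snippets above and below appear to assume -- a SQLAlchemy
# model plus dateutil's parser; the column types and table name are assumptions:
from dateutil.parser import parse
from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Listing(Base):
    __tablename__ = 'listings'
    id = Column(Integer, primary_key=True)
    link = Column(String, unique=True)
    created = Column(DateTime)
    lat = Column(Float)
    lon = Column(Float)
    name = Column(String)
    price = Column(Float)
    location = Column(String)
    cl_id = Column(Integer, unique=True)
    area = Column(String)
    bart_stop = Column(String)

engine = create_engine('sqlite:///listings.db')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()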
from craigslist import CraigslistForSale

cl_h = CraigslistForSale(site='denver', filters={'query': 'wurlitzer 200'})
for result in cl_h.get_results(sort_by='newest', geotagged=True):
    print(result)
import os
import sqlite3
from sqlite3 import Error

import numpy as np
from craigslist import CraigslistForSale


class Shark:
    def __init__(self, query=None):
        prev = os.path.dirname(os.getcwd())
        db = os.path.join(prev, 'database', 'craigslist_results.db')
        self.conn = self.connect_db(db)
        if query is not None:
            self.craig = CraigslistForSale(site='sandiego',
                                           filters={'query': query})
            # Fill db with queried items now
            self.sql_init(query)

    def close_db(self):
        '''
        Closes the connection to the DB
        :return:
        '''
        try:
            self.conn.close()
        except Error as e:
            print(e)

    def sql_init(self, query):
        '''
        Initializes all the sql functions to their initial state
        :return:
        '''
        result_set = self.get_query(limit=50)
        for result in result_set:
            self.insert_db(result, query)
        data = self.select_price_from_db()
        # print(data)
        outliers = self.filter_data(data)
        self.remove_filtered_from_db(outliers)

    def connect_db(self, db_file):
        '''
        Make a connection to our DB
        :param db_file:
        :return: conn object or None
        '''
        try:
            conn = sqlite3.connect(db_file)
            conn.execute('pragma journal_mode=wal')
            return conn
        except Error as e:
            print(e)
        return None

    def insert_db(self, item, query):
        '''
        Updates DB and inserts new items into it
        :param item: Item to be inserted
        :param query: User input, will be hashed into a query ID for
                      distinction between queries
        :return:
        '''
        id = int(item['id'])
        name = item['name']
        url = item['url']
        time = item['datetime']
        price = int(item['price'][1:])  # strip the leading '$'
        q_id = str(hash(query))
        insert_stmt = 'INSERT or IGNORE INTO computers (id, name, url, time, price, query_id) ' \
                      'VALUES (?, ?, ?, ?, ?, ?)'
        entry = (id, name, url, time, price, q_id)
        try:
            c = self.conn.cursor()
            with self.conn:
                c.execute(insert_stmt, entry)
        except Error as e:
            print(e)

    def remove_filtered_from_db(self, outliers):
        '''
        Removes all outliers from the DB
        :param outliers:
        '''
        cur = self.conn.cursor()
        for price in outliers:
            cur.execute('DELETE FROM computers WHERE price = ?', (price,))
        self.conn.commit()

    def select_all_from_db(self):
        '''
        Fetches all items from db
        :return: rows -> all rows from db
        '''
        cur = self.conn.cursor()
        cur.execute('SELECT * FROM computers')
        rows = cur.fetchall()
        return rows

    def select_price_from_db(self):
        cur = self.conn.cursor()
        cur.execute('SELECT price FROM computers')
        rows = cur.fetchall()
        return rows

    def price_with_query(self, query):
        # Hash the query the same way insert_db does, then look it up.
        h = str(hash(query))
        cur = self.conn.cursor()
        cur.execute('SELECT price FROM computers WHERE query_id = ?', (h,))
        rows = cur.fetchall()
        return rows

    def select_by_hash_from_db(self, item):
        h = hash(item)
        cur = self.conn.cursor()
        cur.execute('SELECT * FROM computers WHERE query_id = ?', (str(h),))
        rows = cur.fetchall()
        return rows

    def get_query(self, limit=0, year=None):
        '''
        Gets results back from web search
        :param limit:
        :param year:
        :return:
        '''
        results = []
        for result in self.craig.get_results(limit=limit):
            if year is not None:
                if year in result['name']:
                    results.append(result)
            else:
                results.append(result)
        return results

    def filter_data(self, data):
        '''
        Filters out values that are too low to be valid electronics
        :param data: list of (price,) rows to filter
        :return outliers:
        '''
        # TODO Fix so it actually filters data with a better algorithm
        prices = [row[0] for row in data]  # sqlite returns 1-tuples
        mean = np.mean(prices)
        std = np.std(prices)
        outliers = []
        for price in prices:
            if price < (mean - std):
                outliers.append(price)
        return outliers
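# A minimal usage sketch for the Shark class above (the query string and the
# printed rows are examples, not part of the original):
if __name__ == '__main__':
    shark = Shark(query='macbook pro')
    for row in shark.select_all_from_db():
        print(row)
    shark.close_db()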
query = [
    'purple', 'schwinn', 'bike', 'bicycle', 'hybrid',
    'Bike', 'Bicycle', 'Schwinn', 'Purple', 'womens', 'ladies'
]

cl_e = CraigslistForSale(site='sacramento',
                         filters={'search_titles': True,
                                  'query': 'bike',
                                  'has_image': True})

print('/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n')
print('Results:\n')
for result in cl_e.get_results(sort_by='newest', limit=3000):
    # ('Davis' or 'Davis, CA' or 'davis') always evaluates to just 'Davis';
    # a membership test is what was intended here.
    if result['where'] in ('Davis', 'Davis, CA', 'davis'):
        for search in query:
            if search in result['name']:
                print('Location: {}\n'.format(result['where']))
                print('Date: {}\n'.format(result['datetime']))
                print('Post Title: {}\n'.format(result['name']))
                print('URL: {}\n'.format(result['url']))
                print('Price: {}\n'.format(result['price']))
                print('Has an Image Available (True or False): {}\n'.format(result['has_image']))
                # 'query[0] and query[1] in ...' only tested the second term;
                # both keywords must appear to trigger the alert.
                if query[0] in result['name'] and query[1] in result['name']:
                    print('Red Alert')
                    badmessage = twilioCli.messages.create(
                        body='Suspicious Sale posted',
                        from_=myTwilioNumber,
#hour = int(time.strftime('%H', time.localtime(time.time())))
#day = int(time.strftime('%d', time.localtime(time.time())))
#when = ""
#os.system(stri)
#os.system("osascript -e 'tell application \"Safari\" to activate'")
i = 1
added = 0
sofar = ""
with open('cars.html', 'r') as myfile:
    data = myfile.read().replace('\n', '')

# took out geotagged=True
for result in search.get_results(sort_by='price_asc'):  # ,limit = 50
    # In Python 3, data is already str, so the old .encode('utf-8') calls are dropped.
    if data.find(result['name']) == -1 and sofar.find(result['name'].lower()) == -1:
        with open("cars.html", "a") as myfile:
            # append the listing
            myfile.write('<a href="' + result['url'] + '">' + str(i) + ": "
                         + result['price'] + " " + result['name'] + " "
                         + result['url'] + "\n" + "</a><br><br>")
        added = added + 1
        sofar += result['name'].lower()
        i += 1
        print('new listing "' + result['name'] + '" added.')
    #if day - int(result['datetime'][8:10]) == 0:
    #    when = str(hour - int(result['datetime'][11:13]))
    #else:
strj = "\"})\n\t end tell \nend tell'"
hour = int(time.strftime('%H', time.localtime(time.time())))
day = int(time.strftime('%d', time.localtime(time.time())))
when = ""
#os.system(stri)
#os.system("osascript -e 'tell application \"Safari\" to activate'")
i = 1
with open('porsche.html', 'r') as myfile:
    data = myfile.read().replace('\n', '')

# took out geotagged=True
for result in search.get_results(sort_by='newest'):  # ,limit = 50
    if data.find(result['name']) == -1:
        with open("porsche.html", "a") as myfile:
            myfile.write('<a href="' + result['url'] + '">' + str(i) + ": "
                         + result['price'] + " " + result['name'] + " "
                         + result['url'] + "\n" + "</a><br><br>")
        print('new listing ' + result['name'] + ' added.')
    # Rough "hours ago" estimate from the post's datetime string.
    if day - int(result['datetime'][8:10]) == 0:
        when = str(hour - int(result['datetime'][11:13]))
    else:
        when = str(hour + (24 * abs(day - int(result['datetime'][8:10])))
                   - int(result['datetime'][11:13]))
    print(str(i) + ": " + result['price'] + ' ' + result['name'][:30] + ": " + result['url'])
def scrape_for_sale():
    """
    Searching target object in craigslist for sale
    """
    cl_h = CraigslistForSale(site=settings.CRAIGSLIST_SITE,
                             category=settings.CRAIGSLIST_CATEGORY,
                             filters={'query': settings.CRAIGSLIST_FORSALE_SECTION,
                                      'search_titles': True,
                                      'max_price': settings.MAX_PRICE})
    gen = cl_h.get_results(sort_by='newest', geotagged=True)
    results = []
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue

        # Search the result in database, if it is already there, skip it.
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        if listing is None:
            geotag = result["geotag"]
            area_found = False
            area = ""
            lat = 0
            lon = 0
            if result["geotag"] is not None:
                lat = geotag[1]
                lon = geotag[0]
                coords = (lat, lon)
                for a, box in settings.BOXES.items():
                    if in_box(coords, box):
                        area = a
                        area_found = True
            result["area"] = area

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                lat=lat,
                lon=lon,
                name=result["name"],
                price=price,
                location=result["where"],
                cl_id=result["id"],
                area=result["area"],
            )
            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # If the location falls in one of the boxes, append it to the results.
            if area_found:
                results.append(result)
    return results
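# in_box and settings.BOXES are defined elsewhere; a minimal sketch, assuming
# each box is a pair of corners ((min_lat, min_lon), (max_lat, max_lon)) --
# the corner convention is an assumption:
def in_box(coords, box):
    (min_lat, min_lon), (max_lat, max_lon) = box
    return min_lat <= coords[0] <= max_lat and min_lon <= coords[1] <= max_lon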
# run the search on the specific site
cl_fs_car = CraigslistForSale(
    site=j['site'],
    category=cat_use,
    filters={
        'query': query_use,
        'has_image': True,
        'search_titles': True,
        # 'auto_transmission': 'manual',
        'auto_fuel_type': ['gas', 'hybrid', 'electric', 'other'],
        'max_year': '2000',
        'auto_title_status': ['clean', 'salvage', 'rebuilt', 'lien']
    })

for result in cl_fs_car.get_results():
    try:
        # convert the string time from the ad to datetime format
        date_time_use = convert(result['datetime'])
        logging.info('found result!')
        # check to see if posting time was within the last 24 hours from when the code was run
        if date_time_use > datetime_limit:
            # returns the first image on the ad; shouldn't error out as we filter to ads with images
            the_image = get_image_from_page(result['url'])
            logging.info(the_image)
            # set the caption
            caption = result['name'] + '\nListing Price: ' + result['price']
import csv
import sys

import requests
from bs4 import BeautifulSoup as bs4
from craigslist import CraigslistForSale


def main(argv):
    if len(argv) < 4:
        print('please enter $python clvehicles.py [city] [make] [model]')
        sys.exit()
    location = argv[1]
    make = argv[2]
    model = argv[3]
    print('Searching the ' + location + ' craigslist site for ' + make + ' ' + model)
    cl_s = CraigslistForSale(site=location,
                             filters={'make': make,
                                      'model': model,
                                      'min_price': 2000})
    i = 0
    urls = []
    vehicles = []
    for result in cl_s.get_results():
        #print(result)
        urls.append(result['url'])
        veh = Vehicle(result['name'])
        veh.setPrice(result['price'])
        veh.setTimestamp(result['datetime'])
        veh.setURL(result['url'])
        veh.setID(result['id'])
        vehicles.append(veh)
        i = i + 1
        print(i)

    print('Parsing the ads')
    for k in range(len(vehicles)):
        rsp = requests.get(urls[k])
        html = bs4(rsp.text, 'html.parser')
        vehresults = html.body.find_all('p', attrs={'class': 'attrgroup'})
        # find a way of turning this into a dictionary
        try:
            vyear = vehresults[0].find_all('span')[0].get_text()[0:4]
            vehicles[k].setYear(vyear)
            vehicle_info = vehresults[1].find_all('span')
            for l in range(len(vehicle_info)):
                attribute = vehicle_info[l].get_text().split(':')
                if attribute[0] == 'condition':
                    vehicles[k].setCondition(attribute[1])
                elif attribute[0] == 'cylinders':
                    vehicles[k].setCylinders(attribute[1])
                elif attribute[0] == 'drive':
                    vehicles[k].setDrive(attribute[1])
                elif attribute[0] == 'fuel':
                    vehicles[k].setFuel(attribute[1])
                elif attribute[0] == 'odometer':
                    vehicles[k].setMilage(attribute[1])
                elif attribute[0] == 'paint color':
                    vehicles[k].setColor(attribute[1])
                elif attribute[0] == 'title status':
                    vehicles[k].setTitleStatus(attribute[1])
                elif attribute[0] == 'transmission':
                    vehicles[k].setTranstype(attribute[1])
                elif attribute[0] == 'type':
                    vehicles[k].setVehicletype(attribute[1])
        except IndexError:
            print('Post %d was likely deleted' % k)

    headers = ['name', 'price', 'year', 'condition', 'milage', 'title status',
               'transmission', 'drive', 'cylinders', 'fuel', 'color',
               'location', 'timestamp', 'url']
    print('writing to .csv')
    fname = location + model + '.csv'
    # 'wb' was Python 2 csv usage; in Python 3 open in text mode with newline=''.
    with open(fname, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(headers)
        for v in vehicles:
            # Attributes are only set when the ad supplied them, so fall back
            # to 'N/A' (replaces the long chain of try/except AttributeError blocks).
            row = [getattr(v, attr, 'N/A') for attr in
                   ('name', 'price', 'year', 'condition', 'milage', 'titleStatus',
                    'transtype', 'drive', 'cylinders', 'fuel', 'color')]
            row.append(location)
            row += [getattr(v, attr, 'N/A') for attr in ('timestamp', 'url')]
            try:
                w.writerow(row)
            except UnicodeEncodeError:
                print("weird character")
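# The Vehicle class isn't shown above; judging from the calls, it is a plain
# attribute holder with Java-style setters. A minimal sketch of that assumption:
class Vehicle:
    def __init__(self, name):
        self.name = name

    def setPrice(self, price):
        self.price = price

    def setTimestamp(self, timestamp):
        self.timestamp = timestamp

    def setURL(self, url):
        self.url = url

    def setID(self, id):
        self.id = id
    # setYear, setCondition, setCylinders, setDrive, setFuel, setMilage,
    # setColor, setTitleStatus, setTranstype, and setVehicletype follow the
    # same one-line pattern.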
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.
    :param area:
    :return: A list of results.
    """
    cl_h = CraigslistForSale(site=settings.CRAIGSLIST_SITE,
                             area=area,
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={'max_price': settings.MAX_PRICE,
                                      'min_price': settings.MIN_PRICE})
    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is None:
            # if result["where"] is None:
            #     # If there is no string identifying which neighborhood the result is from, skip it.
            #     continue

            lat = 0
            lon = 0
            if result["where"] is not None and result["geotag"] is not None:
                # Assign the coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]

                # Annotate the result with information about the area it's in
                # and points of interest near it.
                geo_data = find_points_of_interest(result["geotag"], result["where"])
                result.update(geo_data)
            else:
                result["area"] = ""
                result["bart"] = ""

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(link=result["url"],
                              created=parse(result["datetime"]),
                              lat=lat,
                              lon=lon,
                              name=result["name"],
                              price=price,
                              location=result["where"],
                              cl_id=result["id"],
                              area=result["area"],
                              bart_stop=result["bart"])
            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # Return the result if it's near a bart station, or if it is in an area we defined.
            # if len(result["bart"]) > 0 or len(result["area"]) > 0:
            results.append(result)
    return results
import time

import redis
from craigslist import CraigslistForSale
from twilio.rest import Client

r = redis.Redis()

# Auth credentials for twilio
account_sid = ''
auth_token = ''
client = Client(account_sid, auth_token)

while True:
    cl_auto = CraigslistForSale(site='sandiego',
                                category='cto',
                                filters={'query': 'frontier'})
    for result in cl_auto.get_results(sort_by='newest'):
        id = result['id']
        name = result['name']
        url = result['url']
        # Skip posts we've already texted about; redis is the dedup store.
        if r.exists(id):
            continue
        print(result)
        r.set(id, 'true')
        # send the text message
        message = client.messages.create(body=name + ' ' + url,
                                         from_='+16199999999',
                                         to='+16199999999')
    time.sleep(60 * 10)
import props
from craigslist import CraigslistForSale
from geopy.geocoders import Nominatim
from geopy.distance import vincenty  # geopy 1.x; see the note below for geopy 2.x

geolocator = Nominatim()
orig_coord = (0, 0)

# initializing the originating location
def init_orig():
    orig_loc = geolocator.geocode(props.ORIG_LOC)
    global orig_coord
    orig_coord = (orig_loc.latitude, orig_loc.longitude)

# calculates the distance in miles between two locations
def get_distance(orig_coord, dest):
    dest_loc = geolocator.geocode(dest)
    dest_coord = (dest_loc.latitude, dest_loc.longitude)
    return round(vincenty(orig_coord, dest_coord).miles, 1)

# display formatted results
def display_results(result):
    dest = result['where'] + ", " + props.ORIG_STATE
    print(result['name'] + " " + result['price'] + " " + result['where'])
    print("distance: " + str(get_distance(orig_coord, dest)) + " miles\n")

init_orig()
cl_fs = CraigslistForSale(site=props.CRAIG_SITE,
                          category=props.CRAIG_CATEGORY,
                          filters={'max_price': props.CRAIG_PRICE,
                                   'has_image': props.CRAIG_IMAGE})
for result in cl_fs.get_results(sort_by=props.CRAIG_SORTBY):
    if props.CRAIG_SITE in result['url']:
        display_results(result)
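# Note: vincenty() was removed in geopy 2.x; on current geopy the same
# computation can use geodesic as a drop-in (a sketch, not the original code):
from geopy.distance import geodesic

def get_distance_v2(orig_coord, dest_coord):
    # Same rounding as get_distance above, but with the newer geodesic API.
    return round(geodesic(orig_coord, dest_coord).miles, 1)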
saved_ids = set(historical.keys())
found_posts = {}
for query in queries['queries']:
    sites = query['sites']
    category = query['category']
    filters = query['filters']

    # Crawl craigslist
    for site in sites:
        CL_query = CraigslistForSale(site=site,
                                     category=category,
                                     filters=filters)
        found_posts.update(
            {result['id']: result for result in CL_query.get_results()})

# fun set logic
post_ids = set(found_posts.keys())
# posts which no longer appear in searches shouldn't be notified
old_ids = saved_ids & post_ids
new_ids = post_ids - old_ids
postings_to_notify = {post_id: found_posts[post_id] for post_id in new_ids}
postings_to_remind = {post_id: found_posts[post_id] for post_id in old_ids}

# Send an email
if postings_to_notify:
    receiver_email = queries['email']
    sender_email = config['sender_email']
    # should really switch to google 2fa instead of plaintext passwords lol
    sender_password = config['sender_password']
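# The send itself isn't shown above; a minimal sketch of one way to finish it
# with the stdlib -- the SMTP host/port and message format are assumptions:
import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg['Subject'] = 'New craigslist postings'
msg['From'] = sender_email
msg['To'] = receiver_email
msg.set_content('\n'.join(
    '{} - {} - {}'.format(post['name'], post['price'], post['url'])
    for post in postings_to_notify.values()))

with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
    server.login(sender_email, sender_password)
    server.send_message(msg)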