class ListingDBTest(unittest.TestCase):
    """Check the SQL that ListingDB sends to its cursor, using a mocked connection."""

    def setUp(self):
        """Create a ListingDB whose connection hands out a cursor we can inspect."""
        cursor = MagicMock()
        connection = MagicMock()
        connection.cursor.return_value = cursor
        self.mockCursor = cursor
        self.db = ListingDB(connection)

    def testInit(self):
        """The most recent statement after construction is the table creation."""
        expectedSQL = "CREATE TABLE IF NOT EXISTS listings(pid TEXT PRIMARY KEY, availableDate TEXT)"
        self.mockCursor.execute.assert_called_with(expectedSQL)

    def testInsertListing(self):
        """insert() issues a parameterised INSERT carrying both values."""
        pid, availableDate = "9999999", "2015-07-15"
        self.db.insert(pid, availableDate)
        self.mockCursor.execute.assert_called_with(
            "INSERT INTO listings VALUES (?, ?)",
            (pid, availableDate),
        )

    def testHasListing(self):
        """has() issues a parameterised SELECT filtered on the pid."""
        pid = "123"
        # Only the issued query is checked; the return value is not asserted.
        self.db.has(pid)
        self.mockCursor.execute.assert_called_with(
            "SELECT * FROM listings WHERE pid = ?",
            (pid,),
        )
def __init__(self):
    """Start with a freshly constructed ListingDB and goSlow switched off."""
    self.db = ListingDB()
    self.goSlow = False
class BlueRidge:
    """Parse a Craigslist search-results page and report listings not seen before.

    Typical use: call parse() with the page HTML, then getListingsLessThan()
    to obtain the unseen listings at or below a price, and
    getAnchorLinksFromPids() to render them as HTML links.
    """

    def __init__(self):
        self.goSlow = False
        self.db = ListingDB()

    def setDB(self, DB):
        """Replace the listing store used to remember already-seen pids."""
        self.db = DB

    def initSession(self):
        """Create the HTTP session and give it browser-like request headers."""
        self.session = requests.session()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)

    def parse(self, listingsHTML):
        """Parse the results page and cache its rows, prices and titles."""
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; consider pinning one explicitly.
        self.soup = BeautifulSoup(listingsHTML)
        self.rows = self.soup.findAll('p', class_='row')
        self.prices = self.getPrices()
        self.titles = self.getTitles()

    def getCount(self):
        """Return the number of listing rows found by parse()."""
        return len(self.rows)

    def getPrices(self):
        """Return one integer price per row, using -1 for rows without a price."""
        prices = []
        for row in self.rows:
            priceSpan = row.find('span', class_='price')
            if priceSpan is None:
                prices.append(-1)
            else:
                prices.append(int(priceSpan.string.replace('$', '')))
        return prices

    def getTitles(self):
        """Return the text of each row's 'hdrlnk' anchor, in row order."""
        return [row.find('a', class_='hdrlnk').string for row in self.rows]

    def getListOfPids(self):
        """Return the 'data-pid' attribute of every row, in row order."""
        return [row['data-pid'] for row in self.rows]

    def getListingsLessThan(self, maxPrice):
        """Return unseen listings priced at or below maxPrice and mark them seen.

        Rows without a price carry the -1 sentinel and therefore always pass
        the price test.

        Returns:
            A list of (pid, title, price) tuples in page order. Every returned
            pid is inserted into the database, so it is not returned again.
        """
        listings = []
        for index, price in enumerate(self.prices):
            if price is None or price > maxPrice:
                continue
            pid = self.rows[index]['data-pid']
            if not self.db.has(pid):
                listings.append((pid, self.titles[index], price))
                self.db.insert(pid, None)
        return listings

    def generateLink(self, pid):
        """Return the listing URL for pid."""
        return "http://sfbay.craigslist.org/sfc/fuo/" + pid + ".html"

    def requestPage(self, url):
        """Fetch url and return the response body as text.

        The session is created on first use. Checking for it explicitly
        (instead of catching AttributeError around the request) keeps an
        unrelated AttributeError from being swallowed and the request retried.
        """
        if getattr(self, 'session', None) is None:
            self.initSession()
        return self.session.get(url).text

    def getLinks(self, pids):
        """Return the listing URL for each pid."""
        return [self.generateLink(pid) for pid in pids]

    def getAnchorLinksFromPids(self, listings):
        """Render (pid, title, price) tuples as HTML anchors separated by <br />.

        A price of -1 (no price found) is rendered as an empty string. Titles
        come from scraped pages, so they are HTML-escaped before being placed
        in the markup. Text is kept as str throughout: formatting encoded
        bytes would render as b'...' on Python 3.
        """
        from html import escape

        anchors = []
        for listing in listings:
            if listing[2] == -1:
                price = ''
            else:
                price = '- ${}'.format(listing[2])
            anchors.append("<a href=\"{}\">{} {}</a><br />".format(
                escape(self.generateLink(listing[0])),
                escape(listing[1]),
                price,
            ))
        return "".join(anchors)
class BlueRidge:
    """Parse a Craigslist housing search page and filter its listings.

    parse() caches the rows, prices, bedroom counts and price-per-bedroom
    figures; the getListings* methods then filter by price per bedroom and
    by the availability date read from each listing's own page.
    """

    def __init__(self):
        self.goSlow = False
        self.db = ListingDB()

    def setDB(self, DB):
        """Replace the listing store used to remember already-fetched pids."""
        self.db = DB

    def initSession(self):
        """Create the HTTP session and give it browser-like request headers."""
        self.session = requests.session()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
            'Connection': 'keep-alive',
        }
        self.session.headers.update(headers)

    def parse(self, listingsHTML):
        """Parse the results page and cache rows, prices and bedroom data."""
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; consider pinning one explicitly.
        self.soup = BeautifulSoup(listingsHTML)
        self.rows = self.soup.findAll('p', class_='row')
        self.prices = self.getPrices()
        self.bedrooms = self.getBedrooms()
        self.pricesPer = self.getPricesPerBedroom()

    def getCount(self):
        """Return the number of listing rows found by parse()."""
        return len(self.rows)

    def getPrices(self):
        """Return one integer price per row, or None where a row shows no price."""
        prices = []
        for row in self.rows:
            priceSpan = row.find('span', class_='price')
            if priceSpan is None or priceSpan.string is None:
                # A row without a price must not abort parsing of the page.
                prices.append(None)
            else:
                prices.append(int(priceSpan.string.replace('$', '')))
        return prices

    def getBedrooms(self):
        """Return the bedroom count per row, or None where it cannot be read.

        The count is the number preceding 'br' in the row's 'housing' span.
        A missing span, or a span without such a number, yields None.
        """
        bedrooms = []
        for row in self.rows:
            brSpan = row.find('span', class_='housing')
            match = None
            if brSpan is not None:
                match = re.search(r"(\d+)br", str(brSpan.text))
            bedrooms.append(int(match.group(1)) if match else None)
        return bedrooms

    def getPricesPerBedroom(self):
        """Return price divided by bedroom count per row.

        The entry is None when the price or bedroom count is unknown, or when
        the bedroom count is zero (which would otherwise divide by zero).
        """
        pricesPerBR = []
        for price, bedrooms in zip(self.prices, self.bedrooms):
            if price is None or bedrooms is None or int(bedrooms) == 0:
                pricesPerBR.append(None)
            else:
                pricesPerBR.append(int(price) / int(bedrooms))
        return pricesPerBR

    def getListOfPids(self):
        """Return the 'data-pid' attribute of every row, in row order."""
        return [row['data-pid'] for row in self.rows]

    def getListingsLessThanPerBR(self, maxPrice):
        """Return pids whose price per bedroom is known and at most maxPrice.

        This filter neither reads nor writes the database.
        """
        return [
            row['data-pid']
            for row, pricePer in zip(self.rows, self.pricesPer)
            if pricePer is not None and pricePer <= maxPrice
        ]

    def getAvailableDate(self, pid):
        """Fetch the listing page for pid and return its move-in date attribute.

        Raises:
            TypeError: if the page has no matching move-in span, because the
                lookup then subscripts None.
        """
        listingHTML = self.requestPage(self.generateLink(pid))
        soup = BeautifulSoup(listingHTML)
        date = soup.find('span', class_="housing_movein_now property_date")['date']
        return date

    def getCountAvailableAfter(self, date):
        """Return how many listings getListingsAvailableAfter(date) yields."""
        return len(self.getListingsAvailableAfter(date))

    def getListingsAvailableAfter(self, date, latestDate='2016-01-01'):
        """Return not-yet-recorded pids available between date and latestDate.

        Dates are compared as strings, so they are expected in a format that
        sorts chronologically (the default bound uses YYYY-MM-DD).

        Args:
            date: earliest acceptable availability date (inclusive).
            latestDate: latest acceptable availability date (inclusive);
                defaults to the previously hard-coded bound.

        Returns:
            The matching pids in page order. Pids already in the database are
            skipped without being fetched.
        """
        listings = []
        for pid in self.getListOfPids():
            if self.db.has(pid):
                continue
            if self.goSlow:
                # Pause a random 1-3 seconds between listing requests.
                time.sleep(random.randint(1, 3))
            listingDate = self.getAvailableDate(pid)
            if date <= listingDate <= latestDate:
                listings.append(pid)
            # NOTE(review): every fetched listing is recorded, matching or
            # not, so it is not requested again - confirm this is intended.
            self.db.insert(pid, listingDate)
        return listings

    def getListingsAvailableAfterAndLessThan(self, date, maxPrice):
        """Return pids passing both the price-per-bedroom and date filters.

        The result keeps page order.
        """
        # TODO: revisit how the two filters share the database; at present
        # only the date filter reads and writes it.
        pids = self.getListOfPids()
        listingsSet = set(self.getListingsLessThanPerBR(maxPrice)) & set(self.getListingsAvailableAfter(date))
        return [pid for pid in pids if pid in listingsSet]

    def generateLink(self, pid):
        """Return the listing URL for pid."""
        return "http://sfbay.craigslist.org/sfc/apa/" + pid + ".html"

    def requestPage(self, url):
        """Fetch url and return the response body as text.

        The session is created on first use. Checking for it explicitly
        (instead of catching AttributeError around the request) keeps an
        unrelated AttributeError from being swallowed and the request retried.
        """
        if getattr(self, 'session', None) is None:
            self.initSession()
        return self.session.get(url).text

    def getLinks(self, pids):
        """Return the listing URL for each pid."""
        return [self.generateLink(pid) for pid in pids]

    def getAnchorLinksFromPids(self, pids):
        """Render each pid's URL as an HTML anchor followed by <br />."""
        return "".join(
            "<a href=\"%s\">%s</a><br />" % (link, link)
            for link in self.getLinks(pids)
        )
def setUp(self):
    """Create a ListingDB whose connection hands out a cursor we can inspect."""
    cursor = MagicMock()
    connection = MagicMock()
    connection.cursor.return_value = cursor
    self.mockCursor = cursor
    self.db = ListingDB(connection)