def empty_listing(*things):
    """Build a Wrapped, empty Listing whose parent_name is borrowed from the
    first of *things* that exposes one (None when none do)."""
    parent_name = next(
        (t.parent_name for t in things if hasattr(t, "parent_name")), None)
    listing = Listing(None, None, parent_name=parent_name)
    listing.things = list(things)
    return Wrapped(listing)
def run(self):
    """Scrape Craigslist and return the parsed results as Listing objects."""
    scraper = CraigsListScraper()
    scraper.webpage_html()

    def build(raw_html):
        # One ListingParser per result fragment.
        parsed = ListingParser(raw_html)
        return Listing(title=parsed.title(),
                       housing=parsed.housing(),
                       neighborhood=parsed.neighborhood(),
                       price=parsed.price())

    return [build(raw) for raw in scraper.listings_html()]
def linkedin_jobs(url):
    """Scrape LinkedIn job cards at *url* and append each one (as a dict)
    to the module-level ``jobs`` list."""
    response = requests.get(url, headers={"User-agent": "job_bot 1.0"})
    soup = BeautifulSoup(response.content, "html.parser")
    cards = soup.html.body.findAll(
        "div", class_="result-card__contents job-result-card__contents")
    # print(len(cards))
    for card in cards:
        listing = Listing()
        full_title = card.h3.text
        # Titles longer than 40 chars are shortened to 35 plus an ellipsis.
        if len(full_title) > 40:
            listing.title = full_title[:35] + "..."
        else:
            listing.title = full_title
        listing.company = card.h4.a.text
        listing.location = card.div.span.text
        listing.date = card.time.text
        listing.link = card.h4.a.get("href")
        listing.logo = "https://cdn4.iconfinder.com/data/icons/flat-icon-social-media/256/Linkedin.png"
        jobs.append(listing.to_dict())
def scrape(self, url):
    """Fetch the index page at *url*, build a Listing for each discovered
    listing URL, scrape each listing's details, and return them."""
    page_html = requests.get(url).text
    dom = BeautifulSoup(page_html, 'html.parser')
    location = self.get_location_name(dom)
    print(f"Scraping properties in {location}...")
    results = [Listing(listing_url)
               for listing_url in self.get_listing_urls(dom)]
    for result in results:
        result.scrape_details()
    return results
def get_listings(self):
    """
    Build the search URL from the configured filters and return the scraped
    results.

    :return: list of Listing objects, one per result "box" div.
    :raises Exception: if no county has been configured.
    """
    if self._county is None:
        raise Exception("County is required.")
    if self._area is None:
        self._area = ''
    # Sale-agreed searches use a different price query parameter.
    if self._sale_agreed:
        if self._min_price or self._max_price:
            self._query_params += self._price + str(
                QueryParam.SALE_AGREED_WITH_PRICE)
        else:
            self._query_params += str(QueryParam.SALE_AGREED)
    else:
        if self._min_price or self._max_price:
            self._query_params += self._price
    # NOTE(review): operator precedence makes this read as
    # `min_price or (max_price and isinstance(...))`; it looks like
    # `(min_price or max_price) and isinstance(...)` was intended --
    # confirm against upstream behavior before changing.
    if self._min_price or self._max_price and isinstance(
            self._listing_type, RentType):
        self._query_params += str(QueryParam.IGNORED_AGENTS)
    if self._sort_by:
        if self._sort_order:
            self._query_params += str(QueryParam.SORT_ORDER) + str(
                self._sort_order)
            self._query_params += str(QueryParam.SORT_BY) + str(self._sort_by)
        else:
            # No explicit order supplied: default to descending.
            self._query_params += str(QueryParam.SORT_ORDER) + str(
                SortOrder.DESCENDING)
            self._query_params += str(QueryParam.SORT_BY) + self._sort_by
    request = Request(verbose=self._verbose, con_conf=self._con_conf)
    url = self._base + self._county + str(self._listing_type) + str(
        self._commercial_property_type) + str(
            self._area) + '?offset=' + str(self._offset) + self._query_params
    soup = request.get(url)
    divs = soup.find_all("div", {"class": "box"})
    # Fixed: build the result list directly rather than abusing a list
    # comprehension for its `append` side effects.
    return [Listing(div, self._con_conf) for div in divs]
def monster_jobs(url):
    """Scrape Monster job cards at *url*; append each filtered, not-yet-seen
    job (as a dict) to the module-level ``jobs`` list."""
    response = requests.get(url, headers={"User-agent": "job_bot 1.0"})
    soup = BeautifulSoup(response.content, "html.parser")
    for card in soup.findAll("div", class_="flex-row"):
        title = card.find("h2", class_="title").a.text[:-2]
        if not listing_filter(title):
            continue
        listing = Listing()
        listing.title = title[:35] + "..." if len(title) > 40 else title
        listing.company = card.find("div", class_="company").span.text
        listing.location = card.find("div", class_="location").span.text[2:-2]
        listing.date = card.find(
            "div", class_="meta flex-col").time.get("datetime")[:-6]
        listing.link = card.find("h2", class_="title").a.get("href")
        listing.logo = "https://games.lol/wp-content/uploads/2018/10/monster-search-best-pc-download-online.png"
        # Only append listings not already collected.
        if not any(listing == job for job in jobs):
            jobs.append(listing.to_dict())
def update_manifest(self):
    """Fold newly-discovered listing URLs into the manifest DataFrame.

    Builds a Listing per URL in ``self.new_urls``, sorts them by posted
    time, pushes qualifying talks to Google Calendar, appends them to
    ``self.manifest`` and persists the manifest to disk.
    """
    # First sort new listings by posted date in chronological order
    new_listings = []
    for url in self.new_urls:
        # URL layout assumed: .../<list_id>/<something>/<index>
        # -- TODO confirm against the producer of new_urls.
        list_id = url.split('/')[-3]
        index = url.split('/')[-1]
        l = Listing(list_id, index)
        try:
            # Accessing l.posted_time raises AttributeError for listings
            # that could not be parsed; those are skipped with a warning.
            new_listings.append([l.posted_time, l])
        except AttributeError:
            logger.warning(
                "Attribute Error -- listing excluded from manifest.")
            pass
    new_listings.sort(key=lambda x: x[0])
    # Then loop through listings and add to manifest
    for _, l in new_listings:
        # TODO: replace -30 with something better
        metadata = self.get_listing_metadata(self.manifest[-30:], l)
        # If new listing is a talk and contains relevant metadata
        if metadata['is_talk'] and 'start' in metadata:
            # check if it's a new one OR a corrected listing
            if (metadata['event_id'] not in
                    self.manifest['event_id'].tolist()
                ) or metadata['is_correction']:
                self.push_to_google_calendar(metadata)
                metadata['pushed_to_cal'] = True
                # NOTE(review): DataFrame.append is deprecated in modern
                # pandas; consider pd.concat when upgrading.
                self.manifest = self.manifest.append(metadata,
                                                     ignore_index=True)
                self._save_manifest()
                logger.info("Added to manifest: " + l.url)
def main():
    """Entry point: optionally load a search filter, fetch one RedFin
    property, and wrap it in a House/Listing pair."""
    a = get_args()
    # Fixed: renamed from `filter`, which shadowed the builtin.
    search_filter = None
    if a.filter:
        # XXX change to logging
        # Fixed: was a Python 2 print statement (syntax error on Python 3).
        print('search filter present')
        # Fixed: close the file handle instead of leaking it.
        with open(a.filter) as fh:
            search_filter = json.load(fh)
    redfin = RedFin()
    redfin.use_proxies = False
    # `filter` here is RedFin's keyword argument name, not the builtin.
    redfin.get_search_results(filter=search_filter)
    r_data = redfin.get_one_property_data()
    h = House(street_address=r_data['street_address'],
              city=r_data['address_locality'],
              state=r_data['address_region'],
              zip_code=r_data['postal_code'],
              beds=r_data['beds'],
              baths=r_data['baths'],
              sq_ft=r_data['sqFt'],
              lot_size=None,
              home_type='sfh')
    l = Listing(house=h)
def indeed_jobs(url):
    """Scrape Indeed job cards at *url*; append each filtered, not-yet-seen
    job (as a dict) to the module-level ``jobs`` list."""
    response = requests.get(url, headers={"User-agent": "job_bot 1.0"})
    soup = BeautifulSoup(response.content, "html.parser")
    for card in soup.findAll("div", class_="jobsearch-SerpJobCard"):
        title = card.find("h2", class_="title").a.get("title").strip()
        if not listing_filter(title):
            continue
        listing = Listing()
        listing.title = title[:35] + "..." if len(title) > 40 else title  # 75 and 80
        listing.company = card.find("div",
                                    class_="sjcl").div.span.text.lstrip()
        salary = card.find("span", class_="salaryText")
        listing.salary = (salary.text.lstrip()
                          if salary is not None else "Not Listed")
        # same as .get("data-rc-loc")
        listing.location = card.find("div", class_="recJobLoc")["data-rc-loc"]
        listing.date = (date.today() - timedelta(days=1)).strftime('%y-%m-%d')
        listing.link = f"https://www.google.com/search?q={title}+{listing.company}+{listing.location}+{listing.date}+job+opening"
        listing.logo = "https://is2-ssl.mzstatic.com/image/thumb/Purple118/v4/ab/03/b8/ab03b82b-12cf-ce7c-249f-b54a8f01c1b9/AppIcon-1x_U007emarketing-85-220-0-6.png/246x0w.jpg"
        # Only append listings not already collected.
        if not any(listing == job for job in jobs):
            jobs.append(listing.to_dict())
def glassdoor_jobs(url):
    """Scrape Glassdoor job cards at *url*; append each not-yet-seen job
    (as a dict) to the module-level ``jobs`` list."""
    response = requests.get(url, headers={"User-agent": "job_bot 1.0"})
    soup = BeautifulSoup(response.content, "html.parser")
    cards = soup.findAll("li", class_="jl react-job-listing gdGrid")
    print(len(cards))
    for card in cards:
        listing = Listing()
        title = card.find(
            "a",
            class_="jobInfoItem jobTitle css-13w0lq6 eigr9kq1 jobLink").span.text
        listing.title = title[:35] + "..." if len(title) > 40 else title
        listing.company = card.find(
            "div",
            class_="jobHeader d-flex justify-content-between align-items-start"
        ).a.span.text
        salary = card.find("div", class_="salaryEstimate ")
        listing.salary = (salary.span.span.text
                          if salary is not None else "Not Listed")
        location = card.find("div",
                             class_="d-flex flex-wrap css-yytu5e e1rrn5ka1")
        listing.location = location.span.text if location is not None else "US"
        listing.date = "24hr"
        listing.link = "https://www.glassdoor.com" + card.find(
            "a", class_="jobLink").get("href")
        listing.logo = "https://www.adweek.com/agencyspy/wp-content/uploads/sites/7/2016/01/glassdoor.jpg"
        # Only append listings not already collected.
        if not any(listing == job for job in jobs):
            jobs.append(listing.to_dict())
def collect_page_results(self, store):
    """Scrape one results page into Listing objects, each tagged with
    *store*'s location fields."""
    names = self.driver.find_elements_by_xpath(
        '//a[@name="listpage_productname"]')
    model_numbers = self.driver.find_elements_by_xpath(
        '//ul[@class="productInfo"]/li[@class="last"]')
    item_numbers = self.driver.find_elements_by_xpath(
        '//ul[@class="productInfo"]/li[not(@class="last")]')
    prices = self.driver.find_elements_by_xpath(
        '//p[@class="pricing"]/strong')
    # First model number on the page, prefix stripped -- presumably read
    # elsewhere to detect page turns; confirm against its consumer.
    self.load_next_check = model_numbers[0].text[9:]
    page_results = []
    for idx, name_element in enumerate(names):
        listing = Listing()
        listing.name = name_element.text
        # Fixed-width label prefixes are sliced off the raw element text.
        listing.item_number = item_numbers[idx].text[8:]
        listing.model_number = model_numbers[idx].text[9:]
        listing.price = prices[idx].text[1:]
        listing.country = store.country
        listing.state = store.state
        listing.town = store.town
        listing.store_number = store.store_number
        listing.address = store.address
        page_results.append(listing)
    return page_results
def create_listing(self, expiry_time, place):
    """Create a Listing for this object's ``uni`` at *place* with the given
    expiry time, and register it via ``add_listing``."""
    self.add_listing(Listing(expiry_time, self.uni, place))
def write_listing(job_title, job_link, org_name, source, date_posted):
    """Persist one job listing through the global SQLAlchemy session."""
    record = Listing(job_title, job_link, org_name, source, date_posted)
    db.session.add(record)
    db.session.commit()
# Crawl paginated results, parsing each page with REGEX, until a page with
# no matches comes back; then pickle the collected listings to OUTPUT_DIR.
listings = []
i = 1
while True:
    r = requests.get(REQUEST_URL_PREFIX, params={"p": i})
    # Stop on an empty body or a page that yields no matches at all.
    if r.text is None or search(REGEX, r.text) is None:
        break
    for match in finditer(REGEX, r.text):
        # Group 1 is the domain; groups 2-6 are the per-currency prices.
        listings.append(
            Listing(
                match.group(1),
                {
                    "bitcoin_price": match.group(2),
                    "namecoin_price": match.group(3),
                    "litecoin_price": match.group(4),
                    "peercoin_price": match.group(5),
                    "primecoin_price": match.group(6),
                },
                datetime.now(),
            ))
    i += 1
    sleep(0.5)  # be polite between page fetches
if not exists(OUTPUT_DIR):
    makedirs(OUTPUT_DIR)
filename = datetime.now().strftime("%Y-%m-%d-%H_%M.pickle")
with open(join(OUTPUT_DIR, filename), "wb") as output_file:
    dump(listings, output_file, protocol=HIGHEST_PROTOCOL)
def fetch_data(self):
    """Collect expired (sold) listings for this search term, clean them,
    then record long-term market statistics in ``self.long_term_data``.

    Side effects: populates ``self.expired_listings``, sets
    ``self.long_term_data`` and ``self.fetched_data``.
    """
    self.clear_data()
    still_searching = True
    search_page = 1
    while still_searching:
        # Fixed: the query string contained a mangled HTML entity
        # ("¤t=0", i.e. "&curren" + "t=0"); restored to "&current=0".
        link_text_expired_listings = (
            "https://www.trademe.co.nz/Browse/SearchResults.aspx?sort_order=bids_asc&from=advanced&advanced=true&searchstring="
            + self.search_term +
            "&current=0&cid=0&rptpath=all&searchregion=100&page=" +
            str(search_page))
        try:
            # make the request and soup
            exp_res = requests.get(link_text_expired_listings)
            exp_res.raise_for_status()
            expired_search_result_soup = bs4.BeautifulSoup(
                exp_res.text, features="html.parser")
        except requests.exceptions.HTTPError:
            print(
                "an HTTP error occured fetching expired listings under the search "
                + self.search_term)
            # Fixed: previously execution fell through and used an unbound
            # (or stale) soup object; stop paging instead.
            break
        # go through all the listings on this page, checking to see if
        # they have bids
        raw_listings_this_page = expired_search_result_soup.find_all(
            "li", class_="listingCard")
        for listing in raw_listings_this_page:
            if "Current bid" in listing.text:
                # the current bid section in the html indicates if bid/s
                # have been placed on the item; get the link and make a
                # listing object.
                listing_link = listing.find('a', href=True)['href']
                # skip property / motors / farming listings entirely
                if ("/property/" in listing_link
                        or "/motors/" in listing_link
                        or "/farming-forestry/" in listing_link):
                    print("found a bad item, link: " + listing_link)
                else:
                    this_listing = Listing(
                        "https://www.trademe.co.nz" + listing_link, self.id)
                    self.expired_listings.append(this_listing)
            else:
                # stop searching once listings without bids appear
                # (results are sorted by bids ascending).
                still_searching = False
        # stop searching if there are no more listings: the count holder
        # reads e.g. "Showing 1 - 50 of 50"; first == last means last page.
        listing_count_text_parts = expired_search_result_soup.find(
            'p', class_="listing-count-holder").text.split(" ")
        if listing_count_text_parts[0] == listing_count_text_parts[-1]:
            still_searching = False
        search_page += 1
    # ----------------------------------------------------------------
    # clean the listings that were found
    # 1) drop listings not from the same category as the search term.
    i = 0
    while i < len(self.expired_listings):
        if self.category.lower() in self.expired_listings[i].category.lower():
            i += 1
        else:
            del self.expired_listings[i]
    # 2) drop listings containing too many excluded terms.
    # Fixed: `i` was never reset after the first cleanup loop, so this
    # loop never executed (the author's TODO noted it "doesn't appear to
    # be working properly").
    i = 0
    while i < len(self.expired_listings):
        current = self.expired_listings[i]
        excluded_word_count = sum(
            term in current.description.lower()
            for term in self.excluded_terms) + sum(
                term in current.listingName.lower()
                for term in self.excluded_terms)
        if excluded_word_count > self.max_excluded_terms:
            print(
                "listing ID {} contained too many excluded terms. The listing will not be recorded."
                .format(current.id))
            del self.expired_listings[i]
        else:
            i += 1
    print(
        "finished finding expired listings for the search term '{}', returned {} results"
        .format(self.search_term, len(self.expired_listings)))
    # ----------------------------------------------------------------
    # fetch all the long term data.
    expired_listing_count = len(self.expired_listings)
    # find how many current listings there are.
    link_text = (
        "https://www.trademe.co.nz/Browse/SearchResults.aspx?searchString="
        + self.search_term +
        "&type=Search&searchType=all&user_region=100&user_district=0&generalSearch_keypresses=5&generalSearch_suggested=0&generalSearch_suggestedCategory="
    )
    # make the request and soup
    res = requests.get(link_text)
    res.raise_for_status()
    search_result_soup = bs4.BeautifulSoup(res.text, features="html.parser")
    # get the number of results returned.
    current_listing_count_str = search_result_soup.find_all(
        "h3", {
            "class": "tm-search-header-result-count__heading ng-star-inserted"
        })[0].get_text()
    current_listing_count = int(current_listing_count_str.split(" ")[2])
    # get the median sale price of sold listings.
    sold_listings_prices = [
        listing.get_sell_price() for listing in self.expired_listings
    ]
    median_sell_price = (statistics.median(sold_listings_prices)
                         if len(self.expired_listings) > 0 else None)
    # finally, make the long term data tuple so that these statistics can
    # be recorded in MySQL.
    # format: (search_id, date, active_listings, sold_listings,
    #          median_sell_price)
    date = str(datetime.datetime.now())
    self.long_term_data = (self.id, date, current_listing_count,
                           expired_listing_count, median_sell_price)
    self.fetched_data = True
def scrapeSinglePage():
    """Scrape every property card on the current results page into the
    module-level ``container`` list, then click through to the next page."""
    shortWait()
    allListings = driver.find_elements_by_xpath(
        '//div[contains(@class,\'maincontent \')]//div[contains(@class,\'propertyitem propertyitem--list\')]'
    )
    longWait()
    # Listing kwarg layouts keyed by how many <th> headers a card shows;
    # the i-th <td> cell fills the i-th field name.
    layouts = {
        1: ('ground',),
        2: ('m2', 'rooms'),
        7: ('m2', 'ground', 'rooms', 'yearOfConstruction', 'lengthOfStay',
            'plusMinus', 'rentAndConsumption'),
        8: ('m2', 'ground', 'rooms', 'yearOfConstruction', 'lengthOfStay',
            'plusMinus', 'pricePerM2', 'ownershipCostPerMonth'),
    }
    for currentListing in allListings:
        # need to scroll down with each listing because of the lazy
        # loading of images
        scrollDown()
        imageUrl = currentListing.find_element_by_css_selector(
            'img').get_attribute('src')
        shortWait()
        name = currentListing.find_element_by_class_name(
            'propertyitem__address--listview').text
        shortWait()
        link = currentListing.find_element_by_class_name(
            'propertyitem__link').get_attribute('href')
        shortWait()
        price = currentListing.find_element_by_class_name(
            'propertyitem__price').text
        shortWait()
        price = price.split('\n', 2)[-1]
        # determine type of listing
        numberOfAttributes = len(
            currentListing.find_elements_by_css_selector('th'))
        shortWait()
        info = currentListing.find_elements_by_css_selector('td')
        longWait()
        fields = layouts.get(numberOfAttributes)
        if fields is None:
            print("error")
        else:
            extras = {key: info[idx].text for idx, key in enumerate(fields)}
            newListing = Listing(name=name, link=link, price=price,
                                 imageUrl=imageUrl, **extras)
            #print(vars(newListing))
            container.append(newListing)
        #uncomment to easily see how pagination works
        #break
        shortWait()
    #TODO: check if the button really exists
    nextPageElement = driver.find_element_by_xpath(
        '//ul[contains(@class,\'pagination\')]//li//a[contains(text(),\'Næste\')]'
    ).click()
    shortWait()
def find_listings(self):
    """Return a Listing for every <p class="row"> element in the parsed page.

    :return: list of Listing objects.
    """
    # Fixed: the original copied findAll's result list through an identity
    # comprehension for no reason; build the listings directly.
    rows = self.data.findAll('p', 'row')
    return [Listing(row) for row in rows]
# NOTE(review): fragment of a larger polling loop -- `page`, `page_number`,
# `post_name_xpth_prefix`/`_suffix`, `base_url`, `db`, `mail` and
# `u_of_t_address` are defined outside this chunk.
print('PAGE ' + str(page_number) +
      '============================================')
last_page = page
tree = html.fromstring(page.content)
# Probe up to 100 candidate post-name elements via index-parameterised XPath.
for i in range(0, 100):
    x_pth = post_name_xpth_prefix + str(i) + post_name_xpth_suffix
    name = tree.xpath(x_pth)
    # If this element does not exist, continue
    if len(name) == 0:
        continue
    try:
        lst = Listing(base_url + name[0].attrib['href'])
    except AttributeError as ae:
        continue
    print(lst.get_title())
    # 'poor_err' appears to be a sentinel title for failed parses --
    # TODO confirm against Listing.get_title().
    if lst.get_title() != 'poor_err':
        if not db.listing_exists(lst):
            db.save_listing(lst, u_of_t_address)
            # Notify only for nearby (viability <= 200) non-"Wanted" posts.
            if lst.get_viability(
                    u_of_t_address
            ) <= 200 and 'Wanted: ' not in lst.get_title():
                mail.notify(lst, [
                    "*****@*****.**", "*****@*****.**"
                ], u_of_t_address)
            print('** New listing saved **')
def sell_artwork(self, artwork: Art, price: str):
    """Create and return a Listing offering *artwork* at *price*.

    Returns None when this user does not own the artwork.
    """
    if not self == artwork.owner:
        return None
    return Listing(artwork, price, self)
def getResults(self, options=None, withImg=False, limit=30):
    """
    Page through search results until *limit* listings are collected.

    :param options: query-parameter overrides passed through to
        formatQueryParameters; defaults to no overrides.
    :param withImg: when True, skip ads that have no image.
    :param limit: maximum number of Listing objects to return.
    :return: list of Listing objects.
    """
    # Fixed: a mutable default argument ({}) is shared across calls;
    # use None as the sentinel instead.
    if options is None:
        options = {}
    results = []
    page = 1
    while len(results) < limit:
        payload = self.formatQueryParameters(options, page)
        page += 1
        r = requests.get(self.url, params=payload)
        soup = BeautifulSoup(r.content, 'html.parser')
        for item in soup.find_all('div', {'class': 'item_row'}):
            if len(results) >= limit:
                break
            # Fixed: renamed from `id`, which shadowed the builtin.
            ad_id = item.attrs['id']
            hasImg = not item.find_all('div', {'class': 'no-image'})
            if withImg and not hasImg:
                # We don't want to keep the ad since it hasn't images
                continue
            # Retrieve the type of housing, and the approx location
            category = item.find('span', {
                'class': 'category'
            }).text.strip()
            location = item.find('span', {'class': 'address'}).text.strip()
            # Retrieve the name and the link to the add
            header = item.find('a', {'class': 'item_link'})
            name = header.text.strip()
            link = header.get('href')
            # Retrieve number of rooms, price and size; -1 marks a missing
            # field.
            details = item.find('div', {'class': 'details'})
            priceSpan = details.find('span', {'class': 'monthly_rent'})
            price = int(priceSpan.text.strip()[:-7].replace(
                ' ', '')) if priceSpan is not None else -1
            sizeSpan = details.find('span', {'class': 'size'})
            size = int(
                float(sizeSpan.text.strip()[:-3].replace(',', '.')) //
                1) if sizeSpan is not None else -1
            roomsSpan = details.find('span', {'class': 'rooms'})
            rooms = float(
                roomsSpan.text.strip()[:-4].replace(',', '.').replace(
                    '+', '')) if roomsSpan is not None else -1
            # Retrive the date when the ad was posted
            dateTime = item.find('time', {'class': 'jlist_date_image'})
            date = dateTime.attrs[
                'datetime'] if dateTime is not None else -1
            result = Listing(blocket_id=ad_id,
                             category=category,
                             location=location,
                             name=name,
                             link=link,
                             price=price,
                             size=size,
                             rooms=rooms,
                             date=date)
            results.append(result)
    return results
def display_predicted_price(n_clicks, apt, ec, condo, time, radius,
                            postal_input, property_type, floor_num,
                            floor_area, lease):
    """Dash callback producing the prediction view for a listing.

    When the button has been clicked (``n_clicks`` truthy), builds the
    global ``curr_listing``/``curr_sample`` objects, predicts prices, and
    returns the populated page components; otherwise returns the default
    island-wide view.  Mutates module-level globals and reads pre-built
    map HTML from disk.
    """
    if n_clicks:
        ##### Current Global Listing Object #####
        global curr_listing
        curr_listing = Listing(postal_input, property_type, int(floor_num),
                               float(floor_area), int(lease))
        global price_output, price_psm_output
        # Predicted total price and price-per-sqm from the saved models.
        price_output, price_psm_output = curr_listing.pred_price(
            "modelling/", cols, postal_code_area, area_df, sch, train,
            police_centre, avg_cases)
        # For testing
        #curr_listing = Listing('597592', 'Condominium', 6, 99, 70)
        #curr_listing = Listing('689527', 'Condominium', 6, 99, 70)
        ##### Parameters of Sample Object #####
        # One-hot pairs: [past-5-years, past-10-years] / [1km, 2km].
        time_param = [0, 0]
        if (time == 'Past 5 Years'):
            time_param[0] = 1
        elif (time == 'Past 10 Years'):
            time_param[1] = 1
        radius_param = [0, 0]
        if (radius == 'Within 1km'):
            radius_param[0] = 1
        elif (radius == 'Within 2km'):
            radius_param[1] = 1
        ec_param, condo_param, apt_param = 0, 0, 0
        # Setting default property_filter to property_type of listing
        if ((not apt) and (not condo) and (not ec)):
            if (property_type == 'Condominium'):
                condo_param = 1
            elif (property_type == 'Apartment'):
                apt_param = 1
            elif (property_type == 'Executive Condominium'):
                ec_param = 1
        else:
            if ec:
                ec_param = 1
            if condo:
                condo_param = 1
            if apt:
                apt_param = 1
        ##### Current Global Sample Object #####
        global curr_sample
        params = {
            'radius': radius_param,
            'property': [ec_param, condo_param, apt_param],
            'time': time_param
        }
        curr_sample = Sample(params, prelim_ds)
        curr_sample.get_filtered_df(prelim_ds,
                                    curr_listing.get_lon(postal_code_area),
                                    curr_listing.get_lat(postal_code_area))
        # get_map writes 'sample_map.html' to disk; it is read back below.
        curr_sample.get_map(curr_listing.get_lon(postal_code_area),
                            curr_listing.get_lat(postal_code_area),
                            price_psm_output, curr_listing.get_building(),
                            curr_listing.get_road_name(), 100)
        map_component = html.Iframe(srcDoc=open('sample_map.html',
                                                'r').read(),
                                    height='600')
        transaction_table = curr_sample.get_transaction_table()
        psm_timeseries_plot = html.Div([
            html.Div([
                'Aggregated resale market conditions for ',
                html.B(
                    curr_listing.get_planning_area(postal_code_area,
                                                   area_df).title()),
                " planning area together with its 2 closest neighbours in the past "
                + str(curr_sample.get_time()) + ' years'
            ],
                     style={'font-size': 'medium'}),
            html.Div(
                'Only resale transactions of ' + ", ".join([
                    property + "s"
                    for property in curr_sample.get_property()
                ]) +
                " within each planning area are included within the computation",
                style={'font-size': 'medium'}),
            curr_sample.plot_psm(
                prelim_ds, area_df,
                curr_listing.get_planning_area(postal_code_area, area_df),
                2),
        ])
        return [
            overview_section(curr_listing, price_output, price_psm_output),
            curr_listing.get_planning_area(postal_code_area,
                                           area_df).title(),
            transaction_features(curr_sample), map_component,
            transaction_table, psm_timeseries_plot,
            [
                'All resale transactions of ' + ", ".join([
                    property + "s"
                    for property in curr_sample.get_property()
                ]) + " in the past ",
                html.B(str(curr_sample.get_time()) + " years"),
                " that are within a radius of ",
                html.B(str(curr_sample.get_radius()) + "km"),
                " from your property"
            ]
        ]
    #### Default output
    # Map
    map_component = html.Iframe(srcDoc=open('assets/default_map.html',
                                            'r').read(),
                                height='600')
    # Timeseries
    filtered_df = prelim_ds.copy()
    filtered_df['Sale Month'] = filtered_df['Sale Date'].apply(
        lambda x: x.strftime('%Y-%m'))  # to plot based on Year and Month
    filtered_df['Sale Year'] = filtered_df['Sale Date'].apply(
        lambda x: x.year)  # to plot based on Year
    grp_df = filtered_df.groupby(['Sale Month',
                                  'Planning Area']).mean().reset_index()
    fig = px.line(
        grp_df,
        x="Sale Month",
        y="PPI",
        #color='Planning Area',
        labels={
            "Sale Month": "Year",
            "PPI": "Property Price Index"
        })
    fig.update_layout(plot_bgcolor='#f8f4f0')
    # To control white space surrounding the plot
    fig.update_layout(margin={'t': 15, 'b': 20, 'l': 20, 'r': 30})
    fig.update_layout(height=450)
    ts_plot = dcc.Graph(figure=fig)
    # Transaction Table
    df = prelim_ds[[
        'Sale Date', 'Address', 'Floor Number', 'Area (SQFT)',
        'Remaining Lease', 'Unit Price ($ PSF)'
    ]].copy()
    df = df.rename(columns={
        'Area (SQFT)': 'Floor Area',
        'BUILDING': 'Building Name'
    })
    df = df.sort_values(by=['Sale Date'], ascending=False).head(100)
    df['Sale Date'] = df['Sale Date'].apply(lambda x: x.date())
    table = dash_table.DataTable(
        data=df.to_dict('records'),
        columns=[{
            'id': c,
            'name': c
        } for c in df.columns],
        # Remove Pagination
        page_action='none',
        #For sorting by columns
        sort_action="native",
        # For filtering rows by column values
        filter_action="native",
        #style_as_list_view=True,
        style_table={
            'max-height': '400px',
            'font-size': '13px'
        },
        style_cell={
            'textAlign': 'center',
            'font-family': 'sans-serif',
            'width': '{}%'.format(len(df.columns))
            #'minWidth': '20px', 'width': '20px', 'maxWidth': '200px'
        },
        #Controilling width of columns
        style_cell_conditional=[
            {
                'if': {
                    'column_id': 'Sale Date'
                },
                'width': '5%'
            },
            {
                'if': {
                    'column_id': 'Address'
                },
                'width': '5.5%'
            },
        ],
        style_data={'padding-left': 7},
        #striped rows
        style_data_conditional=[{
            'if': {
                'row_index': 'even'
            },
            'backgroundColor': '#f2f2ed'  #'lightgrey'
        }],
        #Fixed row for when scrolling vertically
        fixed_rows={'headers': True},
        style_header={
            'backgroundColor': 'rgb(255, 255, 255)',
            'fontWeight': 'bold',
            'padding-left': 7
        },
    )
    transaction_table = html.Div([
        html.Div('Past 100 Recent Transactions',
                 style={
                     'padding-bottom': 2,
                     'font-size': 'xx-large'
                 }), table
    ])
    return [
        "", 'Island Wide',
        transaction_features(full_sample), map_component,
        transaction_table, ts_plot,
        "Showing all resale transactions of Apartments, Condominiums, Executive Condominiums within the past 10 years"
    ]