def parse_post(post, craigslist_area_name):
    """Convert one search-result element into a ``RegularSearchPost``.

    Args:
        post: Element for a single search result. It is queried with
            ``.get()`` for the ``data-pid`` / ``data-repost-of`` attributes
            and with ``.cssselect()`` for the nested fields.
        craigslist_area_name: Key into the mapping returned by
            ``get_areas()`` (used to look up the area's timezone); also
            passed to ``get_url_base()`` to absolutize relative links.

    Returns:
        A ``RegularSearchPost`` built from the id, title, url, repost id,
        price, bedroom count, UTC ISO-8601 date and area of the post.
        ``repost_id``, ``price``, ``bedrooms`` and ``area`` are ``None`` when
        the corresponding markup is absent.

    Raises:
        IndexError: If the element has no ``<time>`` child or no
            ``p.result-info > a`` title link.
    """
    areas = get_areas()
    area_timezone = areas[craigslist_area_name]['timezone']

    pid = int(post.get('data-pid'))
    repost_raw = post.get('data-repost-of')
    repost_pid = int(repost_raw) if repost_raw else None

    # The page timestamp carries no zone of its own here: it is re-labelled
    # with the area's timezone and then converted to UTC.
    date_orig = post.cssselect('time')[0].get('datetime')
    date = arrow.get(date_orig).replace(
        tzinfo=area_timezone).to('utc').isoformat()

    # The title link provides both the URL and the title text.
    link_el = post.cssselect("p.result-info > a")[0]
    url = link_el.get('href')
    # Test the scheme prefix rather than a substring: a relative path that
    # merely contains "http" (e.g. in its slug) must still get the base URL.
    if not url.startswith(("http://", "https://")):
        url = get_url_base(craigslist_area_name) + url
    url = http_to_https(url)
    title = link_el.text

    price_el = get_only_first_or_none(
        post.cssselect("span.result-meta > span.result-price"))
    price_raw = price_el.text if price_el is not None else None
    # Strip the thousands separator as well as the currency sign so that
    # "$1,200" parses instead of raising ValueError.
    price = (int(price_raw.replace("$", "").replace(",", ""))
             if price_raw else None)

    # The housing text is a "-\n"-separated list of tokens; the ones
    # containing "br" and "ft" are picked out below.
    housing_el = get_only_first_or_none(
        post.cssselect("p.result-info > span > span.housing"))
    if housing_el is not None:
        housing = [
            part.strip() for part in housing_el.text.split("-\n")
            if part.strip()
        ]
    else:
        housing = []

    bedrooms_raw = get_only_first_or_none([x for x in housing if "br" in x])
    num_bedrooms = (int(bedrooms_raw.replace("br", ""))
                    if bedrooms_raw else None)

    area_raw = get_only_first_or_none([x for x in housing if "ft" in x])
    area = int(area_raw.replace("ft", "")) if area_raw else None

    return RegularSearchPost(
        id=pid,
        title=title,
        url=url,
        repost_id=repost_pid,
        price=price,
        bedrooms=num_bedrooms,
        date=date,
        area=area,
    )
def process_post_url_output(body):
    """Parse the HTML of a single posting page into a ``DetailPost``.

    Args:
        body: The page HTML as text.

    Returns:
        A ``DetailPost`` with the post id (int), repost id (matched digits
        as a string, or ``None``), canonical https URL, full and short
        titles, neighbourhood, bedroom count, square footage, price (string
        with ``$`` removed, or ``None``), address, and the posting body as
        both HTML and plain text.

    Raises:
        CraigslistException: If the page is a "post not found" / "Page Not
            Found" page, or if no ``pID`` variable is present in the page.
        IndexError: If the canonical link, the title elements or the
            ``#postingbody`` element are missing.
    """
    if ("<title>craigslist | post not found</title>" in body
            or '<title>craigslist | Page Not Found</title>' in body):
        raise CraigslistException("post not found")

    # re.search returns None on a miss, so test the match explicitly; going
    # straight to .groups() would raise AttributeError, not IndexError.
    id_match = re.search(r'var pID = "(\d+)";', body)
    if id_match is None:
        raise CraigslistException("post id not found on page")
    id_ = int(id_match.group(1))

    # NOTE(review): repost_id stays a string while id_ is an int; kept that
    # way for backward compatibility -- confirm what callers expect.
    repost_match = re.search(r'var repost_of = (\d+);', body)
    repost_id = repost_match.group(1) if repost_match is not None else None

    doc = lxml.html.fromstring(body)
    url = http_to_https(doc.cssselect("link[rel=canonical]")[0].get('href'))

    # Every child of the title span except the last one contributes to the
    # full title.
    title_el = doc.cssselect("h2.postingtitle span.postingtitletext")[0]
    full_title = " ".join(
        [child.text_content() for child in title_el.getchildren()[:-1]])
    short_title = doc.cssselect(
        "h2.postingtitle span.postingtitletext #titletextonly")[0].text

    # TODO: deal with international prices
    price_els = doc.cssselect(
        "h2.postingtitle span.postingtitletext .price")
    price = price_els[0].text.replace('$', '') if price_els else None

    num_bedrooms, area = None, None
    housing_els = doc.cssselect(
        "h2.postingtitle span.postingtitletext .housing")
    if housing_els:
        try:
            num_bedrooms, area = parse_housing_el(
                housing_els[0].text.replace('/ ', ''))
        except Exception:
            # Deliberately best-effort: an unparseable housing string
            # leaves both values unset rather than failing the whole page.
            num_bedrooms, area = None, None

    hood_els = doc.cssselect(
        "h2.postingtitle span.postingtitletext #titletextonly + small")
    if hood_els:
        hood = hood_els[0].text.strip().lstrip('(').rstrip(')')
    else:
        hood = None

    address_els = doc.cssselect("div.mapaddress")
    address = address_els[0].text if address_els else None

    body_el = doc.cssselect("#postingbody")[0]
    # Strip the print-only QR-code container(s). drop_tree() is used instead
    # of parent.remove(): remove() also discards the element's tail text
    # (any text that directly follows the container) and fails when the
    # element is not a direct child. Pages without the container are fine.
    for qr_el in body_el.cssselect('div.print-qrcode-container'):
        qr_el.drop_tree()
    body_html = lxml.html.tostring(body_el).decode('utf-8')
    body_text = body_el.text_content().strip()

    # Not extracted yet: attribute groups ("div.mapAndAttrs p.attrgroup")
    # and image links ("#thumbs a" hrefs).
    return DetailPost(
        id=id_,
        repost_id=repost_id,
        url=url,
        full_title=full_title,
        short_title=short_title,
        hood=hood,
        num_bedrooms=num_bedrooms,
        sqftage=area,
        price=price,
        body_html=body_html,
        body_text=body_text,
        address=address,
    )