예제 #1
0
def test_posting(data_to_resp):
    """Run one parsed posting through the Postgres pipeline four times.

    The spider's ``datetime`` is patched so ``utcnow()`` returns one fixed
    instant for the whole test. After the fourth pass the highest
    ``PostingRevision`` id is asserted to be 4.
    """
    listing_type = "apa"
    path = "posting_realty_20200624.html"
    url = "https://vancouver.craigslist.org/rds/reb/d/surrey-hottest-deal-of-the-week-5-bed-4/7147938227.html"
    response = data_to_resp(url, path, listing_type=listing_type)

    frozen_now = datetime.utcnow()
    with patch("clscraper.spiders.posting.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = frozen_now
        spider = PostingSpider()
        item = next(spider.parse(response))

        # A fresh pipeline instance handles each of the four passes.
        for _ in range(4):
            PostgresPipeline().process_item(item, spider)

        with session_scope() as session:
            highest_row = max(session.query(PostingRevision.id))
            assert highest_row[0] == 4
예제 #2
0
def test_posting_revision():
    """Store a posting with two revisions and check the revisions keep ids 1 and 2."""
    from datetime import datetime
    from clscraper.models import Posting, PostingRevision

    with session_scope() as session:
        # Column values shared by the posting and both of its revisions.
        shared_fields = {
            "title": "Testing",
            "url": "Testing",
            "partial_scrape": True,
            "datetime_scraped": datetime.utcnow(),
            "listing_type": "apa",
        }
        parent = Posting(id=1, **shared_fields)
        first_revision = PostingRevision(id=1, posting_id=parent.id, **shared_fields)
        second_revision = PostingRevision(
            id=first_revision.id + 1,
            posting_id=parent.id,
            **shared_fields,
        )

        # The posting is flushed before the revisions that reference its id.
        session.add(parent)
        session.flush()

        session.add(first_revision)
        session.add(second_revision)
        session.flush()

        assert first_revision.id == 1
        assert second_revision.id == 2
예제 #3
0
    def parse(self, response: HtmlResponse):
        """Scrape partial HousingListing items from a craigslist list page.

        Every ``.result-row`` becomes a ``HousingListing`` with
        ``partial_scrape=True``. Listings whose id is already stored in the
        ``Posting`` table are skipped (and logged at debug level); the rest are
        yielded. The posting spider is expected to pick up those partial scrapes
        later and fill out the rest of each posting.

        Pagination: the page budget is read from
        ``response.meta["number_of_pages_to_scrape"]`` when that key is present,
        otherwise from ``self.number_of_pages_to_scrape``. A falsy budget stops
        pagination, ``-1`` is passed along unchanged, and any other value is
        decremented by one for the follow-up request.

        Args:
            response: The list page to scrape.

        Yields:
            ``HousingListing`` items that are not in the database yet, followed
            by at most one ``Request`` for the next results page.
        """
        # The currency depends on the page, not on an individual row, so it is
        # determined once up front. Every line is scanned and the last
        # recognised `areaCountry` assignment wins.
        currency = None
        for line in response.text.split("\n"):
            country_match = re.match(r'.*areaCountry = "(.*)".*', line)
            if not country_match:
                continue
            country = country_match.group(1)
            if country.startswith("CA"):
                currency = "CAD"
            elif country.startswith("US"):
                currency = "USD"

        # go through each listing row and build a list of listing items
        listings = []
        for result in response.css(".result-row"):

            # Bedrooms and floor area come from the housing div, whose text is
            # split on "-" into segments such as `2br` or `900ft`. Segments that
            # match neither shape are ignored instead of crashing int().
            rooms = None
            floor_area = None
            floor_area_units = None
            housing = result.css(".result-meta .housing::text").get()
            if housing:
                for value in (segment.strip() for segment in housing.split("-")):
                    rooms_match = re.match(r"([0-9]+)br", value)
                    if rooms_match:
                        rooms = int(rooms_match.group(1))
                        continue
                    area_match = re.fullmatch(r"([0-9]+)\s*(ft|m)", value)
                    if area_match:
                        floor_area = int(area_match.group(1))
                        floor_area_units = area_match.group(2)

            # NOTE(review): the helper's name suggests it returns a dict, yet the
            # code below treats its result as a string - confirm against
            # HousingListing.location_str_to_dict.
            location = HousingListing.location_str_to_dict(result.css(".result-hood::text").get())
            if location:
                location = location.strip()
                # location looks like `(Vancouver)`; remove the parens.
                # startswith/endswith stay safe if stripping left an empty string.
                if location.startswith("("):
                    location = location[1:]
                if location.endswith(")"):
                    location = location[:-1]

            # A row may carry no price, and the text may contain thousands
            # separators (e.g. `$1,200`); take the first digit group and drop
            # the commas. The price stays None when nothing usable is found.
            price = None
            price_text = result.css(".result-meta > .result-price::text").get()
            if price_text:
                price_match = re.search(r"[0-9][0-9,]*", price_text)
                if price_match:
                    price = int(price_match.group(0).replace(",", ""))

            title_link = result.css(".result-title")
            listing = HousingListing(
                id=int(title_link.attrib["data-id"]),
                url=title_link.attrib["href"],
                title=result.css(".result-title::text").get(),
                price=price,
                price_currency=currency,
                bedrooms=rooms,
                floor_area=floor_area,
                floor_area_units=floor_area_units,
                location=[location],
                datetime_scraped=datetime.utcnow(),
                partial_scrape=True,
                listing_type=self.listing_type
            )
            listings.append(listing)

        # Look up which of the scraped ids are already stored. The result is
        # materialised into a set so the session is closed before anything is
        # yielded and membership checks are O(1).
        scraped_ids = [listing["id"] for listing in listings]
        known_ids = set()
        if scraped_ids:
            with session_scope() as session:
                rows = session.query(Posting.id).filter(Posting.id.in_(scraped_ids)).all()
                known_ids = {row[0] for row in rows}

        for listing in listings:
            if listing["id"] in known_ids:
                logging.debug("Found an already scraped listing id=%s", listing["id"])
            else:
                yield listing

        # The remaining page budget travels in meta; without it the spider's
        # configured default applies.
        number_of_pages_to_scrape = response.meta.get(
            "number_of_pages_to_scrape", self.number_of_pages_to_scrape
        )

        # if we configured it, scrape X pages (-1 is truthy and never decremented)
        next_anchor = response.css("a.next.button")
        if number_of_pages_to_scrape and next_anchor:
            url = urljoin(response.url, next_anchor.attrib["href"])
            if number_of_pages_to_scrape == -1:
                remaining = number_of_pages_to_scrape
            else:
                remaining = number_of_pages_to_scrape - 1
            yield Request(url, meta=dict(number_of_pages_to_scrape=remaining))
예제 #4
0
def test_connection():
    """Smoke-test the database session by running ``select 1``."""
    with session_scope() as session:
        rows = list(session.execute("select 1"))
        first_row = rows[0]
        assert first_row[0] == 1
예제 #5
0
 def start_requests(self):
     """Yield one request per stored posting whose ``partial_scrape`` flag is set.

     Each request targets the posting's url and carries the posting's
     ``listing_type`` in its meta.
     """
     with session_scope() as session:
         partial_postings = session.query(Posting).filter(Posting.partial_scrape).all()
         for partial in partial_postings:
             request_meta = dict(
                 listing_type=partial.listing_type
             )
             yield scrapy.Request(partial.url, meta=request_meta)