def test_datetime_parser_uses_relative_keywords():
    """Relative phrases like '4 days' should be resolved against the injected clock.

    With 'now' pinned to 2042-01-05 UTC, a listing reported as being
    '4 days on market' must parse to 2042-01-01 UTC.
    """
    frozen_clock = MagicMock(spec=timezone)
    frozen_clock.now = MagicMock(
        return_value=datetime.datetime(2042, 1, 5, tzinfo=utc))

    parsed = datetime_parser.get_datetime(
        '4 days on market', _datetime_service=frozen_clock)

    assert parsed == datetime.datetime(2042, 1, 1, tzinfo=utc)
def item_scraped(self, item, spider):
    """Stop the crawl when a re-encountered ('DOUBLE'-prefixed) listing looks like a true duplicate.

    Hooked to Scrapy's item_scraped signal (receives the scraped item and the
    spider that produced it). A URL prefixed with 'DOUBLE' marks a listing that
    was already seen; the real URL follows the prefix. The spider is only shut
    down when three conditions all hold: the stored duplicate is at least an
    hour old, its parsed last_updated_date matches what we just scraped, and
    its 'changed' timestamp is also at least an hour old.

    NOTE(review): assumes item['last_updated_date'] is a list of raw date
    strings and that duplicate.created_date / duplicate.changed are aware
    datetimes comparable with timezone.now() — confirm against the Listing model.
    """
    url = item['url']
    # 'DOUBLE' prefix flags an already-seen listing; strip it to get the stored URL.
    if url[0:6] == 'DOUBLE':
        # .get() raises if the flagged duplicate is not actually stored —
        # presumably the upstream pipeline guarantees it exists; verify.
        duplicate = Listing.objects.filter(url=url[6:]).get()
        if timezone.now() - duplicate.created_date >= timedelta(hours=1):
            # if this listing is being crawled again, but we just recently created it,
            # don't stop crawling. This could happen if
            # we first encounter it on page 100.html but new posts appeared and forced this to 200.html, the next page.
            #get first item in last_updated_date or None
            if next((datetime_parser.get_datetime(x) for x in item['last_updated_date']), None) == \
                    duplicate.last_updated_date:
                # if this listing was created a long time ago but just recently re-newed or re-updated.
                if timezone.now() - duplicate.changed >= timedelta(hours=1):
                    # This might happen if a listing was added a long time ago but then was re-posted today. Then we started
                    # crawling and stumbled upon this listing and it was on, say, page 100.html. So we update the
                    # last_updated_date to today. As we reach 200.html, this SAME, listing appears again because new posts
                    # pushed
                    # it from page 100.html to page 200.html. If we now see it again, we know we still have to keep crawling.
                    # However, we should inspect something other than listing.changed because changed could be updated from the
                    # admin and would cause us to continue parsing even though we should've stopped. So we could perhaps store
                    # that value in the last_updated_date_recorded_datetime prop. But the likelihood of that happening is rare
                    # because we'd need to manually modify a listing minutes after it shows up as a re-post.
                    self.crawler.engine.close_spider(spider, 'duplicate listing found: {0}'.format(url))
def update_listing(**listing_attrs):
    """Update the stored listing matching listing_attrs['url'] and persist it.

    If 'last_updated_date' holds any raw date strings, the first one is parsed
    and applied to the listing before saving. Returns the (saved) listing.
    """
    listing = get_listing_by_url(listing_attrs['url'])

    raw_dates = listing_attrs['last_updated_date']
    if raw_dates:
        # Only the first reported date matters; parse it into a datetime.
        parsed_date = datetime_parser.get_datetime(raw_dates[0])
        listing.update_last_updated_date(parsed_date)

    save_or_update(listing)
    return listing
def test_datetime_parser_detects_correct_timezone(input_values, expected):
    """Parametrized: each raw input must parse to its expected (tz-aware) datetime."""
    parsed = datetime_parser.get_datetime(input_values)
    assert parsed == expected