import datetime
from unittest.mock import MagicMock

from django.utils import timezone  # assumed: Django, matching the ORM usage below
from django.utils.timezone import utc

def test_datetime_parser_uses_relative_keywords():
  # Pin now() to 2042-01-05 so "4 days" back resolves to 2042-01-01.
  timezone_mock = MagicMock(spec=timezone)
  timezone_mock.now = MagicMock(return_value=datetime.datetime(2042, 1, 5, tzinfo=utc))

  expected = datetime.datetime(2042, 1, 1, tzinfo=utc)
  actual = datetime_parser.get_datetime('4 days on market', _datetime_service=timezone_mock)
  assert expected == actual
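For reference, `_datetime_service` is an injectable clock, which is what lets the test above pin now(). Below is a minimal sketch of a relative-keyword branch consistent with that test; the real `datetime_parser.get_datetime` (whose import path is project-specific) is not shown here, so the regex and the absolute-date fallback are assumptions:

import datetime
import re

from django.utils import timezone

_RELATIVE_RE = re.compile(r'(\d+)\s+days?')

def get_datetime(raw_value, _datetime_service=timezone):
  # 'N days ...' is read as N days before the injected service's now().
  match = _RELATIVE_RE.search(raw_value)
  if match:
    return _datetime_service.now() - datetime.timedelta(days=int(match.group(1)))
  # Absolute dates would be parsed here (e.g. with dateutil); omitted.
  return None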
Example #2
  def item_scraped(self, item, spider):
    url = item['url']
    if url[0:6] == 'DOUBLE':
      # A 'DOUBLE' prefix marks a URL we have already stored; strip it and
      # fetch the existing listing.
      duplicate = Listing.objects.get(url=url[6:])

      if timezone.now() - duplicate.created_date >= timedelta(hours=1):
        # If this listing is being crawled again but we only just created it,
        # don't stop crawling. That can happen when we first encounter it on
        # page 100.html and new posts then push it onto 200.html, the next page.

        # Take the first parsed last_updated_date, or None if there is none.
        scraped_last_updated = next(
            (datetime_parser.get_datetime(x) for x in item['last_updated_date']), None)
        if scraped_last_updated == duplicate.last_updated_date:
          # This listing was created a long time ago and was only recently
          # renewed or re-updated.

          if timezone.now() - duplicate.changed >= timedelta(hours=1):
            # This can happen when a listing added long ago is re-posted today.
            # We start crawling, stumble upon it on, say, page 100.html, and
            # update its last_updated_date. As we reach 200.html, this same
            # listing appears again because new posts pushed it from page
            # 100.html to page 200.html. Seeing it again there means we still
            # have to keep crawling. Ideally we would inspect something other
            # than listing.changed, because `changed` can also be updated from
            # the admin, which would keep us parsing when we should have
            # stopped. We could store that value in a
            # last_updated_date_recorded_datetime prop instead, but the
            # likelihood of a clash is low: a listing would have to be manually
            # modified minutes after it shows up as a re-post.

            self.crawler.engine.close_spider(spider, 'duplicate listing found: {0}'.format(url))
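The comment above proposes recording when the scraper itself last wrote last_updated_date, so the duplicate check need not rely on `changed` (which admin edits can also bump). A minimal sketch of that idea on a Django model follows; `update_last_updated_date` is the method called in update_listing below, while the `last_updated_date_recorded_datetime` field is the hypothetical prop from the comment, not an existing part of the project:

from django.db import models
from django.utils import timezone

class Listing(models.Model):
  url = models.URLField(unique=True)
  created_date = models.DateTimeField(auto_now_add=True)
  changed = models.DateTimeField(auto_now=True)  # also bumped by admin edits
  last_updated_date = models.DateTimeField(null=True)
  # Hypothetical field from the comment above: when *we* last recorded
  # last_updated_date, independent of `changed`.
  last_updated_date_recorded_datetime = models.DateTimeField(null=True)

  def update_last_updated_date(self, value):
    self.last_updated_date = value
    self.last_updated_date_recorded_datetime = timezone.now()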
def update_listing(**listing_attrs):
  url = listing_attrs['url']
  listing = get_listing_by_url(url)

  # last_updated_date arrives as a list of raw scraped strings (possibly empty).
  last_updated_date = listing_attrs['last_updated_date']

  if last_updated_date:
    listing.update_last_updated_date(datetime_parser.get_datetime(last_updated_date[0]))
    save_or_update(listing)

  return listing
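A usage sketch, assuming attrs shaped like the scraped item handled in item_scraped above (the URL and date string are illustrative, not from the project):

listing = update_listing(
    url='https://example.com/listing/123',   # hypothetical URL
    last_updated_date=['4 days on market'],  # raw scraped strings
)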
def test_datetime_parser_detects_correct_timezone(input_values, expected):
  # input_values/expected are supplied by @pytest.mark.parametrize (sketch below).
  assert expected == datetime_parser.get_datetime(input_values)
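The input_values/expected pairs come from a parametrize decorator that is not shown above. A sketch of what that decorator could look like; the input string and expected datetime are illustrative assumptions, not values taken from the project:

import datetime

import pytest
from django.utils.timezone import utc

@pytest.mark.parametrize('input_values, expected', [
  # Hypothetical sample: an explicit UTC offset should yield an aware datetime.
  ('2042-01-05T12:00:00+00:00', datetime.datetime(2042, 1, 5, 12, tzinfo=utc)),
])
def test_datetime_parser_detects_correct_timezone(input_values, expected):
  assert expected == datetime_parser.get_datetime(input_values)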