Example #1
0
  def post(self):
    artwork_json = json.loads(self.request.get('json'))

    publish_date = (datetime.datetime
        .utcfromtimestamp(artwork_json['publishDate'] / 1000)
        .date())
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
      webapp2.abort(409, message='Artwork already exists for this date.')

    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))

    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        publish_date.strftime('%Y%m%d') + ' '
            + artwork_json['title'] + ' '
            + artwork_json['byline'])

    if not new_thumb_url and 'thumbUri' in artwork_json:
      new_thumb_url = artwork_json['thumbUri']
    new_artwork = FeaturedArtwork(
        title=artwork_json['title'],
        byline=artwork_json['byline'],
        attribution=artwork_json['attribution'] if 'attribution' in artwork_json else None,
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
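For context, the handlers in these examples rely on a maybe_process_image helper that is not included in the snippets. Most call sites pass three arguments (source image URL, a crop tuple of floats, and a descriptive base name built from the publish date, title, and byline) and unpack a pair of URLs; an older variant in Example #16 passes only two. The sketch below is a hypothetical stand-in that only documents that inferred contract, not the real implementation:

def maybe_process_image(image_url, crop_tuple, base_name):
  """Hypothetical sketch: returns (new_image_url, new_thumb_url).

  Inferred from the call sites only. A falsy thumbnail URL tells callers to
  fall back to the incoming 'thumbUri' value.
  """
  # Assumption: when no processing is needed, the original URL is returned
  # unchanged and no thumbnail is generated.
  return image_url, None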
Example #2
0
  def process_html(self, url, html):
    soup = BeautifulSoup(html)

    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']

    if not title or not author or not image_url:
      self.response.out.write('Could not parse HTML')
      self.response.set_status(500)
      return

    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
Example #3
0
    def process_html(self, url, html):
        soup = BeautifulSoup(html)

        details_url = re.sub(r"#.+", "", url, re.I | re.S) + "?utm_source=Muzei&utm_campaign=Muzei"
        title = soup.select("h1 span")[0].get_text()
        author = soup.find(itemprop="author").get_text()
        completion_year_el = soup.find(itemprop="dateCreated")
        byline = author + ((", " + completion_year_el.get_text()) if completion_year_el else "")
        image_url = soup.find(id="paintingImage")["href"]

        if not title or not author or not image_url:
            self.response.out.write("Could not parse HTML")
            self.response.set_status(500)
            return

        publish_date = datetime.datetime.utcfromtimestamp(int(self.request.get("publishDate")) / 1000).date()
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE, publish_date.strftime("%Y%m%d") + " " + title + " " + byline
        )

        # create the artwork entry
        new_artwork = FeaturedArtwork(
            title=title,
            byline=byline,
            image_url=image_url,
            thumb_url=thumb_url,
            details_url=details_url,
            publish_date=publish_date,
        )
        new_artwork.save()
        self.response.set_status(200)
Example #4
0
    def post(self):
        artwork_json = json.loads(self.request.get("json"))

        publish_date = datetime.datetime.utcfromtimestamp(artwork_json["publishDate"] / 1000).date()
        if FeaturedArtwork.all().filter("publish_date=", publish_date).get() != None:
            webapp2.abort(409, message="Artwork already exists for this date.")

        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json["imageUri"],
            crop_tuple,
            publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"],
        )

        if not new_thumb_url and "thumbUri" in artwork_json:
            new_thumb_url = artwork_json["thumbUri"]
        new_artwork = FeaturedArtwork(
            title=artwork_json["title"],
            byline=artwork_json["byline"],
            attribution=artwork_json["attribution"] if "attribution" in artwork_json else None,
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork_json["detailsUri"],
            publish_date=publish_date,
        )
        new_artwork.save()
        self.response.set_status(200)
Example #5
0
  def get(self):
    with open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')) as artworks_file:
      ARTWORKS = json.load(artworks_file)

    # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS)

    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))

    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)

    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]

    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

    logging.debug('starting blacklist size: %d' % len(blacklist))

    chosen_artworks = []

    for target_date in target_dates:
      # Pick from available artworks, excluding artwork in the blacklist
      random_artwork = None
      while True:
        if len(ARTWORKS) == 0:
          logging.error('Ran out of artworks to choose from, cannot continue')
          return

        random_artwork = random.choice(ARTWORKS)
        key = artwork_key(random_artwork['detailsUri'])
        if key not in blacklist:
          # Once chosen, remove it from the list of artworks to choose next
          ARTWORKS.remove(random_artwork)
          chosen_artworks.append(random_artwork)
          break

      target_details_url = str(random_artwork['detailsUri'])
      logging.debug('%(date)s: setting to %(url)s' % dict(url=target_details_url, date=target_date))

      # Store the new artwork
      if self.request.get('dry-run', '') != 'true':
        new_artwork = FeaturedArtwork(
            title=random_artwork['title'],
            byline=random_artwork['byline'],
            attribution=random_artwork['attribution'],
            image_url=random_artwork['imageUri'],
            thumb_url=random_artwork['thumbUri'],
            details_url=random_artwork['detailsUri'],
            publish_date=target_date)
        new_artwork.save()

    if self.request.get('output', '') == 'html':
      self.response.out.write(get_html(artworks_json=json.dumps(chosen_artworks)))

    # Finish up
    logging.debug('done')
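The blacklist in Example #5 is keyed by artwork_key(details_url), a helper that does not appear in these snippets. A hypothetical sketch consistent with how it is used (mapping the same artwork to one key even when the URL carries a fragment or query string) could look like this; the exact normalization rules are an assumption:

import re

def artwork_key(details_url):
  # Hypothetical: strip fragments/query strings and trailing slashes so the
  # same artwork URL always yields the same blacklist key.
  return re.sub(r'[#?].*$', '', details_url).rstrip('/').lower()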
Example #6
0
    def process_html(self, url, html):
        soup = BeautifulSoup(html)

        details_url = re.sub(r'#.+', '', url,
                             flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
        title = soup.find(itemprop='name').get_text()
        author = soup.find(itemprop='author').get_text()
        completion_year_el = soup.find(itemprop='dateCreated')
        byline = author + ((', ' + completion_year_el.get_text())
                           if completion_year_el else '')
        image_url = soup.find(id='paintingImage')['href']

        if not title or not author or not image_url:
            self.response.out.write('Could not parse HTML')
            self.response.set_status(500)
            return

        publish_date = (datetime.datetime.utcfromtimestamp(
            int(self.request.get('publishDate')) / 1000).date())
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE,
            publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

        # create the artwork entry
        new_artwork = FeaturedArtwork(title=title,
                                      byline=byline,
                                      image_url=image_url,
                                      thumb_url=thumb_url,
                                      details_url=details_url,
                                      publish_date=publish_date)
        new_artwork.save()
        self.response.set_status(200)
Example #7
0
  def post(self):
    artwork_json = json.loads(self.request.get('json'))
    crop_tuple = tuple(float(x) for x in json.loads(self.request.get('crop')))
    publish_date = (datetime.datetime
        .utcfromtimestamp(artwork_json['publishDate'] / 1000)
        .date())

    new_image_url, new_thumb_url = maybe_process_image(
        artwork_json['imageUri'],
        crop_tuple,
        publish_date.strftime('%Y%m%d') + ' '
            + artwork_json['title'] + ' '
            + artwork_json['byline'])

    if not new_thumb_url and 'thumbUri' in artwork_json:
      new_thumb_url = artwork_json['thumbUri']
    new_artwork = FeaturedArtwork(
        title=artwork_json['title'],
        byline=artwork_json['byline'],
        image_url=new_image_url,
        thumb_url=new_thumb_url,
        details_url=artwork_json['detailsUri'],
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
Example #8
0
    def post(self):
        artwork_json = json.loads(self.request.get('json'))

        publish_date = (datetime.datetime.utcfromtimestamp(
            artwork_json['publishDate'] / 1000).date())
        if FeaturedArtwork.all().filter('publish_date =',
                                        publish_date).get() is not None:
            webapp2.abort(409, message='Artwork already exists for this date.')

        crop_tuple = tuple(
            float(x) for x in json.loads(self.request.get('crop')))

        new_image_url, new_thumb_url = backroomarthelper.maybe_process_image(
            artwork_json['imageUri'], crop_tuple,
            publish_date.strftime('%Y%m%d') + ' ' + artwork_json['title'] +
            ' ' + artwork_json['byline'])

        if not new_thumb_url and 'thumbUri' in artwork_json:
            new_thumb_url = artwork_json['thumbUri']
        new_artwork = FeaturedArtwork(
            title=artwork_json['title'],
            byline=artwork_json['byline'],
            attribution=artwork_json['attribution']
            if 'attribution' in artwork_json else None,
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork_json['detailsUri'],
            publish_date=publish_date)
        new_artwork.save()
        self.response.set_status(200)
Example #9
0
def add_art_from_external_details_url(publish_date, url):
  if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
    webapp2.abort(409, message='Artwork already exists for this date.')

  result = urlfetch.fetch(url)
  if result.status_code < 200 or result.status_code >= 300:
    webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
        % (result.status_code, result.content))

  soup = BeautifulSoup(result.content)
  attribution = None

  if re.search(r'wikiart.org', url, re.I) or re.search(r'wikipaintings.org', url, re.I):
    attribution = 'wikiart.org'
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.select('h1 span')[0].get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
  elif re.search(r'metmuseum.org', url, re.I):
    attribution = 'metmuseum.org'
    details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find('h2').get_text()
    author = ''
    try:
      author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
    except:
      pass
    author = re.sub(r'\s*\(.*', '', author)
    completion_year_el = None
    try:
      completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
    except:
      pass
    byline = author + ((', ' + completion_year_el) if completion_year_el else '')
    image_url = soup.find('a', class_='download').attrs['href']
  else:
    webapp2.abort(400, message='Unrecognized URL')

  if not title or not author or not image_url:
    webapp2.abort(500, message='Could not parse HTML')

  image_url, thumb_url = maybe_process_image(image_url,
      NO_CROP_TUPLE,
      publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

  # create the artwork entry
  new_artwork = FeaturedArtwork(
      title=title,
      byline=byline,
      attribution=attribution,
      image_url=image_url,
      thumb_url=thumb_url,
      details_url=details_url,
      publish_date=publish_date)
  new_artwork.save()

  return new_artwork
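A handler calling add_art_from_external_details_url would typically parse the epoch-millisecond publishDate parameter into a date first, exactly as the other handlers do, and then serialize the result with artwork_dict (used in Examples #13, #15, and #18). The handler class name and routing below are assumptions; the body mirrors Example #18:

import datetime
import json

import webapp2

class AddExternalArtworkHandler(webapp2.RequestHandler):
  def post(self):
    # publishDate arrives as epoch milliseconds, matching the other handlers.
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    new_artwork = add_art_from_external_details_url(
        publish_date, self.request.get('externalArtworkUrl'))
    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))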
Example #10
0
    def get(self):
        with open(os.path.join(os.path.split(__file__)[0],
                               'lt-artworks.json')) as artworks_file:
            ARTWORKS = json.load(artworks_file)

        # Fetch latest 300 artworks (for blacklisting)
        latest_artworks = (
            FeaturedArtwork.all().order('-publish_date').fetch(300))

        # List dates for which artwork exists
        dates_with_existing_art = set(a.publish_date for a in latest_artworks)

        # List target dates that we want artwork for, but for which no artwork exists
        target_dates = [
            date.today() + timedelta(days=n)
            for n in range(-1, LOOKAHEAD_DAYS)
        ]
        target_dates = [
            d for d in target_dates if d not in dates_with_existing_art
        ]

        # Create a blacklist of keys to avoid repeats
        blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

        self.response.out.write('starting blacklist size: %d<br>' %
                                len(blacklist))

        for target_date in target_dates:
            # Pick from available artworks, excluding artwork in the blacklist
            random_artwork = None
            while True:
                random_artwork = random.choice(ARTWORKS)
                key = artwork_key(random_artwork['detailsUri'])
                if key not in blacklist:
                    # Once chosen, add to the blacklist to avoid repeats within the lookahead
                    blacklist.add(key)
                    break

            target_details_url = str(random_artwork['detailsUri'])
            self.response.out.write(
                '%(date)s: setting to <b>%(url)s</b><br>' %
                dict(url=target_details_url, date=target_date))

            # Store the new artwork
            new_artwork = FeaturedArtwork(
                title=random_artwork['title'],
                byline=random_artwork['byline'],
                attribution=random_artwork['attribution'],
                image_url=random_artwork['imageUri'],
                thumb_url=random_artwork['thumbUri'],
                details_url=random_artwork['detailsUri'],
                publish_date=target_date)
            new_artwork.save()

        # Finish up
        self.response.out.write('done<br>')
Example #11
0
 def post(self):
     artwork_json = json.loads(self.request.get('json'))
     new_artwork = FeaturedArtwork(
         title=artwork_json['title'],
         byline=artwork_json['byline'],
         image_url=artwork_json['imageUri'],
         thumb_url=(artwork_json['thumbUri'] if 'thumbUri' in artwork_json
                    else (artwork_json['imageUri'] + '!BlogSmall.jpg')),
         details_url=artwork_json['detailsUri'],
         publish_date=datetime.datetime.utcfromtimestamp(
             artwork_json['publishDate'] / 1000).date())
     new_artwork.save()
     self.response.set_status(200)
Example #12
0
 def post(self):
   artwork_json = json.loads(self.request.get('json'))
   new_artwork = FeaturedArtwork(
       title=artwork_json['title'],
       byline=artwork_json['byline'],
       image_url=artwork_json['imageUri'],
       thumb_url=(artwork_json['thumbUri'] if 'thumbUri' in artwork_json else None),
       details_url=artwork_json['detailsUri'],
       publish_date=datetime.datetime
           .utcfromtimestamp(artwork_json['publishDate'] / 1000)
           .date())
   new_artwork.save()
   self.response.set_status(200)
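All of these handlers persist FeaturedArtwork entities through the old App Engine db API (all(), filter(), order(), fetch(), save()). The model itself is not part of these excerpts; a plausible sketch based only on the attribute names the handlers set is shown below, with the property types being assumptions:

from google.appengine.ext import db

class FeaturedArtwork(db.Model):
  # Attribute names come from the call sites above; the property types are
  # assumptions (long URLs or bylines might really use TextProperty).
  title = db.StringProperty()
  byline = db.StringProperty()
  attribution = db.StringProperty()
  image_url = db.StringProperty()
  thumb_url = db.StringProperty()
  details_url = db.StringProperty()
  publish_date = db.DateProperty()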
Example #13
0
    def process_html(self, url, html):
        soup = BeautifulSoup(html)

        if re.search(r'wikiart.org', url, re.I):
            details_url = re.sub(r'#.+', '', url,
                                 flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
            title = soup.select('h1 span')[0].get_text()
            author = soup.find(itemprop='author').get_text()
            completion_year_el = soup.find(itemprop='dateCreated')
            byline = author + ((', ' + completion_year_el.get_text())
                               if completion_year_el else '')
            image_url = soup.find(id='paintingImage')['href']
        elif re.search(r'metmuseum.org', url, re.I):
            details_url = re.sub(r'[#?].+', '', url,
                                 flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
            title = soup.find('h2').get_text()
            author = unicode(
                soup.find(text='Artist:').parent.next_sibling).strip()
            author = re.sub(r'\s*\(.*', '', author)
            completion_year_el = unicode(
                soup.find(text='Date:').parent.next_sibling).strip()
            byline = author + (
                (', ' + completion_year_el) if completion_year_el else '')
            image_url = soup.find('a', class_='download').attrs['href']
        else:
            self.response.out.write('Unrecognized URL')
            self.response.set_status(500)
            return

        if not title or not author or not image_url:
            self.response.out.write('Could not parse HTML')
            self.response.set_status(500)
            return

        publish_date = (datetime.datetime.utcfromtimestamp(
            int(self.request.get('publishDate')) / 1000).date())
        image_url, thumb_url = maybe_process_image(
            image_url, NO_CROP_TUPLE,
            publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

        # create the artwork entry
        new_artwork = FeaturedArtwork(title=title,
                                      byline=byline,
                                      image_url=image_url,
                                      thumb_url=thumb_url,
                                      details_url=details_url,
                                      publish_date=publish_date)
        new_artwork.save()

        self.response.set_status(200)
        self.response.out.write(json.dumps(artwork_dict(new_artwork)))
Example #14
0
  def get(self):
    with open(os.path.join(os.path.split(__file__)[0], 'lt-artworks.json')) as artworks_file:
      ARTWORKS = json.load(artworks_file)

    # Fetch latest 300 artworks (for blacklisting)
    latest_artworks = (FeaturedArtwork.all()
        .order('-publish_date')
        .fetch(300))

    # List dates for which artwork exists
    dates_with_existing_art = set(a.publish_date for a in latest_artworks)

    # List target dates that we want artwork for, but for which no artwork exists
    target_dates = [date.today() + timedelta(days=n) for n in range(-1, LOOKAHEAD_DAYS)]
    target_dates = [d for d in target_dates if d not in dates_with_existing_art]

    # Create a blacklist of keys to avoid repeats
    blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

    self.response.out.write('starting blacklist size: %d<br>' % len(blacklist))

    for target_date in target_dates:
      # Pick from available artworks, excluding artwork in the blacklist
      random_artwork = None
      while True:
        random_artwork = random.choice(ARTWORKS)
        key = artwork_key(random_artwork['detailsUri'])
        if key not in blacklist:
          # Once chosen, add to the blacklist to avoid repeats within the lookahead
          blacklist.add(key)
          break

      target_details_url = str(random_artwork['detailsUri'])
      self.response.out.write('%(date)s: setting to <b>%(url)s</b><br>' % dict(url=target_details_url, date=target_date))

      # Store the new artwork
      new_artwork = FeaturedArtwork(
          title=random_artwork['title'],
          byline=random_artwork['byline'],
          attribution=random_artwork['attribution'],
          image_url=random_artwork['imageUri'],
          thumb_url=random_artwork['thumbUri'],
          details_url=random_artwork['detailsUri'],
          publish_date=target_date)
      new_artwork.save()

    # Finish up
    self.response.out.write('done<br>')
Example #15
0
  def process_html(self, url, html):
    soup = BeautifulSoup(html)

    if re.search(r'wikiart.org', url, re.I):
      details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.select('h1 span')[0].get_text()
      author = soup.find(itemprop='author').get_text()
      completion_year_el = soup.find(itemprop='dateCreated')
      byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
      image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
      details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.find('h2').get_text()
      author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
      author = re.sub(r'\s*\(.*', '', author)
      completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
      byline = author + ((', ' + completion_year_el) if completion_year_el else '')
      image_url = soup.find('a', class_='download').attrs['href']
    else:
      self.response.out.write('Unrecognized URL')
      self.response.set_status(500)
      return

    if not title or not author or not image_url:
      self.response.out.write('Could not parse HTML')
      self.response.set_status(500)
      return

    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()

    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
Example #16
0
 def post(self):
     artwork_json = json.loads(self.request.get('json'))
     new_image_url, new_thumb_url = maybe_process_image(
         artwork_json['imageUri'],
         artwork_json['title'] + ' ' + artwork_json['byline'])
     if not new_thumb_url and 'thumbUri' in artwork_json:
         new_thumb_url = artwork_json['thumbUri']
     new_artwork = FeaturedArtwork(
         title=artwork_json['title'],
         byline=artwork_json['byline'],
         image_url=new_image_url,
         thumb_url=new_thumb_url,
         details_url=artwork_json['detailsUri'],
         publish_date=datetime.datetime.utcfromtimestamp(
             artwork_json['publishDate'] / 1000).date())
     new_artwork.save()
     self.response.set_status(200)
Example #17
0
    def post(self):
        artwork_json = json.loads(self.request.get("json"))
        crop_tuple = tuple(float(x) for x in json.loads(self.request.get("crop")))
        publish_date = datetime.datetime.utcfromtimestamp(artwork_json["publishDate"] / 1000).date()

        new_image_url, new_thumb_url = maybe_process_image(
            artwork_json["imageUri"],
            crop_tuple,
            publish_date.strftime("%Y%m%d") + " " + artwork_json["title"] + " " + artwork_json["byline"],
        )

        if not new_thumb_url and "thumbUri" in artwork_json:
            new_thumb_url = artwork_json["thumbUri"]
        new_artwork = FeaturedArtwork(
            title=artwork_json["title"],
            byline=artwork_json["byline"],
            image_url=new_image_url,
            thumb_url=new_thumb_url,
            details_url=artwork_json["detailsUri"],
            publish_date=publish_date,
        )
        new_artwork.save()
        self.response.set_status(200)
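The post handlers above expect two form fields: json, a JSON object with camelCase keys, and crop, a JSON array of floats. A hypothetical payload that Example #17 would accept is sketched below; the values and the crop-rectangle semantics are assumptions:

import json

# Hypothetical example values; 'publishDate' is epoch milliseconds (UTC).
form_fields = {
    'json': json.dumps({
        'title': 'Example Title',
        'byline': 'Example Artist, 1889',
        'imageUri': 'http://example.com/image.jpg',
        'thumbUri': 'http://example.com/thumb.jpg',
        'detailsUri': 'http://example.com/details',
        'publishDate': 1388534400000,  # 2014-01-01 00:00:00 UTC
    }),
    'crop': json.dumps([0.0, 0.0, 1.0, 1.0]),  # assumed crop rectangle as fractions
}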
Example #18
0
  def post(self):
    publish_date = (datetime.datetime
        .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
        .date())
    if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
      webapp2.abort(409, message='Artwork already exists for this date.')

    url = self.request.get('externalArtworkUrl')
    result = urlfetch.fetch(url)
    if result.status_code < 200 or result.status_code >= 300:
      webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
          % (result.status_code, result.content))

    soup = BeautifulSoup(result.content)
    attribution = None

    if re.search(r'wikiart.org', url, re.I):
      attribution = 'wikiart.org'
      details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.select('h1 span')[0].get_text()
      author = soup.find(itemprop='author').get_text()
      completion_year_el = soup.find(itemprop='dateCreated')
      byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
      image_url = soup.find(id='paintingImage')['href']
    elif re.search(r'metmuseum.org', url, re.I):
      attribution = 'metmuseum.org'
      details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
      title = soup.find('h2').get_text()
      author = ''
      try:
        author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
      except:
        pass
      author = re.sub(r'\s*\(.*', '', author)
      completion_year_el = None
      try:
        completion_year_el = unicode(soup.find(text='Date:').parent.next_sibling).strip()
      except:
        pass
      byline = author + ((', ' + completion_year_el) if completion_year_el else '')
      image_url = soup.find('a', class_='download').attrs['href']
    else:
      webapp2.abort(400, message='Unrecognized URL')

    if not title or not author or not image_url:
      webapp2.abort(500, message='Could not parse HTML')

    image_url, thumb_url = maybe_process_image(image_url,
        NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        attribution=attribution,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()

    self.response.set_status(200)
    self.response.out.write(json.dumps(artwork_dict(new_artwork)))
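Examples #13, #15, and #18 serialize the saved entity with artwork_dict, which is not shown in these snippets. A hypothetical sketch that simply mirrors the camelCase keys used in the incoming JSON would be:

def artwork_dict(artwork):
  # Hypothetical: field names mirror the incoming JSON keys; the actual wire
  # format of publishDate is unknown, so ISO-8601 is an assumption here.
  return {
      'title': artwork.title,
      'byline': artwork.byline,
      'attribution': artwork.attribution,
      'imageUri': artwork.image_url,
      'thumbUri': artwork.thumb_url,
      'detailsUri': artwork.details_url,
      'publishDate': artwork.publish_date.isoformat(),
  }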
Example #19
0
def add_art_from_external_details_url(publish_date, url):
  if FeaturedArtwork.all().filter('publish_date =', publish_date).get() is not None:
    webapp2.abort(409, message='Artwork already exists for this date.')

  result = urlfetch.fetch(url)
  if result.status_code < 200 or result.status_code >= 300:
    webapp2.abort(400, message='Error processing URL: HTTP %d. Content: %s'
        % (result.status_code, result.content))

  soup = BeautifulSoup(result.content, 'html.parser')
  attribution = None

  if re.search(r'wikiart.org', url, re.I) or re.search(r'wikipaintings.org', url, re.I):
    attribution = 'wikiart.org'
    details_url = re.sub(r'#.+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find('h1').get_text()
    author = soup.find('a', class_='artist-name').get_text()
    completion_year = None
    try:
      completion_year = unicode(soup.find(text='Date:').parent.next_sibling).strip()
    except:
      pass
    byline = author + ((', ' + completion_year) if completion_year else '')
    image_url = get_wikiart_image_url(soup)
  elif re.search(r'metmuseum.org', url, re.I):
    attribution = 'metmuseum.org'
    details_url = re.sub(r'[#?].+', '', url, flags=re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find('h2').get_text()
    author = ''
    try:
      author = unicode(soup.find(text='Artist:').parent.next_sibling).strip()
    except:
      pass
    author = re.sub(r'\s*\(.*', '', author)
    completion_year = None
    try:
      completion_year = unicode(soup.find(text='Date:').parent.next_sibling).strip()
    except:
      pass
    byline = author + ((', ' + completion_year) if completion_year else '')
    image_url = soup.find('a', class_='download').attrs['href']
  else:
    webapp2.abort(400, message='Unrecognized URL')

  if not title or not author or not image_url:
    webapp2.abort(500, message='Could not parse HTML')

  image_url, thumb_url = maybe_process_image(image_url,
      NO_CROP_TUPLE,
      publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)

  # create the artwork entry
  new_artwork = FeaturedArtwork(
      title=title.strip(),
      byline=byline.strip(),
      attribution=attribution,
      image_url=image_url,
      thumb_url=thumb_url,
      details_url=details_url,
      publish_date=publish_date)
  new_artwork.save()

  return new_artwork
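Example #19 delegates WikiArt image extraction to get_wikiart_image_url(soup), which is not included here. A hypothetical sketch that covers both the older paintingImage link used in the other examples and an og:image meta fallback could be:

def get_wikiart_image_url(soup):
  # Hypothetical: prefer the explicit paintingImage link used by older
  # WikiArt markup, then fall back to the og:image meta tag.
  link = soup.find(id='paintingImage')
  if link and link.get('href'):
    return link['href']
  meta = soup.find('meta', attrs={'property': 'og:image'})
  return meta['content'] if meta else None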
Example #20
0
    def get(self):
        with open(os.path.join(os.path.split(__file__)[0],
                               'lt-artworks.json')) as artworks_file:
            ARTWORKS = json.load(artworks_file)

        # ARTWORKS = filter(lambda a: '_stars' in a and a['_stars'] >= 1, ARTWORKS)

        # Fetch latest 300 artworks (for blacklisting)
        latest_artworks = (
            FeaturedArtwork.all().order('-publish_date').fetch(300))

        # List dates for which artwork exists
        dates_with_existing_art = set(a.publish_date for a in latest_artworks)

        # List target dates that we want artwork for, but for which no artwork exists
        target_dates = [
            date.today() + timedelta(days=n)
            for n in range(-1, LOOKAHEAD_DAYS)
        ]
        target_dates = [
            d for d in target_dates if d not in dates_with_existing_art
        ]

        # Create a blacklist of keys to avoid repeats
        blacklist = set(artwork_key(a.details_url) for a in latest_artworks)

        logging.debug('starting blacklist size: %d' % len(blacklist))

        chosen_artworks = []

        for target_date in target_dates:
            # Pick from available artworks, excluding artwork in the blacklist
            random_artwork = None
            while True:
                if len(ARTWORKS) == 0:
                    logging.error(
                        'Ran out of artworks to choose from, cannot continue')
                    return

                random_artwork = random.choice(ARTWORKS)
                key = artwork_key(random_artwork['detailsUri'])
                if key not in blacklist:
                    # Once chosen, remove it from the list of artworks to choose next
                    ARTWORKS.remove(random_artwork)
                    chosen_artworks.append(random_artwork)
                    break

            target_details_url = str(random_artwork['detailsUri'])
            logging.debug('%(date)s: setting to %(url)s' %
                          dict(url=target_details_url, date=target_date))

            # Store the new artwork
            if self.request.get('dry-run', '') != 'true':
                new_artwork = FeaturedArtwork(
                    title=random_artwork['title'],
                    byline=random_artwork['byline'],
                    attribution=random_artwork['attribution'],
                    image_url=random_artwork['imageUri'],
                    thumb_url=random_artwork['thumbUri'],
                    details_url=random_artwork['detailsUri'],
                    publish_date=target_date)
                new_artwork.save()

        if self.request.get('output', '') == 'html':
            self.response.out.write(
                get_html(artworks_json=json.dumps(chosen_artworks)))

        # Finish up
        logging.debug('done')