Ejemplo n.º 1
0
 def get_text(self):
     """Search the web for the track's lyrics and store them on the instance.

     Builds the query '<title> lyrics <performer>' from ``self.conf_tags``,
     runs a Bing web search and walks the hits in order.  For every hit whose
     URL contains one of the keys of ``lyrics_matches`` the page is
     downloaded, the element selected by the matching ``good_attr`` is looked
     up, and its contents are passed through ``self.extract_text_parts``
     together with ``bad_part``.  The first page that yields such an element
     wins: the joined text is stored in ``self.conf_lyrics`` and
     ``self.conf_lyrics_done`` is set to True.

     Returns ``self.conf_lyrics_done`` (truthy once lyrics are stored).
     """
     if self.conf_lyrics_done:
         print('Lyrics are already done')
         return self.conf_lyrics_done

     # Function-scope import so this method does not depend on what the
     # module header happens to import.
     from traceback import print_exc

     bing = BingSearchAPI()
     tags = self.conf_tags
     search = '%s lyrics %s' % (tags['title'], tags['performer'])
     print('Searching for lyrics. Search string: %s' % search)
     lyrics_search = bing.search('web', search.encode('utf-8'), {'$format': 'json'})

     # `or [{}]` also covers a present-but-empty 'results' list, which a
     # plain .get(..., [{}])[0] would turn into an IndexError.
     result_sets = lyrics_search.get('d', {}).get('results') or [{}]
     for result in result_sets[0].get('Web', []):
         url = result['Url']
         print('lyrics in %s?' % url)
         known_site = False
         for match, (good_attr, bad_part) in lyrics_matches.items():
             if match not in url:
                 continue
             # Good! We have a known site with lyrics - let's extract them.
             known_site = True
             print('yes, lyrics are probably here')
             try:
                 browser = Browser()
                 # Same call as before; presumably disables robots.txt
                 # handling in the browser object -- confirm against Browser.
                 browser.set_handle_robots(None)
                 browser.open(url)
                 text = browser.response().read()
             except Exception:
                 # One unreachable page must not abort the whole search;
                 # report it and move on to the next candidate.
                 print_exc()
                 continue
             soup = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES)
             lyrics_el = soup.find(attrs=good_attr)
             if not lyrics_el:
                 continue
             parts = list(self.extract_text_parts(lyrics_el.contents, bad_part))
             lyrics = '\n'.join(parts)
             print('Found lyrics: %s...' % lyrics[:150])
             self.conf_lyrics = lyrics
             self.conf_lyrics_done = True
             return self.conf_lyrics_done
         if known_site:
             # The site is supported, the page just did not yield lyrics.
             print('Lyrics not found in %s' % url)
         else:
             print('Unsupported lyrics source: %s' % url)

     # Reaching this point means no hit produced lyrics.
     print('ERROR: lyrics not found! %s' % self.conf_tags['title'])
     return self.conf_lyrics_done
Ejemplo n.º 2
0
 def get_pics(self):
     """Download background images for the track into ``self.imgdir``.

     Returns immediately when ``self.conf_pics_done`` is already set or when
     the directory already holds more than ``REQUIRED_IMAGE_COUNT`` PNG
     files.  Otherwise a Bing image search for '<title> <performer>' is run
     and, if fewer than ``REQUIRED_IMAGE_COUNT`` PNGs exist afterwards, a
     second search for the performer alone.

     Returns ``self.conf_pics_done``.
     """
     if self.conf_pics_done:
         print('Pics are already done')
         return self.conf_pics_done
     imgdir = self.imgdir
     if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
         self.conf_pics_done = True
         return self.conf_pics_done

     bing = BingSearchAPI()
     tags = self.conf_tags
     # URLs already attempted for this directory, shared through the
     # processed_image_urls mapping so they are not fetched twice.
     registry = processed_image_urls.setdefault(imgdir, set())
     if not os.path.exists(imgdir):
         os.makedirs(imgdir)

     search = '%s %s' % (tags['title'], tags['performer'])
     # NOTE(review): as in the original, a single image saved by the first
     # search already marks the pictures as done, whereas the fallback search
     # below requires more than REQUIRED_IMAGE_COUNT files. Confirm that this
     # asymmetry is intended.
     if self._save_images_from_search(bing, search, registry, dump_results=True):
         self.conf_pics_done = True

     if len(glob.glob1(imgdir, "*.png")) < REQUIRED_IMAGE_COUNT:
         self._save_images_from_search(bing, tags['performer'], registry)
         if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
             self.conf_pics_done = True
     return self.conf_pics_done

 def _save_images_from_search(self, bing, search, registry, dump_results=False):
     """Run one Bing image search and save each new, large enough hit as PNG.

     ``registry`` is the set of media URLs already attempted; every URL
     tried here is added to it.  Images are written to ``self.imgdir`` as
     ``imageNNN.png`` where NNN is the current PNG count plus one.  The loop
     stops as soon as the directory holds more than ``REQUIRED_IMAGE_COUNT``
     PNG files.  With ``dump_results`` the raw search response is printed.

     Returns the number of images saved by this call.
     """
     imgdir = self.imgdir
     print('Searching for images. Search string: %s' % search)
     img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
     if dump_results:
         print('Images: %s' % pformat(img_search))

     # `or [{}]` guards against an empty 'results' list (IndexError on [0]).
     result_sets = img_search.get('d', {}).get('results') or [{}]
     # Both dimensions are compared element-wise; the first-pass code used to
     # compare the width against the whole tuple, which never matched.
     min_width = DEFAULT_VIDEO_RESOLUTION[0]
     min_height = DEFAULT_VIDEO_RESOLUTION[1]

     saved = 0
     for result in result_sets[0].get('Image', []):
         media_url = result['MediaUrl']
         if media_url in registry:
             continue
         browser = Browser()
         browser.set_handle_robots(None)
         registry.add(media_url)
         log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
         try:
             # The hosting page is opened before the image itself, exactly
             # as the original code did.
             browser.open(result['SourceUrl'])
             img = Image.open(browser.open(media_url))
             if img.size[0] >= min_width and img.size[1] >= min_height:
                 print('Found image: %s' % media_url)
                 # The "+ 1" belongs inside the format argument; the old
                 # form added 1 to the formatted string and raised TypeError.
                 next_index = len(glob.glob1(imgdir, "*.png")) + 1
                 img.save(os.path.join(imgdir, 'image%03d.png' % next_index))
                 saved += 1
                 if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                     break
         except Exception:
             # Best effort: a broken download or undecodable image is logged
             # and skipped. Exception (not a bare except) keeps Ctrl-C and
             # SystemExit working.
             print_exc()
     return saved