def get_text(self):
    """Find song lyrics via a Bing web search and cache them in self.conf_lyrics."""
    if self.conf_lyrics_done:
        print 'Lyrics are already done'
        return self.conf_lyrics_done

    bing = BingSearchAPI()
    tags = self.conf_tags
    search = '%s lyrics %s' % (tags['title'], tags['performer'])
    print 'Searching for lyrics. Search string: %s' % search
    lyrics_search = bing.search('web', search.encode('utf-8'),
                                {'$format': 'json'})
    #print 'Lyrics search result: %s' % pformat(lyrics_search)

    for result in lyrics_search.get('d', {}).get('results', [{}])[0].get('Web', []):
        url = result['Url']
        print 'lyrics in %s?' % url
        for match, (good_attr, bad_part) in lyrics_matches.items():
            if match in url:
                # Good! We have a known site with lyrics - let's extract them.
                print 'yes, lyrics are probably here'
                browser = Browser()
                browser.set_handle_robots(False)
                browser.open(url)
                text = browser.response().read()
                soup = BeautifulSoup(text,
                                     convertEntities=BeautifulSoup.HTML_ENTITIES)
                lyrics_el = soup.find(attrs=good_attr)
                if not lyrics_el:
                    #print 'Not found lyrics in %s' % text
                    continue
                #print 'full text: %s' % text
                #print 'Found something like this: %s' % lyrics_el
                parts = list(self.extract_text_parts(lyrics_el.contents, bad_part))
                lyrics = '\n'.join(parts)
                #print 'Found lyrics: \n%s' % lyrics
                print 'Found lyrics: %s...' % lyrics[:150]
                self.conf_lyrics = lyrics
                self.conf_lyrics_done = True
                return self.conf_lyrics_done
        print 'Unsupported lyrics source: %s' % url

    if not self.conf_lyrics_done:
        print 'ERROR: lyrics not found! %s' % self.conf_tags['title']
    return self.conf_lyrics_done
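# NOTE: get_text() relies on a module-level `lyrics_matches` registry that maps
# a URL fragment of a known lyrics site to a pair (attrs passed to soup.find(),
# marker passed to extract_text_parts() for trimming unwanted fragments).
# A minimal sketch of its expected shape, with illustrative entries rather than
# the project's real selectors:
#
#   lyrics_matches = {
#       'example-lyrics.com': ({'class': 'lyricbox'}, 'Submit corrections'),
#       'other-lyrics-site.net': ({'id': 'songLyricsDiv'}, None),
#   }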
def get_pics(self):
    """Collect background images for the track via a Bing image search."""
    if self.conf_pics_done:
        print 'Pics are already done'
        return self.conf_pics_done

    imgdir = self.imgdir
    if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
        self.conf_pics_done = True
        return self.conf_pics_done

    bing = BingSearchAPI()
    tags = self.conf_tags
    search = '%s %s' % (tags['title'], tags['performer'])
    print 'Searching for images. Search string: %s' % search
    img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
    print 'Images: %s' % pformat(img_search)

    # Remember which image URLs have already been fetched for this directory.
    registry = processed_image_urls.setdefault(imgdir, set())
    if not os.path.exists(imgdir):
        os.makedirs(imgdir)

    for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
        if result['MediaUrl'] not in registry:
            browser = Browser()
            browser.set_handle_robots(False)
            registry.add(result['MediaUrl'])
            log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
            try:
                #log.debug('Opening %s' % result['SourceUrl'])
                browser.open(result['SourceUrl'])
                #log.debug('Opening %s' % result['MediaUrl'])
                img = Image.open(browser.open(result['MediaUrl']))
                # Keep only images at least as large as the target video frame.
                if (img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and
                        img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]):
                    print 'Found image: %s' % result['MediaUrl']
                    img.save(os.path.join(imgdir, 'image%03d.png'
                                          % (len(glob.glob1(imgdir, "*.png")) + 1)))
                    if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                        self.conf_pics_done = True
                        break
            except Exception:
                print_exc()

    if len(glob.glob1(imgdir, "*.png")) < REQUIRED_IMAGE_COUNT:
        # Not enough pictures for 'title performer'; retry with the performer alone.
        search = tags['performer']
        print 'Searching for images. Search string: %s' % search
        img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
        for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
            if result['MediaUrl'] not in registry:
                browser = Browser()
                browser.set_handle_robots(False)
                registry.add(result['MediaUrl'])
                log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
                try:
                    #log.debug('Opening %s' % result['SourceUrl'])
                    browser.open(result['SourceUrl'])
                    #log.debug('Opening %s' % result['MediaUrl'])
                    img = Image.open(browser.open(result['MediaUrl']))
                    if (img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and
                            img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]):
                        print 'Found image: %s' % result['MediaUrl']
                        img.save(os.path.join(imgdir, 'image%03d.png'
                                              % (len(glob.glob1(imgdir, "*.png")) + 1)))
                        if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                            self.conf_pics_done = True
                            break
                except Exception:
                    print_exc()

    return self.conf_pics_done
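# NOTE: both methods above expect the following module-level imports and
# settings to exist; the values shown here are illustrative assumptions, not
# the project's actual configuration:
#
#   import glob
#   import os
#   import Image                              # PIL
#   from mechanize import Browser
#   from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3 API
#   from pprint import pformat
#   from traceback import print_exc
#
#   REQUIRED_IMAGE_COUNT = 20                 # minimum slideshow pictures per track
#   DEFAULT_VIDEO_RESOLUTION = (1280, 720)    # (width, height) of the output video
#   processed_image_urls = {}                 # imgdir -> set() of already-fetched URLs
#
#   (plus a configured module-level `log` logger and the BingSearchAPI wrapper)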