def update_playlist(playlist, logger=None, hist_file='history.p'):
    """
    Reads a playlist and attempts to find accurate release data on each song.

    For every song yielded by ``read_playlist(playlist)`` that the history
    does not report as recent, the artist and title are read from the
    song's tag, the first matching release is requested from ``Discogs``,
    and, when one is found, the song is updated and saved.

    Args:
        playlist: Passed straight to ``read_playlist``.
        logger: Optional logger handed to the ``Discogs`` client.
        hist_file: Path given to ``History``. Defaults to ``'history.p'``,
            the value that was previously hard-coded here, so existing
            callers are unaffected.

    Returns:
        None. The work is done through ``song.update_info``, ``song.save``
        and ``hist.store``.
    """
    hist = History(hist_file)
    discogs = Discogs(logger=logger)

    for song in read_playlist(playlist):
        # Skip songs the history already reports as recent.
        if hist.check_recent(song.loc):
            continue

        try:
            artist = song.meta.tag.artist
            songname = song.meta.tag.title
        except AttributeError:
            # No usable tag on this song, so there is nothing to search with.
            continue

        release = discogs.get_first_release(artist, songname)
        if release:
            song.update_info(build_update(songname, release))
            song.save()

        # NOTE(review): the song is recorded whether or not a release was
        # found, presumably so misses are not re-queried on the next run;
        # confirm this is the intended nesting.
        hist.store(song.loc)
class Gears(object): """Gears for scrapers""" def __init__(self, logger=None, hist_file='history.p'): """Can pass in an external logger""" if logger: self.logger = logger else: self.logger = logging.getLogger('temp.log') self.history = History(hist_file, logger=self.logger) def get(self, url, referer=None, agent=None, delay=True, check_hist=True): """ I keep using this pattern to scrape pages """ if not agent: agent = ragent() if delay: rdelay() if check_hist: if self.history.check_recent(url) is not False: return None headers = {'User-Agent': agent, 'referer': referer} self.logger.debug('Making request to %s\nwith headers:%s', url, headers) try: response = requests.get(url, headers=headers) except requests.exceptions.ConnectionError: self.logger.error('ConnectionError', exc_info=True) else: if response.status_code != 200: #pragma: no cover self.logger.error( 'Request != 200: status_code = %s', response.status_code ) self.logger.error(response.text) return response def parse_page(self, url, xpath=None, text=None, suffix='">', referer=None, delay=True, check_hist=True): """ Generalized version of tracklists method """ self.logger.debug('Parsing page %s', url) response = self.get(url, referer=referer, delay=delay, check_hist=check_hist) if not response: return [] if xpath: self.logger.debug('with xpath=%s', xpath) tree = html.fromstring(response.text) elements = tree.xpath(xpath) if elements: # pragma: no cover self.logger.debug('Found %d elements', len(elements)) return elements else: # pragma: no cover self.logger.debug('Found nothing') return [] elif text: # pragma: no cover return self.find_string(response.text, text, suffix) def find_string(self, raw_text, prefix, suffix='">'): """Finds a string from raw HTML text""" self.logger.debug('Finding string between %s\nand\n%s', prefix, suffix) try: idx = raw_text.index(prefix)+len(prefix) found = raw_text[idx:].split(suffix)[0] except (ValueError, IndexError, AttributeError), err: self.logger.error( 'String not found 
due to error: %s', err, exc_info=True ) self.logger.debug('Raw text: %s', raw_text) return None else: