def run(self):
    """Download every episode of the podcast feed into ``self.out_dir``.

    Parses the RSS/Atom feed at ``self.feed_url`` with ``feedparser`` and, for
    each entry, saves the linked file as ``"<YYYY_MM_DD> <sanitized title><ext>"``
    (date in local time).  Entries whose target file already exists are
    skipped.  Downloads go to a ``.tmp`` path first and are renamed into place
    on success, so an interrupted download never masquerades as a finished one.
    """
    _LOG.info("Find podcasts from %s", self.feed_url)
    parsed = feedparser.parse(self.feed_url)
    entries = parsed['entries']
    num_entries = len(entries)
    for i, entry in enumerate(entries):
        published = entry['published']
        link = entry['link']
        title = entry['title']
        _LOG.info(
            "(%s/%s) Download podcast %s from %s (published at: %s)",
            i + 1,
            num_entries,
            title,
            link,
            published,
        )
        # Feed timestamps are parsed leniently, then converted to the local
        # zone so the filename date matches what the user sees locally.
        dt = dateutil_parser.parse(published)
        local_dt = dt.astimezone(tzlocal.get_localzone())
        # NOTE(review): splitext on the raw URL keeps any query string inside
        # the "extension"; assumes feed links are plain file URLs — confirm.
        _, ext = os.path.splitext(link)
        # Replace filename-unsafe characters with spaces, collapse runs of
        # spaces, and strip the ends — the original did not strip, so a title
        # ending in a sanitized character produced "... Title .mp3" with a
        # stray trailing space before the extension.
        safe_title = re.sub(
            ' +', ' ', re.sub('[^-_a-zA-Z0-9()]', ' ', title),
        ).strip()
        new_filename = "{date} {title}{ext}".format(
            date=local_dt.strftime("%Y_%m_%d"),
            title=safe_title,
            ext=ext,
        )
        new_filepath = os.path.join(self.out_dir, new_filename)
        # Check for an existing file BEFORE logging the download destination;
        # the original logged "download to file: [...]" even for entries it
        # then skipped, which was misleading.
        if os.path.exists(new_filepath):
            _LOG.info("[%s] already exists, skipped", new_filepath)
            continue
        tmp_new_filepath = new_filepath + ".tmp"
        _LOG.info(
            "Published date in local time: %s, download to file: [%s]",
            local_dt.strftime("%Y/%m/%d %H:%M:%S"),
            tmp_new_filepath,
        )
        webutil.download_file(link, tmp_new_filepath)
        _LOG.info("Rename [%s] to [%s]", tmp_new_filepath, new_filepath)
        os.rename(tmp_new_filepath, new_filepath)
def run(self):
    """Download every link on ``self.url`` whose URL matches ``self.regex``.

    Fetches the page, makes all links absolute, filters them through
    ``self.regex``, and downloads each match into ``self.out_dir`` under its
    URL-decoded basename.  Existing files are skipped; when ``self.dry_run``
    is true the matches are only logged.  Downloads go to a ``.tmp`` path and
    are renamed into place on success.
    """
    _LOG.info("Fetch links from url: %s", self.url)
    header = {
        # most common user-agent
        # NOTE(review): the parentheses appear mangled into semicolons
        # ("Mozilla/5.0 ;Windows NT 6.1; ..."); servers still accept it,
        # but confirm this string is intentional.
        'user-agent': 'Mozilla/5.0 ;Windows NT 6.1; WOW64; Trident/7.0; rv:11.0; like Gecko',
    }
    # NOTE(review): no timeout= here, so a stalled server hangs this call
    # forever — consider requests.get(..., timeout=...) in a follow-up.
    page = requests.get(self.url, headers=header)
    tree = html.fromstring(page.text)
    tree.make_links_absolute(self.url, resolve_base_href=True)
    # Collect (element, link) pairs whose URL matches the filter regex.
    elements_links = []
    for element, attribute, link, pos in tree.iterlinks():
        if self.regex.search(link):
            elements_links.append((element, link))
        else:
            _LOG.debug("Skip [%s]", link)
    num_links = len(elements_links)
    for i, (element, link) in enumerate(elements_links):
        # elements_links was already filtered by self.regex above; the
        # original re-tested each link here, a redundant always-true check.
        _LOG.info(
            "(%s/%s) %s (%s)",
            i + 1,
            num_links,
            element.text,
            link,
        )
        new_filepath = os.path.join(
            self.out_dir,
            webutil.unquote_url(os.path.basename(link)),
        )
        if os.path.exists(new_filepath):
            _LOG.info(
                "Skip [%s] since [%s] already exists",
                link,
                new_filepath,
            )
            continue
        _LOG.info("Download to [%s]", new_filepath)
        if not self.dry_run:
            # Download to a temp file, then rename so a partial download
            # never shadows a completed file.
            tmp_filepath = new_filepath + ".tmp"
            webutil.download_file(link, tmp_filepath)
            os.rename(tmp_filepath, new_filepath)