def download_url(self, url, savedirectory, embed="", poster=""):
    connector = http_connector.Connector()
    r_page = connector.reqhandler(url, 1)
    if r_page == '':
        raise IOError("Url not valid or nonexistent")

    # # be sure to have a url as a string and not as a list for savesource.py
    # url_string = connector.check_string_or_list(url)

    # Generate the directory for the source file and the images downloaded.
    # Plus, return savedirectory as basedir + page title, so as to save
    # images on a per-site basis
    # source_saver = savesource.SaveSource(r_page, savedirectory, url_string, creditor=poster)
    source_saver = savesource.SaveSource(r_page, savedirectory, url, creditor=poster)
    savedirectory = source_saver.link_save()

    # Parse the page for images
    parser = ImageHostParser(r_page, 'a', 'href', savedirectory)

    if embed:
        # do we need to search for embedded images then?
        # Note: at the moment it downloads thumbnails too
        print("Searching for embedded images")
        print("")
        embed_links = parser.get_all_links('img', 'src')
        parser.which_host(embed_links, 'src')
def setUp(self):
    self.common_domain = 'http://www.google.com'
    self.brazilian_domain = 'http://www.tam.com.br'
    self.url = 'http://mode.newslicious.net/'
    self.utf8_url = 'http://mode.newslicious.net/2011/07/january-3-amanda-nrgaard-katrin.html'
    self.basedir = '/mnt/d/Maidens/Uploads/'
    self.creditor = 'nirari@celebrityforum'
    self.utf8_title = u'prover\xc3\xb2 a \xc3\xa6 in \xc3\xa5 for \xc3\x97'
    self.title_with_nonaccepted_chars = 'Try with [some] NOT accepted chars \' /\ '
    self.nonaccepted_chars = '\/\''
    self.ss = savesource.SaveSource(self.url, self.basedir, self.creditor)
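# Harness sketch (an assumption: the TestCase name and the sample assertion
# below are hypothetical; only setUp comes from the source, and it presumably
# lives inside a unittest.TestCase subclass along these lines).
import unittest

class SaveSourceTestCase(unittest.TestCase):
    # ... setUp as defined above ...

    def test_savesource_instantiates(self):
        # smoke check: the fixture built a SaveSource instance
        self.assertIsNotNone(self.ss)

if __name__ == '__main__':
    unittest.main()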
import logging
import logging.handlers
from os.path import abspath, expanduser

import http_connector
import savesource
# Note: logfile, logmaxsize, ImageHostParser and embed_download are assumed
# to be provided elsewhere in the package (e.g. its config and parser
# modules); they are not defined in this snippet.


def download_url(url, savedirectory, embed="", poster=""):
    """Main function to parse and download images"""
    # set up logging
    logger = logging.getLogger('pyimagedownloader')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(levelname)s:%(asctime)s:%(process)d:%(message)s')
    # truncate the log file when it gets bigger than logmaxsize
    rh = logging.handlers.RotatingFileHandler(abspath(expanduser(logfile)),
                                              maxBytes=logmaxsize,
                                              backupCount=5)
    rh.setFormatter(formatter)
    logger.addHandler(rh)
    logger.info("Downloading from %s" % url[0])

    connector = http_connector.Connector()
    r_page = connector.reqhandler(url, 1)
    if r_page == '':
        raise IOError("Url not valid or nonexistent")

    # be sure to have a url as a string and not as a list for savesource.py
    url_string = connector.check_string_or_list(url)

    # Generate the directory for the source file and the images downloaded.
    # Plus, return savedirectory as basedir + page title, so as to save
    # images on a per-site basis
    source_saver = savesource.SaveSource(r_page, savedirectory, url_string, creditor=poster)
    savedirectory = source_saver.link_save()

    # Parse the page for images
    parser = ImageHostParser(r_page, 'a', 'href', savedirectory)

    if embed:
        # do we need to search for embedded images then?
        # Note: at the moment it downloads thumbnails too
        embed_download(parser)
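# Usage sketch (an assumption, not from the original sources): the
# module-level download_url above logs url[0], so the url argument is
# presumably passed as a list. Every value below is hypothetical.
if __name__ == '__main__':
    urls = ['http://mode.newslicious.net/']     # hypothetical target page
    savedir = '/tmp/pyimagedownloader/'         # hypothetical base directory
    # A truthy embed also scans <img src=...> tags (thumbnails included);
    # poster is forwarded to SaveSource as the creditor.
    download_url(urls, savedir, embed="yes", poster='nirari@celebrityforum')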
def download_url(self, url, savedirectory, embed="", poster=""):
    connector = http_connector.Connector()
    r_page = connector.reqhandler(url, 1)

    # Generate the directory for the source file and the images downloaded.
    # Plus, return savedirectory as basedir + page title, so as to save
    # images on a per-site basis
    source_saver = savesource.SaveSource(r_page, savedirectory, url, creditor=poster)
    savedirectory = source_saver.link_save()

    # Parse the page for images
    parser = ImageHostParser(r_page, 'a', 'href', savedirectory)

    if embed:
        # do we need to search for embedded images then?
        # Note: at the moment it downloads thumbnails too
        print("Searching for embedded images")
        print("")
        embed_links = parser.get_all_links('img', 'src')
        parser.which_host(embed_links, 'src')