def download_url(self, url, savedirectory, embed="", poster=""): connector = http_connector.Connector() r_page = connector.reqhandler(url, 1) if r_page == '': raise IOError("Url not valid or nonexistent") # # be sure to have a url as a string and not as a list for savesource.py # url_string = connector.check_string_or_list(url) # Generate the directory for the source file and the images downloaded # Plus, return savedirectory as basedir + page title, so to save images # on a per-site basis # source_saver = savesource.SaveSource(r_page, savedirectory, url_string, creditor=poster) source_saver = savesource.SaveSource(r_page, savedirectory, url, creditor=poster) savedirectory = source_saver.link_save() # Parse the page for images parser = ImageHostParser(r_page, 'a', 'href', savedirectory) if embed: # do we need to search for embedded images then? # Note: at the moment it downloads thumbnails too print("Searching for embedded images") print("") embed_links = parser.get_all_links('img', 'src') parser.which_host(embed_links, 'src')
def __init__(self, link, basedir):
    self.link = link
    self.basedir = basedir
    self.connector = http_connector.Connector()
    self.logger = logging.getLogger('pyimagedownloader')
    # we used to have two different functions for imagevenue images coming
    # from paid hosts like usercash; that is no longer needed, so
    # imagevenue_embed is just another name for parse()
    self.imagevenue_embed = self.parse()
def test_get_page_title(self):
    """test for extraction of webpage's title as a string or as unicode
    (for sites containing unicode characters in the title)"""
    connector = http_connector.Connector()
    response = connector.reqhandler(self.url, 1)
    response_utf8 = connector.reqhandler(self.utf8_url, 1)
    title = self.ss.get_page_title(response)
    title_utf8 = self.ss.get_page_title(response_utf8)
    self.assertIsInstance(title, str)
    self.assertIsInstance(title_utf8, unicode)
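# The test above only checks the type of the returned title; the extraction
# itself is not shown in this section. A minimal sketch of what such a
# get_page_title() helper could look like, assuming lxml is available and
# the downloaded page source is passed in as a string (the project's real
# SaveSource.get_page_title may be implemented differently):
import lxml.html

def get_page_title_sketch(page_source):
    # parse the downloaded page and pull out the <title> text
    tree = lxml.html.fromstring(page_source)
    title = tree.findtext('.//title')
    # under Python 2, lxml typically returns a plain str for ASCII-only
    # text and unicode otherwise, which matches the two assertions above
    return title.strip() if title else title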
def test_bellazon_save_image(self):
    connector = http_connector.Connector()
    self.bz.bellazon_save_image(self.image_url)
    # get the filename to save on disk
    savefile = join(
        self.basedir,
        str(connector.get_filename(self.image_url, 'attach_id=')))
    # has the file been downloaded?
    self.assertTrue(isfile(savefile))
    # check that file is bigger than 1K
    self.assertTrue(getsize(savefile) >= 1000)
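# The test above relies on Connector.get_filename() to derive the on-disk
# name from the image URL. A minimal sketch of such a helper, assuming the
# name is simply whatever follows the given marker in the URL; the real
# http_connector implementation may differ, and get_filename_sketch is a
# hypothetical stand-in name:
def get_filename_sketch(url, marker=''):
    # take the last path component of the URL...
    name = url.rsplit('/', 1)[-1]
    # ...and, when a marker such as 'attach_id=' is given, keep only what
    # follows it (e.g. the numeric attachment id on bellazon links)
    if marker and marker in name:
        name = name.split(marker, 1)[-1]
    return name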
def setUp(self):
    self.urllist = [
        'www.google.com',
        'www.repubblica.it',
        'www.corriere.it'
    ]
    self.url = 'http://fashionography.net/'
    self.image_url = 'http://4.bp.blogspot.com/-xi8CKQeMa9U/TiB5xL4pP6I/AAAAAAAAQek/VHabxbfyKlc/s400/Sophie%2BHolmes%2Bby%2BPasquale%2BAbbattista%2B%2528Jet-Set%2BChic%2B-%2BElle%2BGermany%2BJune%2B2011%2529.jpg'
    self.referer = 'http://twitter.com/#!/fshngrphy'
    self.values = {}
    self.user_agent = user_agent
    self.headers = {
        'User-Agent': self.user_agent,
        'Connection': 'Keep-Alive'
    }
    self.connector = http_connector.Connector()
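# The fixtures above (headers, values, referer) mirror what a request
# helper needs. A minimal sketch of how such a request could be built with
# urllib2, assuming a Python 2 environment as suggested by the unicode
# assertions elsewhere in the tests; the real Connector.reqhandler may work
# differently, and build_request_sketch is a hypothetical name:
import urllib
import urllib2

def build_request_sketch(url, headers, values=None, referer=None):
    # encode POST data only when form values are supplied
    data = urllib.urlencode(values) if values else None
    req = urllib2.Request(url, data, headers)
    if referer:
        # some image hosts refuse hotlinked downloads without a referer
        req.add_header('Referer', referer)
    return req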
def download_url(url, savedirectory, embed="", poster=""): """Main function to parse and download images""" # set up logging logger = logging.getLogger('pyimagedownloader') logger.setLevel(logging.INFO) formatter = logging.Formatter( '%(levelname)s:%(asctime)s:%(process)d:%(message)s') # trunk log file when it gets bigger than logmaxsize rh = logging.handlers.RotatingFileHandler(abspath(expanduser(logfile)), maxBytes=logmaxsize, backupCount=5) rh.setFormatter(formatter) logger.addHandler(rh) logger.info("Downloading from %s" % url[0]) connector = http_connector.Connector() r_page = connector.reqhandler(url, 1) if r_page == '': raise IOError("Url not valid or nonexistent") # be sure to have a url as a string and not as a list for savesource.py url_string = connector.check_string_or_list(url) # Generate the directory for the source file and the images downloaded # Plus, return savedirectory as basedir + page title, so to save images # on a per-site basis source_saver = savesource.SaveSource(r_page, savedirectory, url_string, creditor=poster) savedirectory = source_saver.link_save() # Parse the page for images parser = ImageHostParser(r_page, 'a', 'href', savedirectory) if embed: # do we need to search for embedded images then? # Note: at the moment it downloads thumbnails too embed_download(parser)
def download_url(self, url, savedirectory, embed="", poster=""): connector = http_connector.Connector() r_page = connector.reqhandler(url, 1) # Generate the directory for the source file and the images downloaded # Plus, return savedirectory as basedir + page title, so to save images # on a per-site basis source_saver = savesource.SaveSource(r_page, savedirectory, url, creditor=poster) savedirectory = source_saver.link_save() # Parse the page for images parser = ImageHostParser(r_page, 'a', 'href', savedirectory) if embed: # do we need to search for embedded images then? # Note: at the moment it downloads thumbnails too print("Searching for embedded images") print("") embed_links = parser.get_all_links('img', 'src') parser.which_host(embed_links, 'src')
def __init__(self, link, basedir):
    self.link = link
    self.basedir = basedir
    self.connector = http_connector.Connector()
    self.logger = logging.getLogger('pyimagedownloader')