Exemple #1
0
    def download_url(self, url, savedirectory, embed="", poster=""):
        """Fetch a page, save its source and optionally grab embedded images.

        The page is requested through ``http_connector.Connector``, handed to
        ``savesource.SaveSource`` and then to ``ImageHostParser``.

        :param url: page address, passed unchanged to ``reqhandler`` and
            ``SaveSource``
        :param savedirectory: base directory given to ``SaveSource``
        :param embed: when truthy, ``img``/``src`` links are collected and
            passed to ``which_host``
        :param poster: forwarded to ``SaveSource`` as ``creditor``
        :raises IOError: if the request comes back as an empty string
        """
        page = http_connector.Connector().reqhandler(url, 1)
        if page == '':
            raise IOError("Url not valid or nonexistent")

        # link_save() returns the directory to use from here on (per-site:
        # base directory plus page title), so images land next to the source
        saver = savesource.SaveSource(page,
                                      savedirectory,
                                      url,
                                      creditor=poster)
        site_directory = saver.link_save()

        # The parser is always built, even when no embedded search follows
        link_parser = ImageHostParser(page, 'a', 'href', site_directory)
        if not embed:
            return

        # Embedded images were requested; thumbnails are currently fetched too
        print("Searching for embedded images")
        print("")
        image_sources = link_parser.get_all_links('img', 'src')
        link_parser.which_host(image_sources, 'src')
 def __init__(self, link, basedir):
     """Store the link and base directory, create helpers, then run parse().

     NOTE(review): ``self.parse()`` is *called* here, so ``imagevenue_embed``
     holds its return value rather than a reference to the method -- confirm
     that this eager call at construction time is intended.
     """
     self.link, self.basedir = link, basedir
     self.connector = http_connector.Connector()
     self.logger = logging.getLogger('pyimagedownloader')
     # A separate function used to exist for imagevenue images served through
     # paid hosts such as usercash. It is no longer needed, so
     # imagevenue_embed only survives as a name tied to parse().
     self.imagevenue_embed = self.parse()
 def test_get_page_title(self):
     """Check the type of the extracted page title.

     A title is expected as ``str``; for the page behind ``self.utf8_url``
     (unicode characters in the title) it is expected as ``unicode``.
     """
     connector = http_connector.Connector()
     # fetch both pages first, then extract both titles
     pages = [connector.reqhandler(address, 1)
              for address in (self.url, self.utf8_url)]
     plain_title, utf8_title = [self.ss.get_page_title(page)
                                for page in pages]
     self.assertIsInstance(plain_title, str)
     self.assertIsInstance(utf8_title, unicode)
Exemple #4
0
 def test_bellazon_save_image(self):
     """Saving ``self.image_url`` must leave a file of >= 1000 bytes on disk."""
     connector = http_connector.Connector()
     self.bz.bellazon_save_image(self.image_url)
     # rebuild the path the image is expected to have been written to
     filename = str(connector.get_filename(self.image_url, 'attach_id='))
     expected_path = join(self.basedir, filename)
     # the file must exist ...
     self.assertTrue(isfile(expected_path))
     # ... and must weigh at least 1000 bytes
     self.assertTrue(getsize(expected_path) >= 1000)
 def setUp(self):
     """Prepare the URLs, request headers and connector shared by the tests."""
     # addresses used by the tests
     self.url = 'http://fashionography.net/'
     self.urllist = ['www.google.com', 'www.repubblica.it', 'www.corriere.it']
     self.image_url = 'http://4.bp.blogspot.com/-xi8CKQeMa9U/TiB5xL4pP6I/AAAAAAAAQek/VHabxbfyKlc/s400/Sophie%2BHolmes%2Bby%2BPasquale%2BAbbattista%2B%2528Jet-Set%2BChic%2B-%2BElle%2BGermany%2BJune%2B2011%2529.jpg'
     self.referer = 'http://twitter.com/#!/fshngrphy'

     # request parameters
     self.values = dict()
     self.user_agent = user_agent
     request_headers = {'User-Agent': self.user_agent}
     request_headers['Connection'] = 'Keep-Alive'
     self.headers = request_headers

     self.connector = http_connector.Connector()
def download_url(url, savedirectory, embed="", poster=""):
    """Main function to parse and download images.

    Sets up the rotating log file, fetches the page, saves its source through
    ``savesource.SaveSource`` and builds an ``ImageHostParser`` for it.

    :param url: page address, either a string or a list whose first item is
        the address to report in the log
    :param savedirectory: base directory given to ``SaveSource``
    :param embed: when truthy, ``embed_download`` is run on the parser
    :param poster: forwarded to ``SaveSource`` as ``creditor``
    :raises IOError: if the request comes back as an empty string
    """

    # set up logging
    logger = logging.getLogger('pyimagedownloader')
    logger.setLevel(logging.INFO)
    log_path = abspath(expanduser(logfile))
    # The named logger is shared by the whole process: attaching a new
    # handler on every call would write each record once per previous call
    # and keep an extra file handle open, so attach it only the first time.
    handler_present = any(
        isinstance(handler, logging.handlers.RotatingFileHandler)
        and handler.baseFilename == log_path
        for handler in logger.handlers)
    if not handler_present:
        formatter = logging.Formatter(
            '%(levelname)s:%(asctime)s:%(process)d:%(message)s')
        # truncate the log file when it gets bigger than logmaxsize
        rh = logging.handlers.RotatingFileHandler(log_path,
                                                  maxBytes=logmaxsize,
                                                  backupCount=5)
        rh.setFormatter(formatter)
        logger.addHandler(rh)

    # url may be a plain string as well as a list; indexing a string would
    # log only its first character
    logged_url = url if isinstance(url, str) else url[0]
    logger.info("Downloading from %s", logged_url)

    connector = http_connector.Connector()
    r_page = connector.reqhandler(url, 1)
    if r_page == '':
        raise IOError("Url not valid or nonexistent")

    # be sure to have a url as a string and not as a list for savesource.py
    url_string = connector.check_string_or_list(url)
    # Generate the directory for the source file and the images downloaded
    # Plus, return savedirectory as basedir + page title, so to save images
    # on a per-site basis
    source_saver = savesource.SaveSource(r_page,
                                         savedirectory,
                                         url_string,
                                         creditor=poster)
    savedirectory = source_saver.link_save()

    # Parse the page for images
    parser = ImageHostParser(r_page, 'a', 'href', savedirectory)
    if embed:
        # do we need to search for embedded images then?
        # Note: at the moment it downloads thumbnails too
        embed_download(parser)
    def download_url(self, url, savedirectory, embed="", poster=""):
        """Fetch a page, save its source and optionally grab embedded images.

        :param url: page address, passed unchanged to ``reqhandler`` and
            ``SaveSource``
        :param savedirectory: base directory given to ``SaveSource``
        :param embed: when truthy, ``img``/``src`` links are collected and
            passed to ``which_host``
        :param poster: forwarded to ``SaveSource`` as ``creditor``
        :raises IOError: if the request comes back as an empty string
        """

        connector = http_connector.Connector()
        r_page = connector.reqhandler(url, 1)
        # An empty response means there is no page to save or parse; stop
        # here instead of writing an empty source file and parsing nothing.
        if r_page == '':
            raise IOError("Url not valid or nonexistent")

        # Generate the directory for the source file and the images downloaded
        # Plus, return savedirectory as basedir + page title, so to save images
        # on a per-site basis
        source_saver = savesource.SaveSource(r_page,
                                             savedirectory,
                                             url,
                                             creditor=poster)
        savedirectory = source_saver.link_save()

        # Parse the page for images
        parser = ImageHostParser(r_page, 'a', 'href', savedirectory)
        if embed:
            # do we need to search for embedded images then?
            # Note: at the moment it downloads thumbnails too
            print("Searching for embedded images")
            print("")
            embed_links = parser.get_all_links('img', 'src')
            parser.which_host(embed_links, 'src')
Exemple #8
0
 def __init__(self, link, basedir):
     """Keep the link and base directory; create the connector and logger."""
     self.link = link
     self.basedir = basedir
     self.connector = http_connector.Connector()
     # shared application logger, looked up by name
     self.logger = logging.getLogger('pyimagedownloader')