Example 1
0
    def download_url(self, url, savedirectory, embed="", poster=""):
        """Fetch *url*, record its source, and download the images it links.

        Parameters:
            url: page URL to fetch and parse.
            savedirectory: base directory; SaveSource extends it with the
                page title so images are saved on a per-site basis.
            embed: truthy to also search for embedded <img> images
                (currently this downloads thumbnails too).
            poster: credited uploader, forwarded to SaveSource.

        Raises:
            IOError: if the page cannot be retrieved.
        """
        connector = http_connector.Connector()
        r_page = connector.reqhandler(url, 1)
        if r_page == '':
            raise IOError("Url not valid or nonexistent")

        # Generate the directory for the source file and the images
        # downloaded.  link_save() returns savedirectory as basedir +
        # page title, so images are saved on a per-site basis.
        source_saver = savesource.SaveSource(r_page,
                                             savedirectory,
                                             url,
                                             creditor=poster)
        savedirectory = source_saver.link_save()

        # Parse the page for images
        parser = ImageHostParser(r_page, 'a', 'href', savedirectory)
        if embed:
            # do we need to search for embedded images then?
            # Note: at the moment it downloads thumbnails too
            print("Searching for embedded images")
            print("")
            embed_links = parser.get_all_links('img', 'src')
            parser.which_host(embed_links, 'src')
 def setUp(self):
     # Fixture data for the SaveSource tests: sample domains/URLs, a
     # base save directory, a credited uploader, and edge-case titles
     # (UTF-8 bytes, characters not accepted in file names).
     self.common_domain = 'http://www.google.com'
     self.brazilian_domain = 'http://www.tam.com.br'
     self.url = 'http://mode.newslicious.net/'
     self.utf8_url = 'http://mode.newslicious.net/2011/07/january-3-amanda-nrgaard-katrin.html'
     self.basedir = '/mnt/d/Maidens/Uploads/'
     self.creditor = 'nirari@celebrityforum'
     # Title containing UTF-8 byte sequences, to exercise encoding handling.
     self.utf8_title = u'prover\xc3\xb2 a \xc3\xa6 in \xc3\xa5 for \xc3\x97'
     self.title_with_nonaccepted_chars = 'Try with [some] NOT accepted chars \' /\ '
     self.nonaccepted_chars = '\/\''
     # NOTE(review): elsewhere SaveSource is called as
     # SaveSource(page, savedirectory, url, creditor=...); here the third
     # positional argument is self.creditor, landing in the url slot --
     # confirm this is intended.
     self.ss = savesource.SaveSource(self.url, self.basedir, self.creditor)
def download_url(url, savedirectory, embed="", poster=""):
    """Main function to parse and download images.

    Parameters:
        url: page URL (indexable; the first element is logged).
        savedirectory: base directory; SaveSource extends it with the
            page title so images are saved on a per-site basis.
        embed: truthy to also download embedded <img> images.
        poster: credited uploader, forwarded to SaveSource.

    Raises:
        IOError: if the page cannot be retrieved.
    """
    # set up logging
    logger = logging.getLogger('pyimagedownloader')
    logger.setLevel(logging.INFO)
    # Only attach the handler once: adding a new RotatingFileHandler on
    # every call would duplicate every log line on subsequent calls.
    if not logger.handlers:
        formatter = logging.Formatter(
            '%(levelname)s:%(asctime)s:%(process)d:%(message)s')
        # truncate log file when it gets bigger than logmaxsize
        rh = logging.handlers.RotatingFileHandler(abspath(expanduser(logfile)),
                                                  maxBytes=logmaxsize,
                                                  backupCount=5)
        rh.setFormatter(formatter)
        logger.addHandler(rh)

    # Lazy %-args: the message is only formatted if the record is emitted.
    logger.info("Downloading from %s", url[0])

    connector = http_connector.Connector()
    r_page = connector.reqhandler(url, 1)
    if r_page == '':
        raise IOError("Url not valid or nonexistent")

    # be sure to have a url as a string and not as a list for savesource.py
    url_string = connector.check_string_or_list(url)
    # Generate the directory for the source file and the images downloaded
    # Plus, return savedirectory as basedir + page title, so to save images
    # on a per-site basis
    source_saver = savesource.SaveSource(r_page,
                                         savedirectory,
                                         url_string,
                                         creditor=poster)
    savedirectory = source_saver.link_save()

    # Parse the page for images
    parser = ImageHostParser(r_page, 'a', 'href', savedirectory)
    if embed:
        # do we need to search for embedded images then?
        # Note: at the moment it downloads thumbnails too
        embed_download(parser)
Example 4
0
    def download_url(self, url, savedirectory, embed="", poster=""):
        """Fetch *url*, record its source, and download the images it links.

        Parameters:
            url: page URL to fetch and parse.
            savedirectory: base directory; SaveSource extends it with the
                page title so images are saved on a per-site basis.
            embed: truthy to also search for embedded <img> images
                (currently this downloads thumbnails too).
            poster: credited uploader, forwarded to SaveSource.

        Raises:
            IOError: if the page cannot be retrieved.
        """
        connector = http_connector.Connector()
        r_page = connector.reqhandler(url, 1)
        # Guard against an empty fetch, consistent with the other
        # download_url implementations: proceeding with an empty page
        # would only fail later, deeper in SaveSource/the parser.
        if r_page == '':
            raise IOError("Url not valid or nonexistent")

        # Generate the directory for the source file and the images downloaded
        # Plus, return savedirectory as basedir + page title, so to save images
        # on a per-site basis
        source_saver = savesource.SaveSource(r_page,
                                             savedirectory,
                                             url,
                                             creditor=poster)
        savedirectory = source_saver.link_save()

        # Parse the page for images
        parser = ImageHostParser(r_page, 'a', 'href', savedirectory)
        if embed:
            # do we need to search for embedded images then?
            # Note: at the moment it downloads thumbnails too
            print("Searching for embedded images")
            print("")
            embed_links = parser.get_all_links('img', 'src')
            parser.which_host(embed_links, 'src')