def __init__(self, downloaddir, blogName, secretFile, dbdir = 'db', numRetriesPerUrl = 2):
    """Set up storage directories and open the persistent download database.

    Args:
        downloaddir: directory downloaded videos are written to (created if missing).
        blogName: tumblr blog whose likes are downloaded.
        secretFile: path of the file whose first line is the tumblr API key.
        dbdir: directory holding the persistent HardDict state (default 'db').
        numRetriesPerUrl: download attempts per URL before giving up (default 2).
    """
    self.dbdir = dbdir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(self.dbdir, exist_ok=True)
    self.downloaddir = downloaddir
    os.makedirs(self.downloaddir, exist_ok=True)
    # persistent map of already-downloaded URLs (project-local HardDict —
    # presumably a disk-backed dict; TODO confirm semantics)
    self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
    self.numRetriesPerUrl = numRetriesPerUrl
    self.blogName = blogName
    self.secretFile = secretFile
def __init__(self, downloaddir, urlfile, dbdir = 'db', urlLoadInterval = 5, numDownloadThreads = 5, numRetriesPerUrl = 2):
    """Create directories, open the persistent state and reset unfinished work.

    Args:
        downloaddir: directory finished downloads land in (created if missing).
        urlfile: path of the URL source file (stored; usage is in the daemon loop).
        dbdir: directory holding the persistent HardDict files (default 'db').
        urlLoadInterval: poll interval setting (stored, not used here).
        numDownloadThreads: size of the download worker pool.
        numRetriesPerUrl: download attempts per URL before giving up.
    """
    self.dbdir = dbdir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(self.dbdir, exist_ok=True)
    self.downloaddir = downloaddir
    os.makedirs(self.downloaddir, exist_ok=True)
    # finished URL -> local path, guarded by its own lock
    self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
    self.downloadedDBLock = threading.Lock()
    # pending URL -> local path ('' means not started), guarded by its own lock
    self.newDB = HardDict(os.path.join(self.dbdir, "new.dat"))
    self.newDBLock = threading.Lock()
    self.taskQueue = queue.Queue()    # (url, localpath) waiting to be downloaded
    self.finishQueue = queue.Queue()  # (url, localpath) finished, to be committed
    self.urlfile = urlfile
    self.urlLoadInterval = urlLoadInterval
    self.numDownloadThreads = numDownloadThreads
    self.numRetriesPerUrl = numRetriesPerUrl
    self.halt = False
    # clear partially downloaded files; snapshot the keys so we don't
    # mutate the dict while iterating it
    for u in list(self.newDB.keys()):
        self.newDB[u] = ''
    self.FlushDict()
class DownloadLikes:
    """Downloads the videos liked by a tumblr blog.

    Pages through the blog's likes via the tumblr v2 API and downloads each
    post's video into ``downloaddir``. Finished URLs are recorded in a
    persistent HardDict so reruns skip already-downloaded posts.
    """

    def __init__(self, downloaddir, blogName, secretFile, dbdir = 'db', numRetriesPerUrl = 2):
        """Set up storage directories and open the persistent download database.

        Args:
            downloaddir: directory downloaded videos are written to (created if missing).
            blogName: tumblr blog whose likes are downloaded.
            secretFile: path of the file whose first line is the tumblr API key.
            dbdir: directory holding the persistent HardDict state (default 'db').
            numRetriesPerUrl: download attempts per URL before giving up (default 2).
        """
        self.dbdir = dbdir
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(self.dbdir, exist_ok=True)
        self.downloaddir = downloaddir
        os.makedirs(self.downloaddir, exist_ok=True)
        self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
        self.numRetriesPerUrl = numRetriesPerUrl
        self.blogName = blogName
        self.secretFile = secretFile

    def close(self):
        """Close the persistent download database."""
        logging.info("closing...")
        self.downloadedDB.close()

    def FlushDict(self):
        """Flush the persistent download database to disk."""
        self.downloadedDB.flush()

    def MakeLocalPath(self, url):
        """Derive a local .mp4 path from the URL's last path segment.

        NOTE(review): min(len(fname) - 1, 20) always drops the final character
        of names shorter than 22 chars — looks like an off-by-one, but it is
        preserved so existing downloads keep their filenames.
        """
        fname = url.split('/')[-1].strip("\r\n")
        fname = fname[0:min(len(fname) - 1, 20)]
        fname = fname + ".mp4"
        localpath = self.downloaddir + "/" + fname
        return localpath

    def Download(self):
        """Fetch the blog's liked URLs and download every video not yet seen.

        Each URL is retried up to numRetriesPerUrl times; URLs that still fail
        are appended to failed.txt. KeyboardInterrupt/SystemExit propagate.
        """
        try:
            # get liked urls
            liked = self.getLikes()
            for url in liked:
                if url in self.downloadedDB:
                    logging.info("url {} has already been downloaded".format(url))
                    continue
                logging.warning("downloading {}".format(url))
                success = False
                for i in range(self.numRetriesPerUrl):
                    try:
                        self.downloadOneUrl(url, self.MakeLocalPath(url))
                        self.downloadedDB[url] = url
                        success = True
                        break
                    except (KeyboardInterrupt, SystemExit):
                        # never swallow interrupts/exit requests in the retry loop
                        raise
                    except Exception:
                        logging.warning("error occurred on {}th downloading from {}".format(i, url))
                if not success:
                    # append ('a', was 'w'): mode 'w' truncated the file on every
                    # failure, so only the last failed URL was ever kept
                    with open('failed.txt', 'a') as f:
                        f.write(url + '\n')
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            logging.warning("error occurred when loading urls")
            traceback.print_exc()

    def downloadOneUrl(self, url, local_path):
        """Download a url to local_path."""
        tumblr.downloadTumblrVideo(url, local_path)

    def getLikes(self):
        """Return the post URLs of all the blog's likes, paging through the API.

        Raises:
            Exception: if the secret file's first line is empty (no API key).
        """
        base_url = 'https://api.tumblr.com/v2/blog/{}/likes?api_key={}&offset={}'
        header = {}
        with open(self.secretFile, 'r') as secretfile:
            api_key = secretfile.readline().strip('\n')
            if not api_key:
                raise Exception("error opening secret file")
        offset = 0
        liked_urls = []
        while True:
            url = base_url.format(self.blogName, api_key, offset)
            req = urllib.request.Request(url, None, headers=header)
            resp = urllib.request.urlopen(req)
            s = resp.read().decode()
            js = json.loads(s)['response']
            total_count = js['liked_count']
            posts = js['liked_posts']
            urls = [p['post_url'] for p in posts]
            liked_urls = liked_urls + urls
            offset = offset + len(urls)
            logging.warning('got {} likes'.format(offset))
            # stop once everything was seen or the API returns an empty page
            if offset >= total_count or len(urls) == 0:
                break
        return liked_urls
class DownloadDaemon:
    """Long-running download service.

    run() starts numDownloadThreads worker threads draining taskQueue, then
    the calling thread loops in loadNewUrls() feeding the queue. Persistent
    state lives in two HardDicts: downloadedDB (finished URL -> local path)
    and newDB (pending URL -> local path, '' while not yet started), each
    guarded by its own lock.
    NOTE(review): logging.warn and threading.currentThread() are deprecated
    aliases (logging.warn removed in Python 3.13) — consider migrating to
    logging.warning / threading.current_thread().
    """

    def __init__(self, downloaddir, urlfile, dbdir = 'db', urlLoadInterval = 5, numDownloadThreads = 5, numRetriesPerUrl = 2):
        """Create directories, open the persistent state and reset unfinished work.

        downloaddir: directory finished downloads land in (created if missing).
        urlfile: stored only — not referenced elsewhere in this chunk; TODO
            confirm caller usage.
        urlLoadInterval: stored only — the poll loop below sleeps a hard-coded
            120 seconds instead.
        """
        self.dbdir = dbdir
        if (not os.path.exists(self.dbdir)):
            os.makedirs(self.dbdir)
        self.downloaddir = downloaddir
        if (not os.path.exists(self.downloaddir)):
            os.makedirs(self.downloaddir)
        self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
        self.downloadedDBLock = threading.Lock()
        self.newDB = HardDict(os.path.join(self.dbdir, "new.dat")); self.newDBLock = threading.Lock()
        self.taskQueue = queue.Queue()    # (url, localpath) waiting to be downloaded
        self.finishQueue = queue.Queue()  # (url, localpath) finished, to be committed
        self.urlfile = urlfile
        self.urlLoadInterval = urlLoadInterval
        self.numDownloadThreads = numDownloadThreads
        self.numRetriesPerUrl = numRetriesPerUrl
        self.halt = False
        # clear partially downloaded files
        for u,p in self.newDB.items():
            self.newDB[u] = ''
        self.FlushDict()

    def __del__(self):
        """Close both persistent dicts under their locks."""
        logging.info("calling destructor of daemon")
        with self.downloadedDBLock:
            self.downloadedDB.close()
        with self.newDBLock:
            self.newDB.close()

    def FlushDict(self):
        """Flush both persistent dicts to disk."""
        self.newDB.flush()
        self.downloadedDB.flush()

    def MakeLocalPath(self, url):
        """Derive a local .mp4 path from the URL's last path segment.

        NOTE(review): min(len(fname) - 1, 20) always drops the last character
        of short names — looks like an off-by-one; confirm before changing,
        since changing it would rename already-downloaded files.
        """
        fname = url.split('/')[-1].strip("\r\n")
        fname = fname[0:min(len(fname) - 1, 20)]
        fname = fname + ".mp4"
        localpath = self.downloaddir + "/" + fname
        return localpath

    def loadNewUrls(self):
        """Producer loop: requeue pending work, then poll dumplikes.py forever.

        Each cycle runs dumplikes.py (hard-coded Windows Python 2.7 path —
        assumes a c:\\python27 install; TODO confirm), parses 'url: ...' lines
        from its stdout, enqueues unseen URLs, then commits finished downloads
        from finishQueue into downloadedDB and flushes. Sleeps 120s between
        cycles and only returns on an unexpected outer exception.
        """
        try:
            # load existing urls
            with self.newDBLock:
                for u, p in self.newDB.items():
                    localpath = p
                    if (localpath == ''):
                        # make a local path
                        localpath = self.MakeLocalPath(u)
                        self.newDB[u] = localpath
                    self.taskQueue.put((u, localpath))
            while(True):
                try:
                    # dump likes into urlfile
                    proc = subprocess.Popen(["c:\\python27\\python.exe", \
                        os.path.dirname(os.path.realpath(__file__)) + "\\dumplikes.py", \
                        "secret.txt"], stdout=subprocess.PIPE)
                    out, err = proc.communicate()
                    # parse urls
                    for line in out.decode('utf-8').split('\n'):
                        line = line.strip()
                        if (line.startswith('url: ')):
                            url = line[len('url: '):]
                        else:
                            continue
                        url = url.strip("\r\n")
                        # skip anything already downloaded or already queued;
                        # each check holds only its own dict's lock
                        with self.downloadedDBLock:
                            if url in self.downloadedDB:
                                logging.info("url {} has already been downloaded".format(url))
                                continue
                        with self.newDBLock:
                            if url in self.newDB:
                                logging.info("url {} already in newDB".format(url))
                                continue
                            else:
                                logging.info("url {} inserted into newDB".format(url))
                                self.newDB[url] = ''
                        self.taskQueue.put((url, self.MakeLocalPath(url)))
                    # remove finished urls
                    try:
                        while(True):
                            item = self.finishQueue.get(timeout = 1)
                            with self.downloadedDBLock:
                                self.downloadedDB[item[0]] = item[1]
                            with self.newDBLock:
                                self.newDB.pop(item[0])
                    except(queue.Empty):
                        # finish queue drained for this cycle
                        pass
                    self.FlushDict()
                except(KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.warn("error occurred when loading urls")
                    traceback.print_exc()
                time.sleep(120)
        except:
            logging.warn("exception occurred in load url thread " + threading.currentThread().getName())
            traceback.print_exc()
        logging.warn("thread exiting... " + threading.currentThread().getName())

    def downloadOneUrl(self, url, local_path):
        """Download a url to local_path.

        Results under 10 KiB are treated as probable downloader error pages:
        the url is appended to errorurls.txt, the file is deleted, and
        IOError is raised so the caller's retry loop can try again.
        """
        tumblr.downloadTumblrVideo(url, local_path)
        if (os.stat(local_path).st_size < 10*1024):
            with open('errorurls.txt', 'a+') as f:
                f.write(url + u'\n')
            os.remove(local_path)
            raise IOError("file size too small, probably dredown error")

    def taskThread(self):
        """Worker loop: take (url, localpath) items off taskQueue, download
        with up to numRetriesPerUrl attempts, and report successes on
        finishQueue. Runs forever (started as a daemon thread by run())."""
        logging.info("thread " + threading.currentThread().getName() + " started as download thread")
        while(True):
            try:
                item = self.taskQueue.get()
                url = item[0]
                localpath = item[1]
                if (localpath == ''):
                    # fall back to a timestamp-based filename
                    localpath = str(time.time()) + '.mp4'
                # start download
                logging.info("downloading {} to {}".format(url, localpath))
                success = False
                for i in range(0, self.numRetriesPerUrl):
                    try:
                        self.downloadOneUrl(url, localpath)
                        success = True
                        break
                    except(KeyboardInterrupt, SystemExit):
                        raise
                    except:
                        logging.warn("download of {0} failed on the {1} try".format(url, i))
                        traceback.print_exc()
                if (success):
                    logging.info("successfully downloaded {} to {}".format(url, localpath))
                    self.finishQueue.put((url, localpath))
                else:
                    logging.warn("failed when downloading {} to {}".format(url, localpath))
            except:
                logging.warn("exception occurred in thread "+ threading.currentThread().getName())
                traceback.print_exc()
        # unreachable: the while(True) loop above never breaks
        logging.warn("exiting task thread " + threading.currentThread().getName())

    def run(self):
        """Run forever: start the download worker threads, then run the
        URL-loading loop in the calling thread until interrupted."""
        self.downloadThreads = []
        for i in range(0, self.numDownloadThreads):
            t = threading.Thread(target = self.taskThread, daemon = True)
            t.start()
            self.downloadThreads.append(t)
        try:
            self.loadNewUrls()
        except (KeyboardInterrupt, SystemExit):
            logging.error("interruppted, now exiting...")
        except:
            logging.error("exception occurred in main thread")
            traceback.print_exc()