def __init__(self, downloaddir, blogName, secretFile, dbdir = 'db', numRetriesPerUrl = 2):
    """Set up storage directories and open the persistent download database.

    Args:
        downloaddir: directory downloaded videos are written to (created if missing).
        blogName: tumblr blog whose likes are downloaded.
        secretFile: path of the file whose first line is the tumblr API key.
        dbdir: directory holding the persistent HardDict state (default 'db').
        numRetriesPerUrl: download attempts per URL before giving up (default 2).
    """
    self.dbdir = dbdir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(self.dbdir, exist_ok=True)
    self.downloaddir = downloaddir
    os.makedirs(self.downloaddir, exist_ok=True)
    # persistent map of already-downloaded URLs (project-local HardDict —
    # presumably a disk-backed dict; TODO confirm semantics)
    self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
    self.numRetriesPerUrl = numRetriesPerUrl
    self.blogName = blogName
    self.secretFile = secretFile
def __init__(self, downloaddir, urlfile, dbdir = 'db', urlLoadInterval = 5, numDownloadThreads = 5, numRetriesPerUrl = 2):
    """Create directories, open the persistent state and reset unfinished work.

    Args:
        downloaddir: directory finished downloads land in (created if missing).
        urlfile: path of the URL source file (stored; usage is in the daemon loop).
        dbdir: directory holding the persistent HardDict files (default 'db').
        urlLoadInterval: poll interval setting (stored, not used here).
        numDownloadThreads: size of the download worker pool.
        numRetriesPerUrl: download attempts per URL before giving up.
    """
    self.dbdir = dbdir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(self.dbdir, exist_ok=True)
    self.downloaddir = downloaddir
    os.makedirs(self.downloaddir, exist_ok=True)
    # finished URL -> local path, guarded by its own lock
    self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
    self.downloadedDBLock = threading.Lock()
    # pending URL -> local path ('' means not started), guarded by its own lock
    self.newDB = HardDict(os.path.join(self.dbdir, "new.dat"))
    self.newDBLock = threading.Lock()
    self.taskQueue = queue.Queue()    # (url, localpath) waiting to be downloaded
    self.finishQueue = queue.Queue()  # (url, localpath) finished, to be committed
    self.urlfile = urlfile
    self.urlLoadInterval = urlLoadInterval
    self.numDownloadThreads = numDownloadThreads
    self.numRetriesPerUrl = numRetriesPerUrl
    self.halt = False
    # clear partially downloaded files; snapshot the keys so we don't
    # mutate the dict while iterating it
    for u in list(self.newDB.keys()):
        self.newDB[u] = ''
    self.FlushDict()
class DownloadLikes:
    """Downloads the videos liked by a tumblr blog.

    Pages through the blog's likes via the tumblr v2 API and downloads each
    post's video into ``downloaddir``. Finished URLs are recorded in a
    persistent HardDict so reruns skip already-downloaded posts.
    """

    def __init__(self, downloaddir, blogName, secretFile, dbdir = 'db', numRetriesPerUrl = 2):
        """Set up storage directories and open the persistent download database.

        Args:
            downloaddir: directory downloaded videos are written to (created if missing).
            blogName: tumblr blog whose likes are downloaded.
            secretFile: path of the file whose first line is the tumblr API key.
            dbdir: directory holding the persistent HardDict state (default 'db').
            numRetriesPerUrl: download attempts per URL before giving up (default 2).
        """
        self.dbdir = dbdir
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(self.dbdir, exist_ok=True)
        self.downloaddir = downloaddir
        os.makedirs(self.downloaddir, exist_ok=True)
        self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
        self.numRetriesPerUrl = numRetriesPerUrl
        self.blogName = blogName
        self.secretFile = secretFile

    def close(self):
        """Close the persistent download database."""
        logging.info("closing...")
        self.downloadedDB.close()

    def FlushDict(self):
        """Flush the persistent download database to disk."""
        self.downloadedDB.flush()

    def MakeLocalPath(self, url):
        """Derive a local .mp4 path from the URL's last path segment.

        NOTE(review): min(len(fname) - 1, 20) always drops the final character
        of names shorter than 22 chars — looks like an off-by-one, but it is
        preserved so existing downloads keep their filenames.
        """
        fname = url.split('/')[-1].strip("\r\n")
        fname = fname[0:min(len(fname) - 1, 20)]
        fname = fname + ".mp4"
        localpath = self.downloaddir + "/" + fname
        return localpath

    def Download(self):
        """Fetch the blog's liked URLs and download every video not yet seen.

        Each URL is retried up to numRetriesPerUrl times; URLs that still fail
        are appended to failed.txt. KeyboardInterrupt/SystemExit propagate.
        """
        try:
            # get liked urls
            liked = self.getLikes()
            for url in liked:
                if url in self.downloadedDB:
                    logging.info("url {} has already been downloaded".format(url))
                    continue
                logging.warning("downloading {}".format(url))
                success = False
                for i in range(self.numRetriesPerUrl):
                    try:
                        self.downloadOneUrl(url, self.MakeLocalPath(url))
                        self.downloadedDB[url] = url
                        success = True
                        break
                    except (KeyboardInterrupt, SystemExit):
                        # never swallow interrupts/exit requests in the retry loop
                        raise
                    except Exception:
                        logging.warning("error occurred on {}th downloading from {}".format(i, url))
                if not success:
                    # append ('a', was 'w'): mode 'w' truncated the file on every
                    # failure, so only the last failed URL was ever kept
                    with open('failed.txt', 'a') as f:
                        f.write(url + '\n')
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            logging.warning("error occurred when loading urls")
            traceback.print_exc()

    def downloadOneUrl(self, url, local_path):
        """Download a url to local_path."""
        tumblr.downloadTumblrVideo(url, local_path)

    def getLikes(self):
        """Return the post URLs of all the blog's likes, paging through the API.

        Raises:
            Exception: if the secret file's first line is empty (no API key).
        """
        base_url = 'https://api.tumblr.com/v2/blog/{}/likes?api_key={}&offset={}'
        header = {}
        with open(self.secretFile, 'r') as secretfile:
            api_key = secretfile.readline().strip('\n')
            if not api_key:
                raise Exception("error opening secret file")
        offset = 0
        liked_urls = []
        while True:
            url = base_url.format(self.blogName, api_key, offset)
            req = urllib.request.Request(url, None, headers=header)
            resp = urllib.request.urlopen(req)
            s = resp.read().decode()
            js = json.loads(s)['response']
            total_count = js['liked_count']
            posts = js['liked_posts']
            urls = [p['post_url'] for p in posts]
            liked_urls = liked_urls + urls
            offset = offset + len(urls)
            logging.warning('got {} likes'.format(offset))
            # stop once everything was seen or the API returns an empty page
            if offset >= total_count or len(urls) == 0:
                break
        return liked_urls
class DownloadDaemon:
    """Long-running download service.

    run() starts numDownloadThreads worker threads draining taskQueue, then
    the calling thread loops in loadNewUrls() feeding the queue. Persistent
    state lives in two HardDicts: downloadedDB (finished URL -> local path)
    and newDB (pending URL -> local path, '' while not yet started), each
    guarded by its own lock.
    NOTE(review): logging.warn and threading.currentThread() are deprecated
    aliases (logging.warn removed in Python 3.13) — consider migrating to
    logging.warning / threading.current_thread().
    """

    def __init__(self, downloaddir, urlfile, dbdir = 'db', urlLoadInterval = 5, numDownloadThreads = 5, numRetriesPerUrl = 2):
        """Create directories, open the persistent state and reset unfinished work.

        downloaddir: directory finished downloads land in (created if missing).
        urlfile: stored only — not referenced elsewhere in this chunk; TODO
            confirm caller usage.
        urlLoadInterval: stored only — the poll loop below sleeps a hard-coded
            120 seconds instead.
        """
        self.dbdir = dbdir
        if (not os.path.exists(self.dbdir)):
            os.makedirs(self.dbdir)
        self.downloaddir = downloaddir
        if (not os.path.exists(self.downloaddir)):
            os.makedirs(self.downloaddir)
        self.downloadedDB = HardDict(os.path.join(self.dbdir, "download.dat"))
        self.downloadedDBLock = threading.Lock()
        self.newDB = HardDict(os.path.join(self.dbdir, "new.dat")); self.newDBLock = threading.Lock()
        self.taskQueue = queue.Queue()    # (url, localpath) waiting to be downloaded
        self.finishQueue = queue.Queue()  # (url, localpath) finished, to be committed
        self.urlfile = urlfile
        self.urlLoadInterval = urlLoadInterval
        self.numDownloadThreads = numDownloadThreads
        self.numRetriesPerUrl = numRetriesPerUrl
        self.halt = False
        # clear partially downloaded files
        for u,p in self.newDB.items():
            self.newDB[u] = ''
        self.FlushDict()

    def __del__(self):
        """Close both persistent dicts under their locks."""
        logging.info("calling destructor of daemon")
        with self.downloadedDBLock:
            self.downloadedDB.close()
        with self.newDBLock:
            self.newDB.close()

    def FlushDict(self):
        """Flush both persistent dicts to disk."""
        self.newDB.flush()
        self.downloadedDB.flush()

    def MakeLocalPath(self, url):
        """Derive a local .mp4 path from the URL's last path segment.

        NOTE(review): min(len(fname) - 1, 20) always drops the last character
        of short names — looks like an off-by-one; confirm before changing,
        since changing it would rename already-downloaded files.
        """
        fname = url.split('/')[-1].strip("\r\n")
        fname = fname[0:min(len(fname) - 1, 20)]
        fname = fname + ".mp4"
        localpath = self.downloaddir + "/" + fname
        return localpath

    def loadNewUrls(self):
        """Producer loop: requeue pending work, then poll dumplikes.py forever.

        Each cycle runs dumplikes.py (hard-coded Windows Python 2.7 path —
        assumes a c:\\python27 install; TODO confirm), parses 'url: ...' lines
        from its stdout, enqueues unseen URLs, then commits finished downloads
        from finishQueue into downloadedDB and flushes. Sleeps 120s between
        cycles and only returns on an unexpected outer exception.
        """
        try:
            # load existing urls
            with self.newDBLock:
                for u, p in self.newDB.items():
                    localpath = p
                    if (localpath == ''):
                        # make a local path
                        localpath = self.MakeLocalPath(u)
                        self.newDB[u] = localpath
                    self.taskQueue.put((u, localpath))
            while(True):
                try:
                    # dump likes into urlfile
                    proc = subprocess.Popen(["c:\\python27\\python.exe", \
                        os.path.dirname(os.path.realpath(__file__)) + "\\dumplikes.py", \
                        "secret.txt"], stdout=subprocess.PIPE)
                    out, err = proc.communicate()
                    # parse urls
                    for line in out.decode('utf-8').split('\n'):
                        line = line.strip()
                        if (line.startswith('url: ')):
                            url = line[len('url: '):]
                        else:
                            continue
                        url = url.strip("\r\n")
                        # skip anything already downloaded or already queued;
                        # each check holds only its own dict's lock
                        with self.downloadedDBLock:
                            if url in self.downloadedDB:
                                logging.info("url {} has already been downloaded".format(url))
                                continue
                        with self.newDBLock:
                            if url in self.newDB:
                                logging.info("url {} already in newDB".format(url))
                                continue
                            else:
                                logging.info("url {} inserted into newDB".format(url))
                                self.newDB[url] = ''
                        self.taskQueue.put((url, self.MakeLocalPath(url)))
                    # remove finished urls
                    try:
                        while(True):
                            item = self.finishQueue.get(timeout = 1)
                            with self.downloadedDBLock:
                                self.downloadedDB[item[0]] = item[1]
                            with self.newDBLock:
                                self.newDB.pop(item[0])
                    except(queue.Empty):
                        # finish queue drained for this cycle
                        pass
                    self.FlushDict()
                except(KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.warn("error occurred when loading urls")
                    traceback.print_exc()
                time.sleep(120)
        except:
            logging.warn("exception occurred in load url thread " + threading.currentThread().getName())
            traceback.print_exc()
        logging.warn("thread exiting... " + threading.currentThread().getName())

    def downloadOneUrl(self, url, local_path):
        """Download a url to local_path.

        Results under 10 KiB are treated as probable downloader error pages:
        the url is appended to errorurls.txt, the file is deleted, and
        IOError is raised so the caller's retry loop can try again.
        """
        tumblr.downloadTumblrVideo(url, local_path)
        if (os.stat(local_path).st_size < 10*1024):
            with open('errorurls.txt', 'a+') as f:
                f.write(url + u'\n')
            os.remove(local_path)
            raise IOError("file size too small, probably dredown error")

    def taskThread(self):
        """Worker loop: take (url, localpath) items off taskQueue, download
        with up to numRetriesPerUrl attempts, and report successes on
        finishQueue. Runs forever (started as a daemon thread by run())."""
        logging.info("thread " + threading.currentThread().getName() + " started as download thread")
        while(True):
            try:
                item = self.taskQueue.get()
                url = item[0]
                localpath = item[1]
                if (localpath == ''):
                    # fall back to a timestamp-based filename
                    localpath = str(time.time()) + '.mp4'
                # start download
                logging.info("downloading {} to {}".format(url, localpath))
                success = False
                for i in range(0, self.numRetriesPerUrl):
                    try:
                        self.downloadOneUrl(url, localpath)
                        success = True
                        break
                    except(KeyboardInterrupt, SystemExit):
                        raise
                    except:
                        logging.warn("download of {0} failed on the {1} try".format(url, i))
                        traceback.print_exc()
                if (success):
                    logging.info("successfully downloaded {} to {}".format(url, localpath))
                    self.finishQueue.put((url, localpath))
                else:
                    logging.warn("failed when downloading {} to {}".format(url, localpath))
            except:
                logging.warn("exception occurred in thread "+ threading.currentThread().getName())
                traceback.print_exc()
        # unreachable: the while(True) loop above never breaks
        logging.warn("exiting task thread " + threading.currentThread().getName())

    def run(self):
        """Run forever: start the download worker threads, then run the
        URL-loading loop in the calling thread until interrupted."""
        self.downloadThreads = []
        for i in range(0, self.numDownloadThreads):
            t = threading.Thread(target = self.taskThread, daemon = True)
            t.start()
            self.downloadThreads.append(t)
        try:
            self.loadNewUrls()
        except (KeyboardInterrupt, SystemExit):
            logging.error("interruppted, now exiting...")
        except:
            logging.error("exception occurred in main thread")
            traceback.print_exc()