def download_file_parallel(url, target_path, show_progress=False, num_threads=1):
    """Download the file from the given `url` and store it at `target_path`.

    Return a tuple x (url, bool, str). x[0] contains the url. If download
    failed x[1] is ``False`` and x[2] contains some error message. If
    download was fine x[1] is ``True`` and x[2] contains the target-path.

    NOTE(review): as written this function always returns ``True``; the
    failure shape described above is never produced here — confirm whether
    errors surface as exceptions from the Downloader instead.
    """
    downloader = Downloader(url, target_path, num_threads)
    downloader.start()

    pbar = None
    if show_progress:
        # Wait until the downloader has learned the total file size.
        # Sleep briefly instead of busy-spinning so we do not pin a CPU core.
        while downloader.total_length == 0:
            time.sleep(0.05)

        pbar = tqdm(total=downloader.total_length, desc='Download File',
                    unit_scale=True)

        def update_pbar(x):
            # The callback reports cumulative bytes; feed tqdm the delta.
            pbar.update(x.total_downloaded - pbar.n)

        downloader.subscribe(update_pbar, 10)

    try:
        downloader.wait_for_finish()
    finally:
        # Close the bar even if the wait raises, so the terminal is restored.
        if pbar is not None:
            pbar.close()

    return (url, True, target_path)
def download_file_parallel(url, target_path, num_threads=1):
    """Download the file from the given `url` and store it at `target_path`.

    Return a tuple x (url, bool, str). x[0] contains the url. If download
    failed x[1] is ``False`` and x[2] contains some error message. If
    download was fine x[1] is ``True`` and x[2] contains the target-path.

    NOTE(review): a second definition of ``download_file_parallel`` — it
    shadows the earlier progress-bar variant at module load; confirm which
    one is intended to win.
    """
    downloader = Downloader(url, target_path, num_threads)
    downloader.start()

    # Wait until we know the file size; sleep briefly instead of
    # busy-spinning so the wait does not pin a CPU core.
    while downloader.total_length == 0:
        time.sleep(0.05)

    file_size = downloader.total_length
    logger.info('Download file from "%s" with size: %d B', url, file_size)

    bytes_at_last_log = 0

    def callback(x):
        # Throttle progress logging: only emit a line after at least
        # PROGRESS_LOGGER_BYTE_DELAY new bytes have arrived.
        nonlocal bytes_at_last_log
        if x.total_downloaded - bytes_at_last_log >= PROGRESS_LOGGER_BYTE_DELAY:
            logger.info('Download [%06.2f%%]',
                        x.total_downloaded / file_size * 100)
            bytes_at_last_log = x.total_downloaded

    downloader.subscribe(callback, 10)
    downloader.wait_for_finish()
    logger.info('Finished download')
    return (url, True, target_path)
def d(self, item, vpath, jpath, spider):
    """Download the item's video to `vpath`, then write its metadata JSON to
    `jpath` and record the item as processed.

    Returns early (writing nothing) when the downloaded file fails
    ``self.check_video``, i.e. when no duration could be determined.
    """
    # pget's Downloader: /usr/local/lib/python3.6/site-packages/pget
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Referer': 'https://www.google.com/'
    }
    video_dl = Downloader(item['url_video'], vpath, chunk_count=0,
                          high_speed=True, headers=request_headers,
                          proxies=spider.proxies)
    video_dl.start()
    video_dl.wait_for_finish()

    duration = self.check_video(vpath)
    if not duration:
        return

    with open(jpath, 'w+') as meta_file:
        timestamp = int(time.time())
        # Signature: md5 of a fixed salt formatted with the current epoch.
        digest = hashlib.md5()
        digest.update("sc%7*g{}@!$%".format(timestamp).encode(encoding='utf-8'))
        payload = {
            'time': timestamp,
            'sig': digest.hexdigest(),
            'name': item['videoName'],
            'area': 'us',
            'cate': 'Beauty',
            'year': 2019,
            'director': '',
            'actor': '',
            'type': 'movie',
            'total': 1,
            'cover_url': item['name'] + '.jpg',
            'grade': 2.0,
            'mins': duration,
            'source_url': item['name'] + '.mp4',
            'resolution': item['resolution'],
            'part': 1,
            'intro': ''
        }
        json.dump(payload, meta_file)

    spider.all_data.append(item['name'])
    write_data.open_data('dataList', item['name'] + '\n')
def download_file(url, target_path):
    """Download `url` to `target_path` with 8 threads, showing a tqdm bar."""
    downloader = Downloader(url, target_path, 8)
    downloader.start()

    # Wait until the total size is known; sleep briefly instead of
    # busy-spinning so the wait does not pin a CPU core.
    while downloader.total_length == 0:
        time.sleep(0.05)

    pbar = tqdm(total=downloader.total_length, desc='Download File',
                unit_scale=True)

    def update_pbar(x):
        # The callback reports cumulative bytes; feed tqdm the delta.
        pbar.update(x.total_downloaded - pbar.n)

    downloader.subscribe(update_pbar, 10)
    try:
        downloader.wait_for_finish()
    finally:
        # Restore the terminal even if the download raises.
        pbar.close()
def readExtractedfiles():
    """Walk the extracted codes CSVs under DEFAULT_DATA_PATH and, for the
    first five code rows of each file, fetch dataset metadata from Quandl
    and download the dataset's data.csv into a per-database/per-dataset
    folder tree below `rootfolder`.

    Errors on one codes file are logged and skipped so the remaining files
    are still processed. (The previous ``except: raise`` / outer
    ``except: pass`` pair silently aborted the whole run on the first
    error, and the ``continue`` after ``raise`` was unreachable.)
    """
    folderconvey = getCodesInCSVsForAllDatasets(quandl_apikey)

    q_data_base_URL = "https://www.quandl.com/api/v3/datasets/{0}"

    filenamesList = []
    for (dirpath, dirnames, filenames) in walk(DEFAULT_DATA_PATH):
        filenamesList.extend(filenames)

    for fn in filenamesList:
        print(fn)
        try:
            logging.info(fn + " extracted.")
            codesFile = os.path.abspath(os.path.join(DEFAULT_DATA_PATH, fn))
            with open(codesFile, 'r') as csv_file:
                csvlines = csv_file.readlines()

            # Only the first five rows of each codes file are processed.
            for num, line in enumerate(csvlines[:5]):
                codeline = line.split(',')
                if len(codeline) <= 1:
                    continue
                dataset_code = codeline[0]
                download_url = q_data_base_URL.format(dataset_code)
                data_URL = download_url + "?api_key=" + quandl_apikey
                time.sleep(1)  # crude rate-limit against the Quandl API

                # NOTE(review): building a shell command by concatenation is
                # injection-prone; acceptable only because dataset codes come
                # from Quandl's own CSVs — consider urllib/requests instead.
                resp = os.popen("curl " + data_URL)
                json_data = json.loads(resp.read())

                foldername = json_data["dataset"]["name"]
                dat_code = json_data["dataset"]["database_code"]
                # Sanitize to snake_case: strip non-alphanumerics, collapse
                # runs of spaces, lower-case.
                foldername = re.sub("[^A-Za-z0-9 ]+", "", foldername)
                foldername = re.sub(" +", " ", foldername).replace(
                    " ", "_").lower()
                print(">>>>>>>" + foldername)

                # Dict lookup replaces the previous linear scan over the keys.
                if dat_code not in folderconvey:
                    continue
                out_fldr_name = re.sub("[^A-Za-z0-9 ]+", "",
                                       folderconvey[dat_code])
                out_fldr_name = re.sub(" +", " ", out_fldr_name).replace(
                    " ", "_").lower()

                try:
                    os.chdir(rootfolder)
                    if not os.path.isdir(out_fldr_name):
                        os.mkdir(out_fldr_name)
                    os.chdir(out_fldr_name)
                    if not os.path.isdir(foldername):
                        os.mkdir(foldername)
                    os.chdir(foldername)
                except OSError:
                    # Was WindowsError (a NameError off Windows); OSError
                    # covers it on every platform.
                    continue

                fileformat = ".csv"
                code_part = dataset_code.split('/')[1]
                # NOTE(review): existence is tested against
                # '<code>-datasets-codes.csv' but the download is saved as
                # '<code>.csv' — looks inconsistent; preserved as-is.
                if not os.path.isfile(code_part + '-datasets-codes'
                                      + fileformat):
                    urll = download_url + "/data.csv"
                    downloader = Downloader(urll, code_part + fileformat, 8)
                    downloader.start()
                    downloader.wait_for_finish()
        except Exception:
            logging.exception("Failed processing %s; skipping", fn)
            continue
def LLloop():
    """Process one work item from the LLprogress file, then sleep 1 s.

    Each item is a JSON list ``[url, subdir, chunk_count, dl_type]``.
    dl_type 1 downloads with pget's Downloader, dl_type 2 shells out to
    curl. The result line is appended to cgi-bin/LLresult when the target
    file exists afterwards, else to cgi-bin/LLerrors. When LLprogress is
    (effectively) empty, the next queued item is promoted from LLqueue.

    Fixes over the previous version: every file handle is closed via
    ``with``; the builtin name ``type`` is no longer shadowed; the queue
    rotation now reads the remainder of LLqueue *before* reopening it with
    mode "w" (the old code truncated the file while copyfileobj was still
    reading from the same handle, losing the queued items).
    """
    base = os.path.dirname(__file__)

    with open(base + '/cgi-bin/LLprogress', "r+", encoding='utf-8') as progress:
        item = progress.read().split('\n', 1)[0]

    if len(item) > 10:
        LL_item = json.loads(item)
        url = LL_item[0]
        filename = 'files' + LL_item[1] + '/' + os.path.basename(LL_item[0])
        chunk = int(LL_item[2])
        dl_type = int(LL_item[3])
        print('Start Downloading [' + url + ']')

        if dl_type == 1:
            downloader = Downloader(url, filename, chunk)
            downloader.start()
            downloader.wait_for_finish()
        if dl_type == 2:
            # NOTE(review): URL concatenated into a shell command is
            # injection-prone if the queue file is writable by others.
            LLcurl = 'curl -k ' + url + ' --output ' + filename
            os.system(LLcurl)

        datastring = '["' + url + '"' + ',' + '"' + filename + '"' + ',' + \
            '"' + LL_item[2] + '"' + ',' + '"' + LL_item[3] + '"' + "]\n"

        # Success goes to LLresult, a missing output file to LLerrors.
        target = ('/cgi-bin/LLresult' if os.path.isfile(filename)
                  else '/cgi-bin/LLerrors')
        with open(base + target, "a", encoding='utf-8') as out:
            out.write(datastring)
        print('Checking for next ...')
    else:
        print('Waiting mode ...')
        with open(base + '/cgi-bin/LLqueue', "r+", encoding='utf-8') as queue:
            new_item = queue.read().split('\n', 1)[0]

        if len(new_item) > 10:
            # Promote the first queued item into LLprogress ...
            with open(base + '/cgi-bin/LLprogress', "w",
                      encoding='utf-8') as prog:
                prog.write(new_item + "\n")
            # ... and drop it from LLqueue: read the remainder first, then
            # rewrite the file (reading after truncation loses the data).
            with open(base + '/cgi-bin/LLqueue', "r+",
                      encoding='utf-8') as old_queue:
                old_queue.readline()
                remainder = old_queue.read()
            with open(base + '/cgi-bin/LLqueue', "w",
                      encoding='utf-8') as new_queue:
                new_queue.write(remainder)
        else:
            with open(base + '/cgi-bin/LLprogress', "w",
                      encoding='utf-8') as prog:
                prog.write('' + "\n")
            print('No new Item !')
    time.sleep(1)
# Fragment: relies on `name`, `original_url`, and `rootfolder` defined in the
# surrounding (not visible) scope — presumably inside a loop over datasets.
fileformat = ".csv"
# Sanitize the dataset name into a snake_case filename with a .csv suffix.
filename = re.sub("[^A-Za-z0-9 ]", "", name).lower().replace(
    " ", "_") + fileformat
# Only handle direct .csv URLs here (the zip path below is disabled).
if original_url.split(".")[-1] == 'csv':
    os.chdir(rootfolder)
    # NOTE(review): a directory is created with the same name as the file
    # (including the ".csv" suffix), we chdir into it, then test for a file
    # of the identical name inside it — looks suspicious; confirm intent.
    if not os.path.isdir(filename):
        os.mkdir(filename)
    os.chdir(filename)
    if not os.path.isfile(filename):
        # 8-threaded parallel download via pget's Downloader.
        downloader = Downloader(original_url, filename, 8)
        downloader.start()
        print "came here"
        print "downloading file " + filename
        downloader.wait_for_finish()
# Disabled zip-archive handling kept as a string literal (dead code).
'''if original_url.split(".")[-1] == 'zip':
    #url = urllib.urlopen(original_url)
    #zip_file = ZipFile(StringIO(url.read()))
    #files = zipfile.namelist()
    #fopen = open(filename+'.csv', 'w')
    #zipcontent = url.read()
    downloader = Downloader(original_url, filename+".zip", 8)
    downloader.start()
    downloader.wait_for_finish()
    print "comitted here"
    with zipfile.ZipFile(filename+".zip", "r") as zfr:
        zfr.extractall(filename)
    os.chdir(rootfolder)
    os.remove(filename+".zip")'''
def extractFromJSON(domain, datasets_colln):
    """For every dataset document in `datasets_colln`, pick a resource
    format and download each resource of that format into a folder named
    after the dataset under `root_folder`.

    Format selection keeps the original semantics: the first resource
    (in document order) whose format string contains any preferred token
    decides, using this priority within that resource:
    JSONL, jsonl, CSV, csv, JSON, json. Datasets with no match are skipped.

    Fixes: removed the unreachable ``res_format is None`` branch inside the
    download loop (we ``continue`` earlier when it is None) and narrowed the
    bare ``except`` so KeyboardInterrupt still aborts.
    """
    preferred = ('JSONL', 'jsonl', 'CSV', 'csv', 'JSON', 'json')

    datasets_cursor = datasets_colln.find()
    print(datasets_cursor)
    for dataset in datasets_cursor:
        dataset_name = dataset["name"]
        print(">>>> " + dataset_name)

        # Snapshot each resource's format/url/id.
        available_formats = {}
        for i, res in enumerate(dataset["resources"]):
            available_formats[i] = {
                "format": res["format"],
                "url": res["url"],
                "filename": res["id"]
            }

        # First resource matching any preferred token decides the format.
        res_format = None
        for a in available_formats.values():
            res_format = next(
                (p for p in preferred if p in a["format"]), None)
            if res_format is not None:
                break
        if res_format is None:
            continue

        # Collect every resource whose format matches the chosen one.
        res_urls = {}
        for a in available_formats.values():
            if res_format in a["format"]:
                res_urls[a["url"]] = a["filename"]

        os.chdir(root_folder)
        if not os.path.isdir(dataset_name):
            os.mkdir(dataset_name)
        os.chdir(dataset_name)

        print(str(res_format) + " :: " + str(len(res_urls)))
        for res_url, base_name in res_urls.items():
            file_name = base_name + "." + res_format.lower()
            print("Downloading... " + file_name)
            try:
                if not os.path.isfile(file_name):
                    sleep(1)  # crude rate-limit between downloads
                    downloader = Downloader(res_url, file_name, 8)
                    downloader.start()
                    downloader.wait_for_finish()
            except Exception:
                print("Error @ " + dataset_name)
                continue