def download_from_urls(data):
    """Download the first URL of every entry in data["StandardData"].

    Each file is saved under ./data/download/ using the last path
    segment of its URL as the file name.  Returns the saved names in
    input order.
    """
    saved = []
    for entry in data["StandardData"]:
        first_url = entry["urls"][0]
        name = first_url.rsplit('/', 1)[-1]
        download.download_file(first_url, "./data/download/" + name)
        saved.append(name)
    return saved
def down():
    """Fetch the Google Landmark train-attribution CSV into the CWD."""
    # Alternative dataset kept for reference:
    # 'https://s3.amazonaws.com/google-landmark/metadata/train.csv'
    url = 'https://s3.amazonaws.com/google-landmark/metadata/train_attribution.csv'
    download_file(url, './train_attribution.csv')
def Run():
    """Download every image listed in the test CSV into out_dir.

    Each (key, url) pair from ParseData is fetched to <out_dir>/<key>.jpg,
    skipping files that already exist and retrying each download up to
    five times with a 10 s back-off after a failure.
    """
    # Original argv handling kept for reference:
    # if len(sys.argv) != 3:
    #     print('Syntax: %s <data_file.csv> <output_dir/>' % sys.argv[0])
    #     sys.exit(0)
    # (data_file, out_dir) = sys.argv[1:]
    data_file = './test_csv/test.csv'
    # Bug fix: out_dir was commented out but still referenced below,
    # so the function raised NameError on its first use.
    out_dir = './test_images'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    key_url_list = ParseData(data_file)
    for key, url in tqdm.tqdm(key_url_list):
        filename = os.path.join(out_dir, '%s.jpg' % key)
        if os.path.exists(filename):
            print('Image %s already exists. Skipping download.' % filename)
            continue
        for ti in range(5):
            try:
                download_file(url, filename)
                break
            except Exception as e:
                # Remove any partial file before retrying.
                if os.path.exists(filename):
                    os.remove(filename)
                time.sleep(10)
        time.sleep(3)
def download_all(url, path, prefix, ext=".mp3"):
    """Download every podcast from the RSS feed.

    Arguments:
        url    -- address of the RSS feed
        path   -- directory where the files are saved
        prefix -- start of the file name
        ext    -- file extension (probably always .mp3)

    Afterwards the mp3 ID3 tags are modified:
    - the album tag becomes month and year
    - the title tag becomes the podcast date

    Downloads go through the download module, which also displays a
    progress bar.
    """
    try:
        usock = urllib2.urlopen(url)
    except IOError:
        print("Nieprawidłowy adres lub błąd połączenia")
        sys.exit(1)
    parser = TokFmPodcastsParser(usock)
    data = parser.get_all()
    for d in data:
        filename = prefix + "-" + "_".join(d["date"]) + ext
        file_path = os.path.join(path, filename)
        if os.path.isfile(file_path):
            print("{0} istnieje, pomijam go".format(filename))
        else:
            print("{0}".format(filename))
            download.download_file(d["url"], file_path)
            # Set the album tag to month.year and the title tag to the
            # full podcast date.
            title = ".".join(d["date"])
            album = ".".join(d["date"][1:])
            edit_id3(file_path, title, album)
def download_from_urls(data):
    """Fetch the first URL of each StandardData entry into ./data/download/.

    Returns the basenames of the downloaded files, in input order.
    """
    filenames = []
    for obj in data["StandardData"]:
        target = obj["urls"][0]
        fname = target.split('/')[-1]
        download.download_file(target, "./data/download/" + fname)
        filenames.append(fname)
    return filenames
def get_file(self):
    """Download this post's file to a temporary location (lazily, once).

    Returns the path of the temporary file holding the download.
    """
    if self.temp_file is None:  # fixed: identity test instead of `== None`
        fd, self.temp_file = tempfile.mkstemp('_e621dl.' + self.file_ext,
                                              str(self.id))
        # NOTE(review): the open file OBJECT (not the path) is handed to
        # download.download_file, and in text mode — confirm this matches
        # download_file's expected signature for binary content.
        with open(self.temp_file, 'w') as temp:
            download.download_file(self.file_url, temp)
        # mkstemp's raw descriptor must be closed separately.
        os.close(fd)
    return self.temp_file
def get_preview(self):
    """Download this post's preview image to a temp location (lazily, once).

    Returns the path of the temporary preview file.
    """
    if self.temp_preview is None:  # fixed: identity test instead of `== None`
        # Extension = text after the last '.' (whole URL if none);
        # equivalent to the original double-reverse split trick.
        extension = self.preview_url.rsplit('.', 1)[-1]
        fd, self.temp_preview = tempfile.mkstemp('_e621dl.' + extension,
                                                 str(self.id))
        # NOTE(review): the open file OBJECT is passed to
        # download.download_file in text mode — confirm expected signature.
        with open(self.temp_preview, 'w') as temp:
            download.download_file(self.preview_url, temp)
        # mkstemp's raw descriptor must be closed separately.
        os.close(fd)
    return self.temp_preview
def get_hubble(image_id, ext='jpeg'):
    """Download every Hubble image file with the given extension.

    Queries the hubblesite.org v3 API for *image_id* and saves each
    matching file as Hubble-<id>.<ext>.  Note each download reuses the
    same name, so later files overwrite earlier ones (original logic).
    """
    # (removed unused local `status_ready` from the original)
    response = requests.get(
        'http://hubblesite.org/api/v3/image/{}'.format(image_id))
    data = response.json()['image_files']
    image_name = "Hubble-{}.{}".format(image_id, ext)
    loading_list = [
        image['file_url'] for image in data
        if image['file_url'].endswith(ext)
    ]
    for picture_url in loading_list:
        download_file(picture_url, image_name)
def idempotent_download(self, path) -> bool:
    """Download this file under *path* unless it already exists.

    Returns True when the file is present afterwards (downloaded now
    or previously), False when the download attempt failed.
    """
    name = self.file_type + ('.zip' if self.zipped else '')
    file_path = path / name
    if not file_path.exists():
        log.info('Downloading new file `%s` from URL `%s`',
                 file_path, self.to_url())
        try:
            download_file(self.to_url(), file_path)
            return True
        except Exception:
            # Bug fix: the original supplied no arguments for the two
            # %s placeholders, so the warning message was never
            # formatted correctly.
            log.warning('Failed to download file `%s` from URL `%s`',
                        file_path, self.to_url(), exc_info=True)
            return False
    else:
        log.info('Skipping download of file `%s`', file_path)
        return True
def _download_data(state):
    """Fetch the census archive for *state* and unzip it into the
    expected directory."""
    archive = download.download_file(_ftp_path(state), state.name)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(_get_path(state))
def _retrieve_uniprot_file(uniprot_acc, download_dir):
    """Download a single Uniprot record by accession.

    Returns the path of the downloaded <accession>.txt file.
    """
    # Cleanup: the original first built `uniprot_url % acc` and
    # immediately overwrote it — only the db_fetch_url form was used.
    src = db_fetch_url % uniprot_acc
    dst = os.path.join(download_dir, '%s.txt' % uniprot_acc)
    dst = download_file(src, dst)
    return dst
def get_us_county_shapefiles():
    """Load the 2018 US county shapefile, downloading it if missing.

    Returns a GeoDataFrame with all US counties.
    """
    filepath = CENSUS_DATA_PATH / 'tl_2018_us_county' / 'tl_2018_us_county.shp'
    if not filepath.exists():
        # download from internet
        shape_link = 'https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/'\
            +'tl_2018_us_county.zip'
        zipped = download.download_file(shape_link, "US Counties")
        with zipfile.ZipFile(zipped) as z:
            z.extractall(filepath.parent)
    # NOTE(review): this hard-coded 'census_data/...' path matches the
    # download location above only when CENSUS_DATA_PATH is
    # 'census_data' — confirm, or read from `filepath` instead.
    gdf = gpd.read_file('census_data/tl_2018_us_county/tl_2018_us_county.shp')
    return gdf
def download_current(url, path, prefix, ext=".mp3"):
    """Download the current (latest) podcast from the RSS feed.

    Arguments mirror download_all: url is the RSS address, path the
    output directory, prefix the start of the file name, ext the file
    extension.  After downloading, the ID3 album tag is set to
    month.year and the title tag to the full podcast date.
    """
    try:
        usock = urllib2.urlopen(url)
    except IOError:
        print("Nieprawidłowy adres lub błąd połączenia")
        sys.exit(1)
    parser = TokFmPodcastsParser(usock)
    data = parser.get_current()
    filename = prefix + "-" + "_".join(data["date"]) + ext
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        print("{0} istnieje, pomijam go".format(filename))
    else:
        print("{0}".format(filename))
        download.download_file(data["url"], file_path)
        # Set the album tag to month.year and the title tag to the
        # full podcast date.
        title = ".".join(data["date"])
        album = ".".join(data["date"][1:])
        edit_id3(file_path, title, album)
def _batch_retrieve_uniprot_files(uniprot_accessions, download_dir,
                                  dataset_prefix, initial_batch_size=120,
                                  sleep_interval=15):
    """Download Uniprot files in batches.

    Accessions are fetched initial_batch_size at a time.  On a failed
    download the batch size is halved and the same window retried,
    down to a single accession (which is then skipped on failure).
    Returns a dict mapping each requested accession to its parsed
    record tuple (gn, orgn, taxid, pe, geneids).
    """
    N = len(uniprot_accessions)
    batch_size = initial_batch_size
    i = 0  # downloaded records counter
    j = 0  # number of downloaded files
    # expected number of total files to be downloaded
    M = N // batch_size + (1 if N % batch_size else 0)
    entries = []
    while i < N:
        # Inner loop retries the current window, halving batch_size
        # after each failure, until a download succeeds or is skipped.
        while True:
            repeat_download = False
            ids = ','.join(uniprot_accessions[i: i + batch_size])
            src = db_fetch_url % ids
            fn = '%s_uniprot_batch_%s.txt' % (dataset_prefix,
                                              _get_ids_hash(ids))
            dst = os.path.join(download_dir, fn)
            if not os.path.exists(dst):
                print " Downloading file: %s (%d/%d)" % (fn, j+1, M)
                try:
                    dst = download_file(src, dst)
                    time.sleep(sleep_interval)  # wait a little
                except Exception:
                    print " FAILURE"
                    if batch_size > 1:
                        batch_size = batch_size // 2
                        print " Halving batch size to %d." % batch_size
                        repeat_download = True
                    else:
                        print " Skipping %s." % ids
            if not repeat_download:
                entries += _process_uniprot_file(dst)
                i += batch_size
                batch_size = initial_batch_size
                # Recompute the expected total file count for progress.
                r = 1 if (N-i) % batch_size else 0
                j += 1
                M = j + (N-i) // batch_size + r
                break
    # Keep only records whose accession was actually requested.
    acc_map = {}
    for ac, gn, orgn, taxid, pe, geneids in entries:
        for acc in ac:
            if acc in uniprot_accessions:
                acc_map[acc] = (gn, orgn, taxid, pe, geneids)
    return acc_map
def download_and_extract_biogrid(src, download_dir):
    """ Download and extract a BioGRID dataset (zipped).

    Both the download and the extraction are skipped when their target
    file already exists.  Returns the path of the extracted data file.
    """
    pathname = urllib.url2pathname(src)
    biogrid_zipfile = os.path.basename(urlparse.urlsplit(pathname)[2])
    zipfile_path = os.path.join(download_dir, biogrid_zipfile)
    if not os.path.exists(zipfile_path):
        print "Downloading latest BioGRID."
        download_file(src, zipfile_path)
    # allowZip64: BioGRID archives can exceed the 2 GiB zip32 limit.
    zf = zipfile.ZipFile(zipfile_path, 'r', allowZip64=True)
    biogrid_file = _get_archived_filename(zf)
    extracted_path = os.path.join(download_dir, biogrid_file)
    if not os.path.exists(extracted_path):
        print "Extracting latest BioGRID."
        out_fp = open(extracted_path, 'w')
        out_fp.write(zf.read(biogrid_file))
        out_fp.close()
    zf.close()
    return extracted_path
def JC_StrokeCetification(path=RAW_DATA / 'StrokeCertificationList.xlsx'):
    """Load the Joint Commission stroke-certification list as strings.

    Downloads the spreadsheet to *path* first when it is not cached
    locally, then reads it with every column typed as str.
    """
    jc_url = ("https://www.qualitycheck.org/file.aspx?FolderName="
              + "StrokeCertification&c=1")
    if not path.exists():
        download.download_file(jc_url, 'Joint Commission', savedir=path)
    return pd.read_excel(path, dtype=str)
def hh_method(self, num):
    """Dispatch download/patch actions for the BDO localisation GUI.

    num selects the action: 1/11 = simplified-Chinese language pack
    (GitHub / Gitee route), 2/12 = traditional-Chinese pack, 3/13 =
    font only, 4 = restore the US-English pack.
    """
    todir = self.save_path_entry.get()
    font_dir = self.check_bdo_dir()
    tmpdirname = str(gettempdir())
    ads_dir = todir + '\\ads'
    temp_loc_dir = tmpdirname + r'\split_loc'
    temp_font_dir = tmpdirname + r'\split_fonts'
    temp_bdocn_dir = tmpdirname + r'\bdocn_temp'
    tw_loc = 'http://dn.blackdesert.com.tw/UploadData/ads/languagedata_tw.loc'
    github_loc = 'https://github.com/BDO-CnHope/bdocn/raw/master/ads/languagedata_en.loc'
    github_font = 'https://github.com/BDO-CnHope/bdocn/raw/master/prestringtable/font/pearl.ttf'
    gitee_loc = 'https://gitee.com/bdo-cnhope/bdocn/tree/master/split/'
    gitee_font = 'https://gitee.com/bdo-cnhope/bdocn/tree/master/split_font/'
    en_loc_zip = download.download_en_loc()
    # NOTE(review): each elif chain creates at most ONE missing temp
    # directory per pass (behaviour kept from the original).
    try:
        if exists(temp_loc_dir) == False:
            mkdir(temp_loc_dir)
        elif exists(temp_font_dir) == False:
            mkdir(temp_font_dir)
        elif exists(temp_bdocn_dir) == False:
            mkdir(temp_bdocn_dir)
    except:
        self.insert_text('操作错误,请重试...code: 1 \n')
        pass
    else:
        if exists(temp_loc_dir) == False:
            mkdir(temp_loc_dir)
        elif exists(temp_font_dir) == False:
            mkdir(temp_font_dir)
        elif exists(temp_bdocn_dir) == False:
            mkdir(temp_bdocn_dir)
    try:
        if num == 1:
            # Simplified pack via GitHub, only when the hash changed.
            if check_new.get_loc_hash(1) != self.check_loc_hash():
                self.insert_text('正在使用国外线路下载简体汉化语言包…… \n')
                download.download_file(github_loc, ads_dir, 'languagedata_en.loc')
                self.insert_text('简体汉化包已更新! \n')
            else:
                self.insert_text('简体汉化包已是最新的了! \n')
            if check_new.get_font_hash(1) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_file(github_font, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 2:
            self.insert_text('正在下载繁体汉化语言包…… \n')
            download.download_file(tw_loc, ads_dir, 'languagedata_en.loc')
            self.insert_text('繁体汉化包已更新! \n')
            if check_new.get_font_hash(1) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_file(github_font, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 3:
            if check_new.get_font_hash(1) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_file(github_font, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 11:
            # Gitee (domestic) route: download split parts, then join
            # them into one file.
            if check_new.get_loc_hash(2) != self.check_loc_hash():
                self.insert_text('正在使用国内线路下载简体汉化语言包…… \n')
                download.download_split_files(gitee_loc, temp_loc_dir)
                joinfiles.join_files(temp_loc_dir, ads_dir, 'languagedata_en.loc')
                self.insert_text('简体汉化包已更新! \n')
            else:
                self.insert_text('简体汉化包已是最新的了! \n')
            if check_new.get_font_hash(2) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_split_files(gitee_font, temp_font_dir)
                joinfiles.join_files(temp_font_dir, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 12:
            self.insert_text('正在下载繁体汉化语言包…… \n')
            download.download_file(tw_loc, ads_dir, 'languagedata_en.loc')
            self.insert_text('繁体汉化包已更新! \n')
            if check_new.get_font_hash(2) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_split_files(gitee_font, temp_font_dir)
                joinfiles.join_files(temp_font_dir, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 13:
            if check_new.get_font_hash(2) != self.check_font_hash():
                self.insert_text('正在下载字体包…… \n')
                download.download_split_files(gitee_font, temp_font_dir)
                # Bug fix: the original read `joinfiles.join_files, (...)`,
                # which built a tuple and never called the function, so
                # the downloaded font parts were never joined/installed.
                joinfiles.join_files(temp_font_dir, font_dir, 'pearl.ttf')
                self.insert_text('字体包已更新! \n')
            else:
                self.insert_text('字体包已是最新的了! \n')
            showinfo('提示', '汉化已完成!')
        elif num == 4:
            # Restore the original US-English language file.
            self.insert_text('正在重新安装美服英语包…… \n')
            unzip_dir = temp_bdocn_dir + '\\loc'
            download.download_file(en_loc_zip, temp_bdocn_dir, 'BDOLanguage.zip')
            unzip.un_zip(temp_bdocn_dir, 'BDOLanguage.zip', unzip_dir)
            copy(unzip_dir + '\\' + 'languagedata_en.loc', ads_dir)
            self.insert_text('已恢复为美服英语! \n')
            showinfo('提示', '任务已完成!')
    except:
        self.insert_text('操作错误,请重试...code: 2 \n')
        # NOTE(review): cleanup placement kept adjacent to the error
        # handler as in the original; at most one temp dir is removed
        # per call because of the elif chain.
        if exists(temp_loc_dir) == True:
            rmtree(temp_loc_dir)
        elif exists(temp_font_dir) == True:
            rmtree(temp_font_dir)
        elif exists(temp_bdocn_dir) == True:
            rmtree(temp_bdocn_dir)
def sub2save(name, dest, sm):
    """Download sm[1] and bundle it with the match details.

    Returns the tuple (dest, sm[0], sm[1], name, download result).
    """
    sub_key = sm[0]
    sub_url = sm[1]
    fetched = download_file(sub_url)
    return dest, sub_key, sub_url, name, fetched
DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
conn = sqlite3.connect(DATABASE_NAME)
# Bug fix: the scraping body had `except`/`finally` clauses with no
# matching `try:` (a syntax error as written) — the `try:` is restored
# so errors are reported and the connection is always closed.
try:
    # city of zurich - start url
    start_url = 'https://www.stadt-zuerich.ch/ssd/de/index/volksschule/schulferien.html'
    # page for each year
    content = dl.download_content(start_url)
    soup = BeautifulSoup(content, 'html.parser')
    nav = soup.find('li', {'class': 'var_wrapping_node var_active'})
    pages = nav.find_all('a', string=re.compile(r'^\d{4}/\d{2}$'))
    for page in pages:
        year_href = page.get('href')
        year_url = urljoin(start_url, year_href)
        download_url = get_ics_download_url(year_url)
        filename = os.path.basename(download_url)
        file_path = os.path.join(__location__, filename)
        dl.download_file(download_url, file_path)
        print(f"Download URL: {download_url}")
        events = parse_ics.parse_file(file_path)
        insert_or_update(events, conn)
    conn.commit()
except Exception as e:
    print("Error: %s" % e)
    print(traceback.format_exc())
    raise
finally:
    conn.close()
import pymysql.cursors, time
import download,os

# Connection to the local `keep` database.
db = pymysql.connect("localhost", 'root', 'yuwenque', 'keep', charset='utf8mb4')
cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
sql ='''
select avatar,gender,userid from keep_user_info
where gender ='F' and birthday like '199%' or birthday like '200%' or birthday like '201%'
limit 5001,10000
'''
path = '/Users/yuwenque/Downloads/keepuser/'
cursor.execute(sql)
rows = cursor.fetchall()
# Download each user's avatar to <path>/<gender>/<userid>.jpg,
# pausing 10 s after every 10 users.
for idx, row in enumerate(rows):
    photo_name = path + row['gender'] + "/" + row['userid'] + ".jpg"
    if idx % 10 == 0 and idx != 0:
        time.sleep(10)
    try:
        if not os.path.exists(photo_name):
            try:
                download.download_file(row['avatar'], photo_name)
            except Exception as e2:
                # Drop the partial file, report, and back off briefly.
                os.remove(photo_name)
                print(e2)
                time.sleep(3)
    except Exception as e:
        print(e)
def hack_paper():
    """Assemble today's e-paper: download each page's PDF (trying city
    codes in order until one is valid), merge them, e-mail the result
    and mark the cron job done for the day."""
    status = checkCronJob.checkCronStatus()
    print status
    if(status == 0):
        print "JOB's already done"
        return
    todaysDate = Cur_date.getCurDate()
    pdf_docs = []
    pages = get_pages.getPages()+1
    # pages = 2
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for pageno in xrange(1,pages):
        for city in ['smt','mdb','bgh']:
            url = "http://epaper.jagran.com/epaperimages/"+todaysDate+"/muzaffarpur/"+str(Cur_date.getPrevDayDate())+city+"-pg"+ str(pageno) +"-0.pdf"
            # url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"
            print url
            ##sending file path
            ## file path also contains the file name of the downloaded file
            file_path = dir_path + "/" + str(pageno)+".pdf"
            print "Downloading...page no = ", pageno
            download.download_file(url,file_path)
            flag = pdf_merger.check_valid_pdf(file_path)
            if(flag == 0):
                pdf_docs.append(file_path)
                break #As soon as it gets a valid pdf add to the list 'pdf_docs' else skip
            else:
                os.remove(file_path)
                print "PAGE NO",pageno,"with city =", city, "DONT EXIST"
                # pdf_docs.append(file_path)
    final_file_path = dir_path + "/" + todaysDate+".pdf"
    pdf_merger.FileMerger(pdf_docs, final_file_path)
    subject = "epaper dated "+ todaysDate
    # file_path = dir_path + "/" + final_file_name
    ###for qpython -- files download in this directory
    # cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # file_path = cd_dir_path + "/" + final_file_name
    try:
        print "SENDING EMAIL..............."
        send_email.send_mail(configg.fromaddr,configg.password,configg.toaddr,subject,todaysDate+".pdf",final_file_path)
        pdf_docs.append(final_file_path)
        Delete_Files.del_files(pdf_docs)
        ##updating cron Flag file when the job is done for the day
        with open('/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt','w') as outFile:
            outFile.write( Cur_date.strfTime())
    except Exception as e:
        # On failure, still clean up the per-page PDFs.
        Delete_Files.del_files(pdf_docs)
        print "COULDNOT SEND MAIL...."
        print e
gdf = get_us_county_shapefiles() # Summary files manually selected and downloaded from American Fact Finder SF2010_DOWNLOADED_PATH = CENSUS_DATA_PATH / 'SF2010_zips' SF2010_DOWNLOADED_FILENAME = 'DEC_10_SF1_P12_with_ann.csv' SF2010_OUTPATH = CENSUS_DATA_PATH / 'SF2010' # One iteration for each state state_abbrs = ['NH', 'NJ', 'CT', 'RI', 'ME', 'VT'] for state_abbr in state_abbrs: state_fip = us.states.lookup(state_abbr).fips # Download shapefile for this state, block level: shape_link = 'https://www2.census.gov/geo/tiger/TIGER2018/TABBLOCK/'\ + f'tl_2018_{state_fip}_tabblock10.zip' print(shape_link) zipped = download.download_file(shape_link, state_abbr) with zipfile.ZipFile(zipped) as z: z.extractall(SHAPEFILE_OUTPATH) # 2 diff name format depends on which state so just get both sfpaths = [SF2010_DOWNLOADED_PATH.glob(f'{state_abbr}_download_?')] sfpaths += [SF2010_DOWNLOADED_PATH.glob(f'SF2010_{state_abbr}_?')] dflist = [] for i, sfp in enumerate(sfpaths): if i > 0: dflist.append( pd.read_csv(sfp / SF2010_DOWNLOADED_FILENAME, dtype=str, skiprows=[1])) else:
def import_posts(key, url='https://api.fanbox.cc/post.listSupporting?limit=50'):
    """Import supported Fanbox posts into the booru_posts table.

    key -- the FANBOXSESSID session cookie used for API requests.
    url -- the API page to fetch; the function recurses on the body's
           nextUrl to walk every page.

    Restricted posts, posts from banned users (dnp table) and posts
    already present in booru_posts are skipped.  The first embedded
    file becomes the post's `file`; the rest become `attachments`.
    """
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)
    scraper_data = requests.get(url,
                                cookies={'FANBOXSESSID': key},
                                headers={'origin': 'https://fanbox.cc'},
                                proxies=get_proxy()).json()
    if scraper_data.get('body'):
        for post in scraper_data['body']['items']:
            parsed_post = FanboxPost(post['id'], None, post)
            if parsed_post.is_restricted:
                continue
            try:
                file_directory = f"files/fanbox/{post['user']['userId']}/{post['id']}"
                attachments_directory = f"attachments/fanbox/{post['user']['userId']}/{post['id']}"
                # Skip users on the do-not-post list.
                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'fanbox'",
                    (post['user']['userId'], ))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue
                check_for_flags('fanbox', post['user']['userId'], post['id'])
                # Skip posts that were imported previously.
                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'fanbox'",
                    (post['id'], ))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue
                post_model = {
                    'id': post['id'],
                    '"user"': post['user']['userId'],
                    'service': 'fanbox',
                    'title': post['title'],
                    'content': parsed_post.body_text,
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': post['publishedDatetime'],
                    'edited': post['updatedDatetime'],
                    'file': {},
                    'attachments': []
                }
                for i in range(len(parsed_post.embeddedFiles)):
                    if i == 0:
                        # First embedded file becomes the primary file.
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['file']['name'] = filename
                        # NOTE(review): the literal "(unknown)" below looks
                        # like a mangled placeholder (probably the filename)
                        # — confirm against the project's other importers.
                        post_model['file'][
                            'path'] = f'/{file_directory}/(unknown)'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['attachments'].append({
                            'name': filename,
                            'path': f'/{attachments_directory}/(unknown)'
                        })
                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(
                        post_model['attachments'][i])
                # Build the INSERT; attachments is a jsonb[] column.
                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns), values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
            except DownloaderException:
                continue
    conn.close()
    if scraper_data['body'].get('nextUrl'):
        import_posts(key, scraper_data['body']['nextUrl'])
def master_list_online(update=False):
    '''
    Get the dataframe of all known hospitals, building it from Joint
    Commission certification if it doesn't exist, and optionally updating it
    to capture additions to the JC list.
    '''
    if MASTER_LIST.exists():
        existing = load_hospitals(MASTER_LIST)
    else:
        # Empty frame carrying the full master-list schema.
        columns = [
            'CenterID', 'CenterType', 'OrganizationName', 'City', 'State',
            'PostalCode', 'Name', 'Address', 'Latitude', 'Longitude',
            'Failed_Lookup', 'destination', 'destinationID', 'transfer_time',
            'DTN_1st', 'DTN_Median', 'DTN_3rd', 'DTP_1st', 'DTP_Median',
            'DTP_3rd'
        ]
        existing = pd.DataFrame(columns=columns).set_index('CenterID')
        existing.Failed_Lookup = existing.Failed_Lookup.astype(bool)
    if update or existing.empty:
        # Pull the current Joint Commission spreadsheet.
        jc_file = download.download_file(JC_URL, 'Joint Commission')
        jc_data = pd.read_excel(jc_file)
        program_map = {
            'Advanced Comprehensive Stroke Center ': 'Comprehensive',
            'Advanced Primary Stroke Center ': 'Primary',
            # Treatment of TSCs is undecided; taking conservative approach
            'Advanced Thrombectomy Capable Stroke Ctr': 'Primary',
        }
        jc_data['CenterType'] = jc_data.CertificationProgram.map(program_map)
        jc_data = jc_data.dropna()
        # For multiple certifications, keep the comprehensive line
        # NOTE - This ignores effective dates under the assumption that all
        #        listed certifications are active
        jc_data = jc_data.sort_values('CenterType')
        jc_data = jc_data.drop_duplicates(
            subset=['OrganizationId', 'City', 'State', 'PostalCode'])
        update_index = ['OrganizationName', 'City', 'State', 'PostalCode']
        jc_data = jc_data.set_index(update_index, verify_integrity=True)
        existing = existing.reset_index().set_index(update_index)
        # Append rows that are new since the last refresh, then overlay
        # refreshed values onto rows we already had.
        new = jc_data[~jc_data.index.isin(existing.index)]
        new['Failed_Lookup'] = False
        out = pd.concat([existing, new], sort=False)
        out.update(jc_data)
        out = out.reset_index()
        # Assign fresh CenterIDs to rows that don't have one yet.
        next_ID = out.CenterID.max() + 1
        if pd.isnull(next_ID):
            next_ID = 1
        for i in out.index:
            if pd.isnull(out.CenterID[i]):
                out.loc[i, 'CenterID'] = next_ID
                next_ID += 1
        out.CenterID = out.CenterID.astype(int)
        out = out.set_index('CenterID', verify_integrity=True)
        _save_master_list(out, savedir=MASTER_LIST)
    else:
        out = existing
    return out
logging.debug("[#%d] %s - %d (%.02fM)", i + 1, audio.url, remote_sz, remote_sz / utils.MB) if sz != remote_sz: if logging.getLogger().isEnabledFor(logging.INFO): progress_bar = \ AudioDownloadProgressBar(i, total, m_th, audio.short_name) else: progress_bar = None logging.log(logging.WARNING if m_th or not progress_bar else logging.DEBUG, "[#%d/%d] \"%s\" (%s, %s bytes)", i + 1, total, filepath, audio.id_name, remote_sz - sz) download_file(tmp_name, audio.url, progress_bar) try: os.rename(tmp_name, filepath) except OSError as e: print tmp_name, " AND ", filepath print type(tmp_name), " AND ", type(filepath) raise RuntimeError("Can't rename: %s -> %s" % (tmp_name, filepath)) relpath = os.path.relpath(os.path.abspath(path), os.path.abspath(lnks_path)) new_path = utils.norm_path(relpath, audio.name) lnk_fmt = "%%0%dd=%%s" % len(str(total)) lnk = lnk_fmt % (i + 1, audio.name)
def import_posts(key):
    """Import SubscribeStar feed posts into the booru_posts table.

    key -- the auth_token cookie for subscribestar.adult.  The feed is
    scraped with gallery-dl's DataJob; Directory messages describe
    posts and Url messages carry their files.  Banned authors and
    already-imported posts are skipped.
    """
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)
    # Configure gallery-dl: no local output, session cookie, proxy.
    dlconfig.set(('output'), "mode", "null")
    dlconfig.set(('extractor', 'subscribestar'), "cookies",
                 {"auth_token": key})
    dlconfig.set(('extractor', 'subscribestar'), "proxy", get_proxy())
    j = job.DataJob("https://subscribestar.adult/feed")
    j.run()
    for message in j.data:
        try:
            if message[0] == Message.Directory:
                post = message[-1]
                file_directory = f"files/subscribestar/{post['author_name']}/{post['post_id']}"
                attachments_directory = f"attachments/subscribestar/{post['author_name']}/{post['post_id']}"
                # Skip authors on the do-not-post list.
                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'subscribestar'",
                    (post['author_name'], ))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue
                check_for_flags('subscribestar', post['author_name'],
                                str(post['post_id']))
                # Skip posts that were imported previously.
                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'subscribestar'",
                    (str(post['post_id']), ))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue
                stripped_content = strip_tags(post['content'])
                post_model = {
                    'id': str(post['post_id']),
                    '"user"': post['author_name'],
                    'service': 'subscribestar',
                    # Title = first 60 chars of the tag-stripped content.
                    'title': (stripped_content[:60] + '..') if
                    len(stripped_content) > 60 else stripped_content,
                    'content': post['content'],
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': post['date'],
                    'edited': None,
                    'file': {},
                    'attachments': []
                }
                # Url messages for this post carry its files; the first
                # becomes `file`, the rest go to `attachments`.
                for attachment in list(
                        filter(
                            lambda msg: post['post_id'] == msg[-1]['post_id']
                            and msg[0] == Message.Url, j.data)):
                    if (len(post_model['file'].keys()) == 0):
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            attachment[-1]['url'],
                            name=attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'])
                        post_model['file']['name'] = attachment[-1][
                            'filename'] + '.' + attachment[-1]['extension']
                        # NOTE(review): "(unknown)" looks like a mangled
                        # filename placeholder — confirm upstream.
                        post_model['file'][
                            'path'] = f'/{file_directory}/(unknown)'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            attachment[-1]['url'],
                            name=attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'])
                        post_model['attachments'].append({
                            'name':
                            attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'],
                            'path':
                            f'/{attachments_directory}/(unknown)'
                        })
                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(
                        post_model['attachments'][i])
                # attachments is a jsonb[] column.
                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns), values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
        except DownloaderException:
            continue
    conn.close()
def get_spacex():
    """Download every Flickr image from the latest SpaceX launch.

    Files are saved in the CWD as Space-<n>.jpeg, numbered from 1.
    """
    url = 'https://api.spacexdata.com/v3/launches/latest'
    response = requests.get(url)
    # `index` rather than `id`: the original shadowed the builtin.
    for index, images_link in enumerate(
            response.json()['links']['flickr_images'], 1):
        download_file(images_link, "Space-{}.jpeg".format(index))
def hack_paper(): status = checkCronJob.checkCronStatus() # print status if (status == 0): print "JOB's already done" return todaysDate = Cur_date.getCurDate() pdf_docs = [] pages = get_pages.getPages() + 1 # pages = 2 dir_path = os.path.dirname(os.path.realpath(__file__)) for pageno in xrange(1, pages): for city in ['smt', 'mdb', 'bgh']: url = "http://epaper.jagran.com/epaperimages/" + todaysDate + "/muzaffarpur/" + str( Cur_date.getPrevDayDate()) + city + "-pg" + str( pageno) + "-0.pdf" # url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf" print url ##sending file path ## file path also contains the file name of the downloaded file file_path = dir_path + "/" + str(pageno) + ".pdf" print "Downloading...page no = ", pageno ## this fn ret value that ensures if we have got valid pdf, if not loop continues flag = download.download_file(url, file_path) # flag = pdf_merger.check_valid_pdf(file_path) if (flag == 0): pdf_docs.append(file_path) break #As soon as it gets a valid pdf add to the list 'pdf_docs' else skip else: # os.remove(file_path) print "PAGE NO", pageno, "with city =", city, "DONT EXIST" continue # print pdf_docs final_file_path = dir_path + "/" + todaysDate + ".pdf" pdf_merger.FileMerger(pdf_docs, final_file_path) ## if the compression dont compress to the required size than the last option is to eliminate some files; checkSizeFlag = checkFileSize.check(final_file_path) k = 1 while checkSizeFlag: os.remove( final_file_path ) # have to remove becoz pdf_merger only merges if th ethe file dont exist pdf_merger.FileMerger(pdf_docs[:-k], final_file_path) checkSizeFlag = checkFileSize.check(final_file_path) print "++++++++++ Removed last %s" % (k), 'file +++++++++++++' k = k + 1 ##======================================================================================================= ##Hindi text akhbaar = u'\u0905' + u'\u0916' + u'\u093c' + u'\u092c' + u'\u093e' + u'\u0930' dinakit = u'\u0926' + 
u'\u093f' + u'\u0928' + u'\u093e' + u'\u0902' + u'\u0915' + u'\u093f' + u'\u0924' # print akhbaar +' ' +dinakit subject = akhbaar + ' ' + dinakit + ' ' + todaysDate # file_path = dir_path + "/" + final_file_name ###for qpython -- files download in this directory # cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # file_path = cd_dir_path + "/" + final_file_name try: ## done in case when the more than one scripts run simultaneously. ##this can happen when network is slow or the script 1 is taking long enough time to execute print "Checking if mail is already sent ..... " status = checkCronJob.checkCronStatus() print status if (status == 0): print "Mail has been sent already..." return print "SENDING EMAIL..............." send_email.send_mail(configg.fromaddr, configg.password, configg.toaddr, subject, todaysDate + ".pdf", final_file_path) pdf_docs.append(final_file_path) Delete_Files.del_files(pdf_docs) ##updating cron Flag file when the job is done for the day with open( '/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt', 'w') as outFile: outFile.write(Cur_date.strfTime()) except Exception as e: Delete_Files.del_files(pdf_docs) print "COULDNOT SEND MAIL...." print e
# For each download entry: verify an existing file against its md5
# (writing a .md5 marker on success), or download it when missing.
for download in downloads(attrs={'class': 'download'}):
    # NOTE(review): reads the class from `downloads` (the container),
    # not the current `download` item — looks like a bug; confirm.
    classname = downloads.attrs['class'][1]
    url = download(attrs={'class': 'a'})[0]
    weburl = url['data-web']
    filename = weburl.split("?key=")[0]
    # Only handle files hosted on the hwcdn CDN.
    if filename[0:25] == 'https://hb1.ssl.hwcdn.net':
        md5 = download(attrs={'class': 'dlmd5'})[0]['href'][1:]
        filename = filename.split("/")[-1]
        path = "%s/All/%s/%s/%s" % (outfolder, title, systems[classname], filename)
        if not os.path.isdir("%s/All/%s" % (outfolder, title)):
            os.mkdir("%s/All/%s" % (outfolder, title))
        if not os.path.isdir("%s/All/%s/%s" % (outfolder, title, systems[classname])):
            os.mkdir("%s/All/%s/%s" % (outfolder, title, systems[classname]))
        if os.path.exists(path) and not os.path.exists(
                "%s.md5" % path):
            # File present but unverified: hash it and compare.
            print("Generating md5: %s" % filename)
            md5file = md5sum(path)
            if md5file == md5:
                print("OK")
                open("%s.md5" % path, "w").write(md5file)
            else:
                # Corrupt download: remove it and abort the whole run.
                print("MISMATCH!")
                os.unlink(path)
                exit(1)
        elif not os.path.exists(path):
            print("Downloading: %s" % filename)
            download_file(weburl, path, md5)
def import_posts(key, url=initial_api):
    """Import Patreon posts into the booru_posts table.

    key -- the session_id cookie; url -- the API page to fetch (the
    function recurses on links.next for pagination).  Banned creators
    and already-imported posts are skipped.  Inline images, the post
    file, attachments, image media and audio media are all downloaded.
    """
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)
    # Patreon sits behind Cloudflare, hence cloudscraper.
    scraper = cloudscraper.create_scraper()
    scraper_data = scraper.get(url,
                               cookies={'session_id': key},
                               proxies=get_proxy()).json()
    for post in scraper_data['data']:
        try:
            file_directory = f"files/{post['relationships']['user']['data']['id']}/{post['id']}"
            attachments_directory = f"attachments/{post['relationships']['user']['data']['id']}/{post['id']}"
            # Skip creators on the do-not-post list.
            cursor1 = conn.cursor()
            cursor1.execute(
                "SELECT * FROM dnp WHERE id = %s AND service = 'patreon'",
                (post['relationships']['user']['data']['id'], ))
            bans = cursor1.fetchall()
            if len(bans) > 0:
                continue
            check_for_flags('patreon',
                            post['relationships']['user']['data']['id'],
                            post['id'])
            # Skip posts that were imported previously.
            cursor2 = conn.cursor()
            cursor2.execute(
                "SELECT * FROM booru_posts WHERE id = %s AND service = 'patreon'",
                (post['id'], ))
            existing_posts = cursor2.fetchall()
            if len(existing_posts) > 0:
                continue
            post_model = {
                'id': post['id'],
                '"user"': post['relationships']['user']['data']['id'],
                'service': 'patreon',
                'title': post['attributes']['title'],
                'content': '',
                'embed': {},
                'shared_file': False,
                'added': datetime.datetime.now(),
                'published': post['attributes']['published_at'],
                'edited': post['attributes']['edited_at'],
                'file': {},
                'attachments': []
            }
            if post['attributes']['content']:
                post_model['content'] = post['attributes']['content']
                # Mirror inline <img> tags locally and rewrite their URLs.
                for image in text.extract_iter(post['attributes']['content'],
                                               '<img data-media-id="', '>'):
                    download_url = text.extract(image, 'src="', '"')[0]
                    path = urlparse(download_url).path
                    ext = splitext(path)[1]
                    fn = str(uuid.uuid4()) + ext
                    filename, _ = download_file(join(config.download_path,
                                                     'inline'),
                                                download_url,
                                                name=fn)
                    # NOTE(review): "(unknown)" looks like a mangled
                    # filename placeholder — confirm upstream.
                    post_model['content'] = post_model['content'].replace(
                        download_url, f"/inline/(unknown)")
            if post['attributes']['embed']:
                post_model['embed']['subject'] = post['attributes']['embed'][
                    'subject']
                post_model['embed']['description'] = post['attributes'][
                    'embed']['description']
                post_model['embed']['url'] = post['attributes']['embed']['url']
            if post['attributes']['post_file']:
                filename, _ = download_file(
                    join(config.download_path, file_directory),
                    post['attributes']['post_file']['url'],
                    name=post['attributes']['post_file']['name'])
                post_model['file']['name'] = post['attributes']['post_file'][
                    'name']
                post_model['file']['path'] = f'/{file_directory}/(unknown)'
            # Attachments are fetched via Patreon's file endpoint, which
            # needs the session cookie.
            for attachment in post['relationships']['attachments']['data']:
                filename, _ = download_file(
                    join(config.download_path, attachments_directory),
                    f"https://www.patreon.com/file?h={post['id']}&i={attachment['id']}",
                    cookies={'session_id': key})
                post_model['attachments'].append({
                    'name': filename,
                    'path': f'/{attachments_directory}/(unknown)'
                })
            if post['relationships']['images']['data']:
                for image in post['relationships']['images']['data']:
                    # Media records live in the side-loaded `included` list.
                    for media in list(
                            filter(
                                lambda included: included['id'] == image['id'],
                                scraper_data['included'])):
                        if media['attributes']['state'] != 'ready':
                            continue
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            media['attributes']['download_url'],
                            name=media['attributes']['file_name'])
                        post_model['attachments'].append({
                            'name': filename,
                            'path': f'/{attachments_directory}/(unknown)'
                        })
            if post['relationships']['audio']['data']:
                for audio in post['relationships']['audio']['data']:
                    for media in list(
                            filter(
                                lambda included: included['id'] == audio['id'],
                                scraper_data['included'])):
                        if media['attributes']['state'] != 'ready':
                            continue
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            media['attributes']['download_url'],
                            name=media['attributes']['file_name'])
                        post_model['attachments'].append({
                            'name': filename,
                            'path': f'/{attachments_directory}/(unknown)'
                        })
            post_model['embed'] = json.dumps(post_model['embed'])
            post_model['file'] = json.dumps(post_model['file'])
            for i in range(len(post_model['attachments'])):
                post_model['attachments'][i] = json.dumps(
                    post_model['attachments'][i])
            # attachments is a jsonb[] column.
            columns = post_model.keys()
            data = ['%s'] * len(post_model.values())
            data[-1] = '%s::jsonb[]'  # attachments
            query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                fields=','.join(columns), values=','.join(data))
            cursor3 = conn.cursor()
            cursor3.execute(query, list(post_model.values()))
            conn.commit()
        except DownloaderException:
            continue
    conn.close()
    if scraper_data['links'].get('next'):
        import_posts(key, 'https://' + scraper_data['links']['next'])