Example #1
def download_from_urls(data):
    filenames = []
    for obj in data["StandardData"]:
        fname = obj["urls"][0].split('/')[-1]
        download.download_file(obj["urls"][0], "./data/download/"+fname)
        filenames.append(fname)
    return filenames
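A common variation is to make sure the target directory exists before the loop. A minimal sketch under the same assumptions as above (download.download_file(url, dest) writes dest; the dest_dir default is illustrative, not part of the original code):

import os
import download

def download_from_urls(data, dest_dir="./data/download"):
    os.makedirs(dest_dir, exist_ok=True)  # create the target directory if needed
    filenames = []
    for obj in data["StandardData"]:
        url = obj["urls"][0]
        fname = url.split('/')[-1]
        download.download_file(url, os.path.join(dest_dir, fname))
        filenames.append(fname)
    return filenames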
Example #2
def down():
    #file = 'https://s3.amazonaws.com/google-landmark/metadata/train.csv'
    file = 'https://s3.amazonaws.com/google-landmark/metadata/train_attribution.csv'
    download_file(file, './train_attribution.csv')
Example #3
def Run():
    # if len(sys.argv) != 3:
    #   print('Syntax: %s <data_file.csv> <output_dir/>' % sys.argv[0])
    #   sys.exit(0)
    # (data_file, out_dir) = sys.argv[1:]
    data_file = './test_csv/test.csv'
    out_dir = './test_images'

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    key_url_list = ParseData(data_file)
    for key, url in tqdm.tqdm(key_url_list):
        filename = os.path.join(out_dir, '%s.jpg' % key)

        if os.path.exists(filename):
            print('Image %s already exists. Skipping download.' % filename)
            continue

        for ti in range(5):
            try:
                download_file(url, filename)
                break
            except Exception:
                if os.path.exists(filename):
                    os.remove(filename)
                time.sleep(10)
        time.sleep(3)
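The retry loop above can be factored into a small reusable helper. A minimal sketch, assuming download_file(url, filename) raises an exception on failure; the helper name and its defaults are illustrative, not part of the original code:

import os
import time

def download_with_retries(url, filename, attempts=5, delay=10):
    """Try download_file up to `attempts` times, removing partial files on failure."""
    for _ in range(attempts):
        try:
            download_file(url, filename)
            return True
        except Exception:
            # drop any partially written file before retrying
            if os.path.exists(filename):
                os.remove(filename)
            time.sleep(delay)
    return False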
Example #4
def download_all(url, path, prefix, ext=".mp3"):
    """Pobiera wszystkie podcasty

    Argumenty:
     url - adres do rss
     path - ścieżka do zapisu plików
     prefix - początek nazwy plku
     ext - rozszerzenie pliku (chyba zawsze .mp3)
    Następnie modyfikuje tagi mp3:
     - zmienia tag album na miesiąc i rok
     - zmiena tag title na datę podcastu
    Przy pobieraniu plików korzysta z modułu download, który
    wyświetla także pasek postępu.
    """
    try:
        usock = urllib2.urlopen(url)
    except IOError:
        print("Nieprawidłowy adres lub błąd połączenia")
        sys.exit(1)
    parser = TokFmPodcastsParser(usock)
    data = parser.get_all()
    for d in data:
        filename = prefix + "-" + "_".join(d["date"]) + ext
        file_path = os.path.join(path, filename)
        if os.path.isfile(file_path):
            print("{0} istnieje, pomijam go".format(filename))
        else:
            print("{0}".format(filename))
            download.download_file(d["url"], file_path)
            # set the album tag to the month and year and the
            # title to the podcast date
            title = ".".join(d["date"])
            album = ".".join(d["date"][1:])
            edit_id3(file_path, title, album)
Example #5
def download_from_urls(data):
    filenames = []
    for obj in data["StandardData"]:
        fname = obj["urls"][0].split('/')[-1]
        download.download_file(obj["urls"][0], "./data/download/" + fname)
        filenames.append(fname)
    return filenames
Example #6
def get_file(self):
    if self.temp_file is None:
        fd, self.temp_file = tempfile.mkstemp('_e621dl.' + self.file_ext,
                                              str(self.id))
        with open(self.temp_file, 'w') as temp:
            download.download_file(self.file_url, temp)
        os.close(fd)
    return self.temp_file
Example #7
def get_preview(self):
    if self.temp_preview is None:
        extension = self.preview_url.rsplit('.', 1)[-1]
        fd, self.temp_preview = tempfile.mkstemp('_e621dl.' + extension,
                                                 str(self.id))
        with open(self.temp_preview, 'w') as temp:
            download.download_file(self.preview_url, temp)
        os.close(fd)
    return self.temp_preview
Example #8
def get_hubble(image_id, ext='jpeg'):
    status_ready = None
    response = requests.get(
        'http://hubblesite.org/api/v3/image/{}'.format(image_id))
    data = response.json()['image_files']
    image_name = "Hubble-{}.{}".format(image_id, ext)
    loading_list = [
        image['file_url'] for image in data if image['file_url'].endswith(ext)
    ]
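    # Note: every matching URL is written to the same image_name, so only the
    # last downloaded file is kept on disk.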
    for picture_url in loading_list:
        download_file(picture_url, image_name)
Example #9
def idempotent_download(self, path) -> bool:
    name = self.file_type + ('.zip' if self.zipped else '')
    file_path = path / name
    if not file_path.exists():
        log.info('Downloading new file `%s` from URL `%s`', file_path,
                 self.to_url())
        try:
            download_file(self.to_url(), file_path)
            return True
        except Exception:
            log.warning('Failed to download file `%s` from URL `%s`',
                        file_path, self.to_url(), exc_info=True)
            return False
    else:
        log.info('Skipping download of file `%s`', file_path)
        return True
Example #10
def _download_data(state):
    '''
    Download census data for the given state and unzip the file into the
        expected directory.
    '''
    zipped = download.download_file(_ftp_path(state), state.name)

    with zipfile.ZipFile(zipped) as z:
        z.extractall(_get_path(state))
Example #11
def _retrieve_uniprot_file(uniprot_acc, download_dir):
    """
    Download a Uniprot file by accession.
    """
    src = db_fetch_url % uniprot_acc
    dst = os.path.join(download_dir, '%s.txt' % uniprot_acc)
    dst = download_file(src, dst)
    return dst
Example #12
def get_us_county_shapefiles():
    filepath = CENSUS_DATA_PATH / 'tl_2018_us_county' / 'tl_2018_us_county.shp'
    if not filepath.exists():
        # download from internet
        shape_link = ('https://www2.census.gov/geo/tiger/TIGER2018/COUNTY/'
                      'tl_2018_us_county.zip')
        zipped = download.download_file(shape_link, "US Counties")
        with zipfile.ZipFile(zipped) as z:
            z.extractall(filepath.parent)
    gdf = gpd.read_file(filepath)
    return gdf
Example #13
def download_current(url, path, prefix, ext=".mp3"):
    """Pobiera aktualny podcast"""
    try:
        usock = urllib2.urlopen(url)
    except IOError:
        print("Nieprawidłowy adres lub błąd połączenia")
        sys.exit(1)
    parser = TokFmPodcastsParser(usock)
    data = parser.get_current()
    filename = prefix + "-" + "_".join(data["date"]) + ext
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        print("{0} istnieje, pomijam go".format(filename))
    else:
        print("{0}".format(filename))
        download.download_file(data["url"], file_path)
        # set the album tag to the month and year and the
        # title to the podcast date
        title = ".".join(data["date"])
        album = ".".join(data["date"][1:])
        edit_id3(file_path, title, album)
Example #14
def _batch_retrieve_uniprot_files(uniprot_accessions, download_dir,
                                  dataset_prefix, initial_batch_size=120,
                                  sleep_interval=15):
    """ Download Uniprot files in batches """

    N = len(uniprot_accessions)
    batch_size = initial_batch_size
    i = 0   # downloaded records counter
    j = 0   # number of downloaded files

    # expected number of total files to be downloaded
    M = N // batch_size + (1 if N % batch_size else 0)

    entries = []
    while i < N:
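        # Retry loop: if a batch fails to download, the batch size is halved and
        # the same offset is retried; after a successful batch it is reset below.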
        while True:
            repeat_download = False
            ids = ','.join(uniprot_accessions[i: i + batch_size])
            src = db_fetch_url % ids
            fn = '%s_uniprot_batch_%s.txt' % (dataset_prefix, _get_ids_hash(ids))
            dst = os.path.join(download_dir, fn)

            if not os.path.exists(dst):
                print "    Downloading file: %s (%d/%d)" % (fn, j+1, M)
                try:
                    dst = download_file(src, dst)
                    time.sleep(sleep_interval)  # wait a little
                except Exception:
                    print "      FAILURE"
                    if batch_size > 1:
                        batch_size = batch_size // 2
                        print "      Halving batch size to %d." % batch_size
                        repeat_download = True
                    else:
                        print "      Skipping %s." % ids

            if not repeat_download:
                entries += _process_uniprot_file(dst)
                i += batch_size
                batch_size = initial_batch_size
                r = 1 if (N-i) % batch_size else 0
                j += 1
                M = j + (N-i) // batch_size + r
                break

    acc_map = {}
    for ac, gn, orgn, taxid, pe, geneids in entries:
        for acc in ac:
            if acc in uniprot_accessions:
                acc_map[acc] = (gn, orgn, taxid, pe, geneids)
    return acc_map
Example #15
def download_and_extract_biogrid(src, download_dir):
    """
    Download and extract a BioGRID dataset (zipped).
    """
    pathname = urllib.url2pathname(src)
    biogrid_zipfile = os.path.basename(urlparse.urlsplit(pathname)[2])
    zipfile_path = os.path.join(download_dir, biogrid_zipfile)

    if not os.path.exists(zipfile_path):
        print "Downloading latest BioGRID."
        download_file(src, zipfile_path)
    zf = zipfile.ZipFile(zipfile_path, 'r', allowZip64=True)
    biogrid_file = _get_archived_filename(zf)
    extracted_path = os.path.join(download_dir, biogrid_file)

    if not os.path.exists(extracted_path):
        print "Extracting latest BioGRID."
        out_fp = open(extracted_path, 'w')
        out_fp.write(zf.read(biogrid_file))
        out_fp.close()

    zf.close()
    return extracted_path
Example #16
def JC_StrokeCetification(path=RAW_DATA / 'StrokeCertificationList.xlsx'):
    JC_URL = "https://www.qualitycheck.org/file.aspx?FolderName=" + "StrokeCertification&c=1"
    if not path.exists():
        download.download_file(JC_URL, 'Joint Commission', savedir=path)
    return pd.read_excel(path, dtype=str)
Example #17
    def hh_method(self, num):
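        # `num` selects the action (inferred from the status messages below):
        #   1 / 2 / 3    - simplified localization / traditional localization / font only,
        #                  fetched via the GitHub or official routes
        #   11 / 12 / 13 - the same actions via the Gitee split-file mirror
        #   4            - restore the original English language pack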
        todir = self.save_path_entry.get()
        font_dir = self.check_bdo_dir()
        tmpdirname = str(gettempdir())

        ads_dir = todir + '\\ads'
        temp_loc_dir = tmpdirname + r'\split_loc'
        temp_font_dir = tmpdirname + r'\split_fonts'
        temp_bdocn_dir = tmpdirname + r'\bdocn_temp'

        tw_loc = 'http://dn.blackdesert.com.tw/UploadData/ads/languagedata_tw.loc'
        github_loc = 'https://github.com/BDO-CnHope/bdocn/raw/master/ads/languagedata_en.loc'
        github_font = 'https://github.com/BDO-CnHope/bdocn/raw/master/prestringtable/font/pearl.ttf'
        gitee_loc = 'https://gitee.com/bdo-cnhope/bdocn/tree/master/split/'
        gitee_font = 'https://gitee.com/bdo-cnhope/bdocn/tree/master/split_font/'
        en_loc_zip = download.download_en_loc()

        try:
            # create the temporary working directories if they do not exist yet
            if not exists(temp_loc_dir):
                mkdir(temp_loc_dir)
            if not exists(temp_font_dir):
                mkdir(temp_font_dir)
            if not exists(temp_bdocn_dir):
                mkdir(temp_bdocn_dir)
        except OSError:
            self.insert_text('操作错误,请重试...code: 1 \n')
        else:
            try:
                if num == 1:
                    if check_new.get_loc_hash(1) != self.check_loc_hash():
                        self.insert_text('正在使用国外线路下载简体汉化语言包…… \n')
                        download.download_file(github_loc, ads_dir,
                                               'languagedata_en.loc')
                        self.insert_text('简体汉化包已更新! \n')
                    else:
                        self.insert_text('简体汉化包已是最新的了! \n')
                    if check_new.get_font_hash(1) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_file(github_font, font_dir,
                                               'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 2:
                    self.insert_text('正在下载繁体汉化语言包…… \n')
                    download.download_file(tw_loc, ads_dir,
                                           'languagedata_en.loc')
                    self.insert_text('繁体汉化包已更新! \n')
                    if check_new.get_font_hash(1) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_file(github_font, font_dir,
                                               'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 3:
                    if check_new.get_font_hash(1) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_file(github_font, font_dir,
                                               'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 11:
                    if check_new.get_loc_hash(2) != self.check_loc_hash():
                        self.insert_text('正在使用国内线路下载简体汉化语言包…… \n')
                        download.download_split_files(gitee_loc, temp_loc_dir)
                        joinfiles.join_files(temp_loc_dir, ads_dir,
                                             'languagedata_en.loc')
                        self.insert_text('简体汉化包已更新! \n')
                    else:
                        self.insert_text('简体汉化包已是最新的了! \n')
                    if check_new.get_font_hash(2) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_split_files(gitee_font,
                                                      temp_font_dir)
                        joinfiles.join_files(temp_font_dir, font_dir,
                                             'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 12:
                    self.insert_text('正在下载繁体汉化语言包…… \n')
                    download.download_file(tw_loc, ads_dir,
                                           'languagedata_en.loc')
                    self.insert_text('繁体汉化包已更新! \n')
                    if check_new.get_font_hash(2) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_split_files(gitee_font,
                                                      temp_font_dir)
                        joinfiles.join_files(temp_font_dir, font_dir,
                                             'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 13:
                    if check_new.get_font_hash(2) != self.check_font_hash():
                        self.insert_text('正在下载字体包…… \n')
                        download.download_split_files(gitee_font,
                                                      temp_font_dir)
                        joinfiles.join_files(temp_font_dir, font_dir,
                                             'pearl.ttf')
                        self.insert_text('字体包已更新! \n')
                    else:
                        self.insert_text('字体包已是最新的了! \n')
                    showinfo('提示', '汉化已完成!')
                elif num == 4:
                    self.insert_text('正在重新安装美服英语包…… \n')
                    unzip_dir = temp_bdocn_dir + '\\loc'
                    download.download_file(en_loc_zip, temp_bdocn_dir,
                                           'BDOLanguage.zip')
                    unzip.un_zip(temp_bdocn_dir, 'BDOLanguage.zip', unzip_dir)
                    copy(unzip_dir + '\\' + 'languagedata_en.loc', ads_dir)
                    self.insert_text('已恢复为美服英语! \n')
                    showinfo('提示', '任务已完成!')
            except Exception:
                self.insert_text('操作错误,请重试...code: 2 \n')
                if exists(temp_loc_dir):
                    rmtree(temp_loc_dir)
                if exists(temp_font_dir):
                    rmtree(temp_font_dir)
                if exists(temp_bdocn_dir):
                    rmtree(temp_bdocn_dir)
Example #18
def sub2save(name, dest, sm):
    return dest, sm[0], sm[1], name, download_file(sm[1])
Example #19
try:
    DATABASE_NAME = os.path.join(__location__, 'data.sqlite')
    conn = sqlite3.connect(DATABASE_NAME)

    # city of zurich - start url
    start_url = 'https://www.stadt-zuerich.ch/ssd/de/index/volksschule/schulferien.html'

    # page for each year
    content = dl.download_content(start_url)
    soup = BeautifulSoup(content, 'html.parser')
    nav = soup.find('li', {'class': 'var_wrapping_node var_active'})
    pages = nav.find_all('a', string=re.compile(r'^\d{4}/\d{2}$'))

    for page in pages:
        year_href = page.get('href')
        year_url = urljoin(start_url, year_href)
        download_url = get_ics_download_url(year_url)
        filename = os.path.basename(download_url)
        file_path = os.path.join(__location__, filename)
        dl.download_file(download_url, file_path)
        print(f"Download URL: {download_url}")
        events = parse_ics.parse_file(file_path)
        insert_or_update(events, conn)

    conn.commit()
except Exception as e:
    print("Error: %s" % e)
    print(traceback.format_exc())
    raise
finally:
    conn.close()
Example #20
import pymysql.cursors, time
import download,os

db = pymysql.connect("localhost", 'root', 'yuwenque', 'keep', charset='utf8mb4')
cursor = db.cursor(cursor=pymysql.cursors.DictCursor)

sql ='''
select avatar,gender,userid from keep_user_info where gender ='F' and birthday like '199%' or birthday like '200%' or birthday like '201%' limit 5001,10000 
'''
path = '/Users/yuwenque/Downloads/keepuser/'


cursor.execute(sql)
rows = cursor.fetchall()
for i in range(0, len(rows)):
    item = rows[i]
    photo_name = path + item['gender'] + "/" + item['userid'] + ".jpg"
    if i % 10 == 0 and i != 0:
        time.sleep(10)
    try:
        if not os.path.exists(photo_name):
            try:
                download.download_file(item['avatar'], photo_name)
            except Exception as e2:
                os.remove(photo_name)
                print(e2)

            time.sleep(3)
    except Exception as e:
        print(e)
Example #21
def hack_paper():

	status = checkCronJob.checkCronStatus()
	print status
	if(status == 0):
		print "JOB's already done"
		return


	todaysDate = Cur_date.getCurDate()

	pdf_docs = []
	pages = get_pages.getPages()+1

	# pages = 2

	dir_path = os.path.dirname(os.path.realpath(__file__))

	for pageno in xrange(1,pages):
		
		
		for city in ['smt','mdb','bgh']:
			url = "http://epaper.jagran.com/epaperimages/"+todaysDate+"/muzaffarpur/"+str(Cur_date.getPrevDayDate())+city+"-pg"+ str(pageno) +"-0.pdf"
			# url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"

			print url

			##sending file path
			## file path also contains the file name of the downloaded file
			file_path = dir_path + "/" + str(pageno)+".pdf"
			print "Downloading...page no = ", pageno

			download.download_file(url,file_path)

			
			flag = pdf_merger.check_valid_pdf(file_path)
			if(flag == 0):
				pdf_docs.append(file_path)
				break #As soon as it gets a valid pdf add to the list 'pdf_docs' else skip
			else:
				os.remove(file_path)
				print "PAGE NO",pageno,"with city =", city, "DONT EXIST"
			
			
		# pdf_docs.append(file_path)



	final_file_path = dir_path + "/" + todaysDate+".pdf"
	pdf_merger.FileMerger(pdf_docs, final_file_path)


	subject = "epaper dated "+ todaysDate

	# file_path = dir_path + "/" + final_file_name

	###for qpython -- files download in this directory
	# cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
	# file_path = cd_dir_path + "/" + final_file_name

	try:
		print "SENDING EMAIL..............."
		send_email.send_mail(configg.fromaddr,configg.password,configg.toaddr,subject,todaysDate+".pdf",final_file_path)

		pdf_docs.append(final_file_path)
		Delete_Files.del_files(pdf_docs)

		##updating cron Flag file when the job is done for the day

		with open('/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt','w') as outFile:
			outFile.write( Cur_date.strfTime())

		
	except Exception as e:
		Delete_Files.del_files(pdf_docs)
		print "COULDNOT SEND MAIL...."
		print e
Example #22
gdf = get_us_county_shapefiles()
# Summary files manually selected and downloaded from American Fact Finder
SF2010_DOWNLOADED_PATH = CENSUS_DATA_PATH / 'SF2010_zips'
SF2010_DOWNLOADED_FILENAME = 'DEC_10_SF1_P12_with_ann.csv'
SF2010_OUTPATH = CENSUS_DATA_PATH / 'SF2010'
# One iteration for each state
state_abbrs = ['NH', 'NJ', 'CT', 'RI', 'ME', 'VT']
for state_abbr in state_abbrs:

    state_fip = us.states.lookup(state_abbr).fips

    # Download shapefile for this state, block level:
    shape_link = ('https://www2.census.gov/geo/tiger/TIGER2018/TABBLOCK/'
                  f'tl_2018_{state_fip}_tabblock10.zip')
    print(shape_link)
    zipped = download.download_file(shape_link, state_abbr)
    with zipfile.ZipFile(zipped) as z:
        z.extractall(SHAPEFILE_OUTPATH)

    # two different file-name formats depending on the state, so check both
    sfpaths = list(SF2010_DOWNLOADED_PATH.glob(f'{state_abbr}_download_?'))
    sfpaths += list(SF2010_DOWNLOADED_PATH.glob(f'SF2010_{state_abbr}_?'))

    dflist = []
    for i, sfp in enumerate(sfpaths):
        if i > 0:
            dflist.append(
                pd.read_csv(sfp / SF2010_DOWNLOADED_FILENAME,
                            dtype=str,
                            skiprows=[1]))
        else:
Example #23
def import_posts(key,
                 url='https://api.fanbox.cc/post.listSupporting?limit=50'):
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)

    scraper_data = requests.get(url,
                                cookies={
                                    'FANBOXSESSID': key
                                },
                                headers={
                                    'origin': 'https://fanbox.cc'
                                },
                                proxies=get_proxy()).json()

    if scraper_data.get('body'):
        for post in scraper_data['body']['items']:
            parsed_post = FanboxPost(post['id'], None, post)
            if parsed_post.is_restricted:
                continue
            try:
                file_directory = f"files/fanbox/{post['user']['userId']}/{post['id']}"
                attachments_directory = f"attachments/fanbox/{post['user']['userId']}/{post['id']}"

                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'fanbox'",
                    (post['user']['userId'], ))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue

                check_for_flags('fanbox', post['user']['userId'], post['id'])

                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'fanbox'",
                    (post['id'], ))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue

                post_model = {
                    'id': post['id'],
                    '"user"': post['user']['userId'],
                    'service': 'fanbox',
                    'title': post['title'],
                    'content': parsed_post.body_text,
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': post['publishedDatetime'],
                    'edited': post['updatedDatetime'],
                    'file': {},
                    'attachments': []
                }

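                # The first embedded file becomes the post's primary file; any
                # further embedded files are stored as attachments.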
                for i in range(len(parsed_post.embeddedFiles)):
                    if i == 0:
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['file']['name'] = filename
                        post_model['file'][
                            'path'] = f'/{file_directory}/{filename}'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['attachments'].append({
                            'name':
                            filename,
                            'path':
                            f'/{attachments_directory}/{filename}'
                        })

                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(
                        post_model['attachments'][i])

                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns), values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
            except DownloaderException:
                continue

    conn.close()
    if scraper_data.get('body') and scraper_data['body'].get('nextUrl'):
        import_posts(key, scraper_data['body']['nextUrl'])
Example #24
def master_list_online(update=False):
    '''
    Get the dataframe of all known hospitals, building it from Joint
        Commission certification if it doesn't exist, and optionally updating
        it to capture additions to the JC list.
    '''

    if MASTER_LIST.exists():
        existing = load_hospitals(MASTER_LIST)
    else:
        columns = [
            'CenterID', 'CenterType', 'OrganizationName', 'City', 'State',
            'PostalCode', 'Name', 'Address', 'Latitude', 'Longitude',
            'Failed_Lookup', 'destination', 'destinationID', 'transfer_time',
            'DTN_1st', 'DTN_Median', 'DTN_3rd', 'DTP_1st', 'DTP_Median',
            'DTP_3rd'
        ]
        existing = pd.DataFrame(columns=columns).set_index('CenterID')
        existing.Failed_Lookup = existing.Failed_Lookup.astype(bool)

    if update or existing.empty:
        jc_file = download.download_file(JC_URL, 'Joint Commission')
        jc_data = pd.read_excel(jc_file)

        program_map = {
            'Advanced Comprehensive Stroke Center    ': 'Comprehensive',
            'Advanced Primary Stroke Center          ': 'Primary',
            # Treatment of TSCs is undecided; taking conservative approach
            'Advanced Thrombectomy Capable Stroke Ctr': 'Primary',
        }
        jc_data['CenterType'] = jc_data.CertificationProgram.map(program_map)
        jc_data = jc_data.dropna()

        # For multiple certifications, keep the comprehensive line
        #   NOTE - This ignores effective dates under the assumption that all
        #           listed certifications are active
        jc_data = jc_data.sort_values('CenterType')

        jc_data = jc_data.drop_duplicates(
            subset=['OrganizationId', 'City', 'State', 'PostalCode'])

        update_index = ['OrganizationName', 'City', 'State', 'PostalCode']
        jc_data = jc_data.set_index(update_index, verify_integrity=True)

        existing = existing.reset_index().set_index(update_index)

        new = jc_data[~jc_data.index.isin(existing.index)]
        new['Failed_Lookup'] = False
        out = pd.concat([existing, new], sort=False)
        out.update(jc_data)
        out = out.reset_index()

        next_ID = out.CenterID.max() + 1
        if pd.isnull(next_ID):
            next_ID = 1
        for i in out.index:
            if pd.isnull(out.CenterID[i]):
                out.loc[i, 'CenterID'] = next_ID
                next_ID += 1

        out.CenterID = out.CenterID.astype(int)
        out = out.set_index('CenterID', verify_integrity=True)
        _save_master_list(out, savedir=MASTER_LIST)
    else:
        out = existing

    return out
Example #25
        logging.debug("[#%d] %s - %d (%.02fM)",
                      i + 1, audio.url, remote_sz, remote_sz / utils.MB)

        if sz != remote_sz:
            if logging.getLogger().isEnabledFor(logging.INFO):
                progress_bar = \
                    AudioDownloadProgressBar(i, total, m_th, audio.short_name)
            else:
                progress_bar = None

            logging.log(logging.WARNING if m_th or not progress_bar
                        else logging.DEBUG,
                        "[#%d/%d] \"%s\" (%s, %s bytes)",
                        i + 1, total, filepath, audio.id_name, remote_sz - sz)

            download_file(tmp_name, audio.url, progress_bar)

        try:
            os.rename(tmp_name, filepath)
        except OSError as e:
            print tmp_name, " AND ", filepath
            print type(tmp_name), " AND ", type(filepath)
            raise RuntimeError("Can't rename: %s -> %s" % (tmp_name, filepath))

    relpath = os.path.relpath(os.path.abspath(path),
                              os.path.abspath(lnks_path))
    new_path = utils.norm_path(relpath, audio.name)

    lnk_fmt = "%%0%dd=%%s" % len(str(total))
    lnk = lnk_fmt % (i + 1, audio.name)
Example #26
def import_posts(key):
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)

    dlconfig.set(('output',), "mode", "null")
    dlconfig.set(('extractor', 'subscribestar'), "cookies",
                 {"auth_token": key})
    dlconfig.set(('extractor', 'subscribestar'), "proxy", get_proxy())
    j = job.DataJob("https://subscribestar.adult/feed")
    j.run()

    for message in j.data:
        try:
            if message[0] == Message.Directory:
                post = message[-1]

                file_directory = f"files/subscribestar/{post['author_name']}/{post['post_id']}"
                attachments_directory = f"attachments/subscribestar/{post['author_name']}/{post['post_id']}"

                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'subscribestar'",
                    (post['author_name'], ))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue

                check_for_flags('subscribestar', post['author_name'],
                                str(post['post_id']))

                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'subscribestar'",
                    (str(post['post_id']), ))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue

                stripped_content = strip_tags(post['content'])
                post_model = {
                    'id':
                    str(post['post_id']),
                    '"user"':
                    post['author_name'],
                    'service':
                    'subscribestar',
                    'title':
                    (stripped_content[:60] +
                     '..') if len(stripped_content) > 60 else stripped_content,
                    'content':
                    post['content'],
                    'embed': {},
                    'shared_file':
                    False,
                    'added':
                    datetime.datetime.now(),
                    'published':
                    post['date'],
                    'edited':
                    None,
                    'file': {},
                    'attachments': []
                }

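                # Collect the Url messages that belong to this post (matched by
                # post_id); the first becomes the primary file, the rest attachments.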
                for attachment in list(
                        filter(
                            lambda msg: post['post_id'] == msg[-1]['post_id']
                            and msg[0] == Message.Url, j.data)):
                    if (len(post_model['file'].keys()) == 0):
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            attachment[-1]['url'],
                            name=attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'])
                        post_model['file']['name'] = attachment[-1][
                            'filename'] + '.' + attachment[-1]['extension']
                        post_model['file'][
                            'path'] = f'/{file_directory}/{filename}'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            attachment[-1]['url'],
                            name=attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'])
                        post_model['attachments'].append({
                            'name':
                            attachment[-1]['filename'] + '.' +
                            attachment[-1]['extension'],
                            'path':
                            f'/{attachments_directory}/{filename}'
                        })

                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(
                        post_model['attachments'][i])

                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns), values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
        except DownloaderException:
            continue

    conn.close()
Example #27
def get_spacex():
    url = 'https://api.spacexdata.com/v3/launches/latest'
    response = requests.get(url)
    for id, images_link in enumerate(response.json()['links']['flickr_images'], 1):
        download_file(images_link, "Space-{}.jpeg".format(id))
Example #28
def hack_paper():

    status = checkCronJob.checkCronStatus()
    # print status

    if (status == 0):
        print "JOB's already done"
        return

    todaysDate = Cur_date.getCurDate()

    pdf_docs = []
    pages = get_pages.getPages() + 1

    # pages = 2

    dir_path = os.path.dirname(os.path.realpath(__file__))

    for pageno in xrange(1, pages):

        for city in ['smt', 'mdb', 'bgh']:
            url = "http://epaper.jagran.com/epaperimages/" + todaysDate + "/muzaffarpur/" + str(
                Cur_date.getPrevDayDate()) + city + "-pg" + str(
                    pageno) + "-0.pdf"
            # url = "http://epaper.jagran.com/epaperimages/"+"26012018"+"/muzaffarpur/"+"25"+city+"-pg"+ str(pageno) +"-0.pdf"

            print url

            ##sending file path
            ## file path also contains the file name of the downloaded file
            file_path = dir_path + "/" + str(pageno) + ".pdf"
            print "Downloading...page no = ", pageno

            ## this function returns a value indicating whether we got a valid pdf; if not, the loop continues
            flag = download.download_file(url, file_path)

            # flag = pdf_merger.check_valid_pdf(file_path)

            if (flag == 0):
                pdf_docs.append(file_path)
                break  #As soon as it gets a valid pdf add to the list 'pdf_docs' else skip
            else:
                # os.remove(file_path)
                print "PAGE NO", pageno, "with city =", city, "DONT EXIST"
                continue

    # print pdf_docs
    final_file_path = dir_path + "/" + todaysDate + ".pdf"
    pdf_merger.FileMerger(pdf_docs, final_file_path)

    ## if compression cannot bring the file down to the required size, the last option is to drop some files
    checkSizeFlag = checkFileSize.check(final_file_path)
    k = 1

    while checkSizeFlag:
        os.remove(
            final_file_path
        )  # has to be removed because pdf_merger only merges if the file does not exist

        pdf_merger.FileMerger(pdf_docs[:-k], final_file_path)
        checkSizeFlag = checkFileSize.check(final_file_path)
        print "++++++++++ Removed last %s" % (k), 'file +++++++++++++'
        k = k + 1
    ##=======================================================================================================

    ##Hindi text
    akhbaar = u'\u0905' + u'\u0916' + u'\u093c' + u'\u092c' + u'\u093e' + u'\u0930'
    dinakit = u'\u0926' + u'\u093f' + u'\u0928' + u'\u093e' + u'\u0902' + u'\u0915' + u'\u093f' + u'\u0924'
    # print akhbaar +' ' +dinakit

    subject = akhbaar + ' ' + dinakit + ' ' + todaysDate

    # file_path = dir_path + "/" + final_file_name

    ###for qpython -- files download in this directory
    # cd_dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # file_path = cd_dir_path + "/" + final_file_name

    try:
        ## handles the case where more than one copy of the script runs at the same time;
        ## this can happen when the network is slow or the first script takes too long to finish
        print "Checking if mail is already sent ..... "
        status = checkCronJob.checkCronStatus()
        print status
        if (status == 0):
            print "Mail has been sent already..."
            return
        print "SENDING EMAIL..............."
        send_email.send_mail(configg.fromaddr, configg.password,
                             configg.toaddr, subject, todaysDate + ".pdf",
                             final_file_path)

        pdf_docs.append(final_file_path)
        Delete_Files.del_files(pdf_docs)

        ##updating cron Flag file when the job is done for the day

        with open(
                '/home/gugli/Documents/script_py/Dainik_Jagron/checkCronStatus.txt',
                'w') as outFile:
            outFile.write(Cur_date.strfTime())

    except Exception as e:
        Delete_Files.del_files(pdf_docs)
        print "COULDNOT SEND MAIL...."
        print e
Example #29
        for download in downloads(attrs={'class': 'download'}):
            classname = download.attrs['class'][1]
            url = download(attrs={'class': 'a'})[0]
            weburl = url['data-web']
            filename = weburl.split("?key=")[0]
            if filename[0:25] == 'https://hb1.ssl.hwcdn.net':
                md5 = download(attrs={'class': 'dlmd5'})[0]['href'][1:]
                filename = filename.split("/")[-1]
                path = "%s/All/%s/%s/%s" % (outfolder, title,
                                            systems[classname], filename)
                if not os.path.isdir("%s/All/%s" % (outfolder, title)):
                    os.mkdir("%s/All/%s" % (outfolder, title))
                if not os.path.isdir("%s/All/%s/%s" %
                                     (outfolder, title, systems[classname])):
                    os.mkdir("%s/All/%s/%s" %
                             (outfolder, title, systems[classname]))
                if os.path.exists(path) and not os.path.exists(
                        "%s.md5" % path):
                    print("Generating md5: %s" % filename)
                    md5file = md5sum(path)
                    if md5file == md5:
                        print("OK")
                        open("%s.md5" % path, "w").write(md5file)
                    else:
                        print("MISMATCH!")
                        os.unlink(path)
                        exit(1)
                elif not os.path.exists(path):
                    print("Downloading: %s" % filename)
                    download_file(weburl, path, md5)
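The md5sum() helper used above is not shown in this snippet; a minimal hashlib-based sketch of what it is assumed to do (the chunk size is arbitrary):

import hashlib

def md5sum(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at `path`, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()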
Example #30
def import_posts(key, url=initial_api):
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)

    scraper = cloudscraper.create_scraper()
    scraper_data = scraper.get(url,
                               cookies={
                                   'session_id': key
                               },
                               proxies=get_proxy()).json()

    for post in scraper_data['data']:
        try:
            file_directory = f"files/{post['relationships']['user']['data']['id']}/{post['id']}"
            attachments_directory = f"attachments/{post['relationships']['user']['data']['id']}/{post['id']}"

            cursor1 = conn.cursor()
            cursor1.execute(
                "SELECT * FROM dnp WHERE id = %s AND service = 'patreon'",
                (post['relationships']['user']['data']['id'], ))
            bans = cursor1.fetchall()
            if len(bans) > 0:
                continue

            check_for_flags('patreon',
                            post['relationships']['user']['data']['id'],
                            post['id'])

            cursor2 = conn.cursor()
            cursor2.execute(
                "SELECT * FROM booru_posts WHERE id = %s AND service = 'patreon'",
                (post['id'], ))
            existing_posts = cursor2.fetchall()
            if len(existing_posts) > 0:
                continue

            post_model = {
                'id': post['id'],
                '"user"': post['relationships']['user']['data']['id'],
                'service': 'patreon',
                'title': post['attributes']['title'],
                'content': '',
                'embed': {},
                'shared_file': False,
                'added': datetime.datetime.now(),
                'published': post['attributes']['published_at'],
                'edited': post['attributes']['edited_at'],
                'file': {},
                'attachments': []
            }

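            # Download inline images referenced in the post body and rewrite their
            # URLs to the local /inline/ paths.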
            if post['attributes']['content']:
                post_model['content'] = post['attributes']['content']
                for image in text.extract_iter(post['attributes']['content'],
                                               '<img data-media-id="', '>'):
                    download_url = text.extract(image, 'src="', '"')[0]
                    path = urlparse(download_url).path
                    ext = splitext(path)[1]
                    fn = str(uuid.uuid4()) + ext
                    filename, _ = download_file(join(config.download_path,
                                                     'inline'),
                                                download_url,
                                                name=fn)
                    post_model['content'] = post_model['content'].replace(
                        download_url, f"/inline/{filename}")

            if post['attributes']['embed']:
                post_model['embed']['subject'] = post['attributes']['embed'][
                    'subject']
                post_model['embed']['description'] = post['attributes'][
                    'embed']['description']
                post_model['embed']['url'] = post['attributes']['embed']['url']

            if post['attributes']['post_file']:
                filename, _ = download_file(
                    join(config.download_path, file_directory),
                    post['attributes']['post_file']['url'],
                    name=post['attributes']['post_file']['name'])
                post_model['file']['name'] = post['attributes']['post_file'][
                    'name']
                post_model['file']['path'] = f'/{file_directory}/{filename}'

            for attachment in post['relationships']['attachments']['data']:
                filename, _ = download_file(
                    join(config.download_path, attachments_directory),
                    f"https://www.patreon.com/file?h={post['id']}&i={attachment['id']}",
                    cookies={'session_id': key})
                post_model['attachments'].append({
                    'name':
                    filename,
                    'path':
                    f'/{attachments_directory}/{filename}'
                })

            if post['relationships']['images']['data']:
                for image in post['relationships']['images']['data']:
                    for media in list(
                            filter(
                                lambda included: included['id'] == image['id'],
                                scraper_data['included'])):
                        if media['attributes']['state'] != 'ready':
                            continue
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            media['attributes']['download_url'],
                            name=media['attributes']['file_name'])
                        post_model['attachments'].append({
                            'name':
                            filename,
                            'path':
                            f'/{attachments_directory}/{filename}'
                        })

            if post['relationships']['audio']['data']:
                for audio in post['relationships']['audio']['data']:
                    for media in list(
                            filter(
                                lambda included: included['id'] == audio['id'],
                                scraper_data['included'])):
                        if media['attributes']['state'] != 'ready':
                            continue
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            media['attributes']['download_url'],
                            name=media['attributes']['file_name'])
                        post_model['attachments'].append({
                            'name':
                            filename,
                            'path':
                            f'/{attachments_directory}/{filename}'
                        })

            post_model['embed'] = json.dumps(post_model['embed'])
            post_model['file'] = json.dumps(post_model['file'])
            for i in range(len(post_model['attachments'])):
                post_model['attachments'][i] = json.dumps(
                    post_model['attachments'][i])

            columns = post_model.keys()
            data = ['%s'] * len(post_model.values())
            data[-1] = '%s::jsonb[]'  # attachments
            query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                fields=','.join(columns), values=','.join(data))
            cursor3 = conn.cursor()
            cursor3.execute(query, list(post_model.values()))
            conn.commit()
        except DownloaderException:
            continue

    conn.close()
    if scraper_data['links'].get('next'):
        import_posts(key, 'https://' + scraper_data['links']['next'])