def download_file(url, data_path='.', filename=None, size=None, chunk_size=4096, verbose=True):
    """Use stream=True and a reasonable chunk size to download large (GB) files over https."""
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive Dropbox download
    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm
    r = requests_get(url, stream=True, allow_redirects=True, timeout=5)
    # Content-Length arrives as a string; coerce it to int so it can be compared to the local file size
    remote_size = r.headers.get('Content-Length', None)
    size = int(remote_size) if size is None and remote_size is not None else size
    print('remote size: {}'.format(size))

    stat = path_status(file_path)
    print('local size: {}'.format(stat.get('size', None)))
    if stat['type'] == 'file' and stat['size'] == size:  # TODO: check md5 or get the right size of remote file
        r.close()
        return file_path

    print('Downloading to {}'.format(file_path))

    with open(file_path, 'wb') as f:
        # wrap the chunk iterator in tqdm (or the no-op stand-in) so verbose downloads show progress
        for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size)):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)

    r.close()
    return file_path
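# Usage sketch (hypothetical example, not part of the original module): fetch a shared CSV once
# and load it with pandas.  The URL below is a placeholder; download_file skips the transfer when
# a local copy with a matching size already exists.
def example_download_csv(url='https://www.dropbox.com/s/example/dataset.csv.gz?dl=0', data_path='.'):
    file_path = download_file(url, data_path=data_path, verbose=False)
    return pd.read_csv(file_path)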
def geocode_google(address, apikey=None):
    """Look up latitude and longitude for an address with the Google Maps geocoding API."""
    apikey = apikey or 'AIzaSyC--s1-y1xkIxzO7wfIUOeHm8W-ID9fbfM'  # this is a Total Good API key, GET YOUR OWN!
    google_url = 'https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={apikey}'.format(
        address=address, apikey=apikey)
    resp = requests_get(google_url, allow_redirects=True, timeout=5)
    results = resp.json().get('results', {})
    results = [{}] if not len(results) else results
    latlon = results[0].get('geometry', {}).get('location', {})
    return {
        'lat': latlon.get('lat', np.nan),
        'lon': latlon.get('lng', np.nan),  # Google returns 'lng'; normalize the key to 'lon'
    }
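# Usage sketch (hypothetical): geocode a batch of addresses, relying on the NaN fallbacks above
# so unresolvable addresses stay in the output instead of raising.
def example_geocode_addresses(addresses, apikey=None):
    return pd.DataFrame([geocode_google(a, apikey=apikey) for a in addresses], index=list(addresses))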
def generate_download_mccauley():
    """Generate a wget script for the McAuley Amazon product review and Q&A datasets.

    The reviews are useful for recommendation engine training and review sentiment analysis.
    """
    response = requests_get('http://jmcauley.ucsd.edu/data/amazon/', allow_redirects=True, timeout=5)
    urls_product_review = [
        m[0] for m in regexes.cre_url_popular.findall(response.text)
        if m[0].lower().endswith('.json.gz')
    ]
    response = requests_get('http://jmcauley.ucsd.edu/data/amazon/qa/', allow_redirects=True, timeout=5)
    urls_question_answer = [
        m[1] for m in regexes.cre_href.findall(response.text)
        if m[1].lower().endswith('.json.gz')
    ]
    with open('download_mccauley_autogenerated.sh', 'w') as f:
        # one wget command per line so the generated shell script is actually runnable
        for url in urls_product_review + urls_question_answer:
            f.write('wget ' + url + '\n')
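# Usage sketch (hypothetical): regenerate the wget script and run it.  Assumes a POSIX shell
# with bash and wget available on the PATH.
def example_run_mccauley_download():
    import subprocess

    generate_download_mccauley()
    return subprocess.call(['bash', 'download_mccauley_autogenerated.sh'])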
def minify_urls(filepath, ext='asc', url_regex=None, output_ext='.urls_minified', access_token=None):
    """Use bitly or a similar minifier to shrink all URLs in text files within a folder structure.

    Used for the NLPIA manuscript directory for Manning Publishing.

    bitly API: https://dev.bitly.com/links.html

    Args:
      filepath (str): Directory or file path
      ext (str): File name extension to filter text files by. default='asc'
      output_ext (str): Extension appended to filenames of altered files. default='.urls_minified'
        (use '' for in-place replacement of URLs)

    FIXME: NotImplementedError! Untested!
    """
    access_token = access_token or secrets.bitly.access_token
    output_ext = output_ext or ''
    url_regex = regex.compile(url_regex) if isinstance(url_regex, str) else url_regex
    filemetas = []
    for filemeta in find_files(filepath, ext=ext):
        filemetas += [filemeta]
        altered_text = ''
        with open(filemeta['path'], 'rt') as fin:
            text = fin.read()
        end = 0
        for match in url_regex.finditer(text):
            url = match.group()
            start = match.start()
            # append only the text between the previous URL and this one, not the whole prefix
            altered_text += text[end:start]
            resp = requests_get(
                'https://api-ssl.bitly.com/v3/shorten?access_token={}&longUrl={}'.format(access_token, url),
                allow_redirects=True, timeout=5)
            js = resp.json()
            short_url = js['shortUrl']  # FIXME: untested; the bitly response key may differ
            altered_text += short_url
            end = start + len(url)
        altered_text += text[end:]
        with open(filemeta['path'] + output_ext, 'wt') as fout:
            fout.write(altered_text)
    return altered_text
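# Usage sketch (hypothetical, and as untested as the FIXME above warns): shorten the URLs in all
# .asc manuscript files under a directory.  The URL pattern here is a simplistic stand-in for the
# module's own URL regexes.
def example_minify_manuscript(manuscript_dir='manuscript', token=None):
    url_pattern = r'https?://\S+'
    return minify_urls(manuscript_dir, ext='asc', url_regex=url_pattern,
                       output_ext='.urls_minified', access_token=token)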
def geocode_osm(address, polygon=0):
    """Look up latitude and longitude for an address with the OpenStreetMap Nominatim API."""
    polygon = int(polygon)
    address = address.replace(' ', '+').replace('\r\n', ',').replace('\r', ',').replace('\n', ',')
    osm_url = 'http://nominatim.openstreetmap.org/search'
    osm_url += '?q={address}&format=json&polygon={polygon}&addressdetails={addressdetails}'.format(
        address=address, polygon=polygon, addressdetails=0)
    print(osm_url)
    resp = requests_get(osm_url, timeout=5)
    print(resp)
    d = resp.json()
    print(d)
    d = d if len(d) else [{}]  # Nominatim returns an empty list when it can't resolve the address
    return {
        'lat': float(d[0].get('lat', np.nan)),  # Nominatim returns coordinates as strings
        'lon': float(d[0].get('lon', np.nan)),
    }
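# Usage sketch (hypothetical): try the free OSM/Nominatim geocoder first and fall back to the
# Google geocoder (which needs an API key) when Nominatim can't resolve the address.
def example_geocode_with_fallback(address, apikey=None):
    latlon = geocode_osm(address)
    if np.isnan(latlon['lat']) or np.isnan(latlon['lon']):
        latlon = geocode_google(address, apikey=apikey)
    return latlon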