Example #1
def download_file(url, data_path='.', filename=None, size=None, chunk_size=4096, verbose=True):
    """Uses stream=True and a reasonable chunk size to be able to download large (GB) files over https"""
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive download
    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm
    r = requests_get(url, stream=True, allow_redirects=True, timeout=5)
    if size is None:
        size = r.headers.get('Content-Length', None)
        # Content-Length arrives as a string; cast it so it compares with the local file size below
        size = int(size) if size is not None else None
    print('remote size: {}'.format(size))

    stat = path_status(file_path)
    print('local size: {}'.format(stat.get('size', None)))
    if stat['type'] == 'file' and stat['size'] == size:  # TODO: check md5 or get the right size of remote file
        r.close()
        return file_path

    print('Downloading to {}'.format(file_path))

    with open(file_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)

    r.close()
    return file_path
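A minimal usage sketch for the streaming downloader above. The Dropbox URL and data directory are placeholders, and it assumes download_file() is imported together with its helpers (dropbox_basename, path_status, requests_get, no_tqdm):

import os

os.makedirs('./data', exist_ok=True)  # download_file() does not create the destination directory itself
path = download_file(
    'https://www.dropbox.com/s/example/bigfile.csv.gz?dl=0',  # placeholder shared link
    data_path='./data',
    chunk_size=1 << 20,  # 1 MiB chunks keep memory flat even for multi-GB files
)
print(path)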
Example #2
def geocode_google(address, apikey=None):
    apikey = apikey or 'AIzaSyC--s1-y1xkIxzO7wfIUOeHm8W-ID9fbfM'  # this is a Total Good API key, GET YOUR OWN!
    google_url = 'https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={apikey}'.format(
        address=address, apikey=apikey)
    resp = requests_get(google_url, allow_redirects=True, timeout=5)
    results = resp.json()
    results = results.get('results', [])
    results = [{}] if not len(results) else results
    latlon = results[0].get('geometry', {}).get('location', {})
    return {
        'lat': latlon.get('lat', np.nan),  # np.nan rather than the removed pd.np alias
        'lon': latlon.get('lng', np.nan),
    }
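A hedged usage sketch for geocode_google(). The address and API key below are placeholders; pass your own key rather than relying on the hard-coded default:

coords = geocode_google('1600 Amphitheatre Parkway, Mountain View, CA',
                        apikey='YOUR_GOOGLE_MAPS_API_KEY')  # placeholder key
print(coords)  # {'lat': ..., 'lon': ...}, or NaNs when the lookup returns no results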
Example #3
def generate_download_mccauley():
    # Amazon product reviews, for recommendation-engine training and review sentiment analysis
    response = requests_get('http://jmcauley.ucsd.edu/data/amazon/',
                            allow_redirects=True,
                            timeout=5)
    urls_product_review = [
        m[0] for m in regexes.cre_url_popular.findall(response.text)
        if m[0].lower().endswith('.json.gz')
    ]

    response = requests_get('http://jmcauley.ucsd.edu/data/amazon/qa/',
                            allow_redirects=True,
                            timeout=5)
    urls_question_answer = [
        m[1] for m in regexes.cre_href.findall(response.text)
        if m[1].lower().endswith('.json.gz')
    ]

    with open('download_mccauley_autogenerated.sh', 'w') as f:
        for pr in urls_product_review:
            f.write('wget ' + pr + '\n')  # one download command per line
        for qa in urls_question_answer:
            f.write('wget ' + qa + '\n')
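generate_download_mccauley() only writes the shell script; a sketch of how it might be driven (running the script assumes wget is available on your PATH):

generate_download_mccauley()  # scrape both index pages and emit one wget command per .json.gz archive
# then, from a shell:
#   bash download_mccauley_autogenerated.sh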
Example #4
def minify_urls(filepath,
                ext='asc',
                url_regex=None,
                output_ext='.urls_minified',
                access_token=None):
    """ Use bitly or similar minifier to shrink all URLs in text files within a folder structure.

    Used for the NLPIA manuscript directory for Manning Publishing

    bitly API: https://dev.bitly.com/links.html

    Args:
      filepath (str): Directory or file path
      ext (str): File name extension to filter text files by. default='asc'
      output_ext (str): Extension appended to filenames of altered files. default='.urls_minified' (use '' for in-place replacement of URLs)

    FIXME: NotImplementedError! Untested!
    """
    access_token = access_token or secrets.bitly.access_token
    output_ext = output_ext or ''
    url_regex = regex.compile(url_regex) if isinstance(url_regex, str) else url_regex
    filemetas = []
    for filemeta in find_files(filepath, ext=ext):
        filemetas += [filemeta]
        altered_text = ''
        with open(filemeta['path'], 'rt') as fin:
            text = fin.read()
        end = 0
        for match in url_regex.finditer(text):
            url = match.group()
            start = match.start()
            altered_text += text[end:start]  # copy the unshortened text since the previous match
            resp = requests_get(
                'https://api-ssl.bitly.com/v3/shorten?access_token={}&longUrl={}'
                .format(access_token, url),
                allow_redirects=True,
                timeout=5)
            js = resp.json()
            # bitly v3 nests the short link under 'data'; fall back to the original URL on failure
            short_url = js.get('data', {}).get('url', url)
            altered_text += short_url
            end = start + len(url)
        altered_text += text[end:]
        with open(filemeta['path'] + (output_ext or ''), 'wt') as fout:
            fout.write(altered_text)
    return altered_text
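Because minify_urls() is flagged as untested, this is only a hedged sketch of a call; the manuscript path, URL pattern, and token are placeholders (a string url_regex is compiled inside the function):

minified = minify_urls('./manuscript',
                       ext='asc',
                       url_regex=r'https?://[^\s\]]+',  # simplistic URL pattern, adjust as needed
                       output_ext='.urls_minified',
                       access_token='YOUR_BITLY_ACCESS_TOKEN')  # placeholder token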
Example #5
def geocode_osm(address, polygon=0):
    polygon = int(polygon)
    address = address.replace(' ', '+').replace('\r\n', ',').replace('\r', ',').replace('\n', ',')
    osm_url = 'http://nominatim.openstreetmap.org/search'
    osm_url += '?q={address}&format=json&polygon={polygon}&addressdetails={addressdetails}'.format(
        address=address, polygon=polygon, addressdetails=0)

    print(osm_url)
    resp = requests_get(osm_url, timeout=5)
    print(resp)
    d = resp.json()
    print(d)
    d = d or [{}]  # Nominatim returns an empty list when nothing matches

    return {
        'lat': d[0].get('lat', np.nan),
        'lon': d[0].get('lon', np.nan),
    }
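Finally, a usage sketch for the Nominatim geocoder. The address is a placeholder, and Nominatim typically returns latitude/longitude as strings, so cast them if you need floats:

loc = geocode_osm('1600 Pennsylvania Ave NW, Washington, DC')  # placeholder address
lat, lon = float(loc['lat']), float(loc['lon'])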