def download(url):
    response = requests.get(url, stream=True)
    filename = parse_requests_response(response).filename_unsafe

    if filename is None:
        raise Exception('No filename could be found for this URL')

    filename = sanitize(filename)

    with open(filename, 'wb') as f:
        total = response.headers.get('content-length')

        if total is None:
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                f.write(data)
                done = int(50 * downloaded / total)
                sys.stdout.write('\r[{}{}]'.format('█' * done,
                                                   '.' * (50 - done)))
                sys.stdout.flush()
    sys.stdout.write('\n')
    return filename
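Every snippet on this page leans on the same two members of the rfc6266 package: parse_requests_response() and the filename_unsafe / filename_sanitized() accessors on the object it returns. Below is a minimal, hedged sketch of that shared pattern, not taken from any of the projects above; the URL is a placeholder and the sketch assumes the requests and rfc6266 packages are installed.

import requests
import rfc6266

# Placeholder URL, for illustration only.
response = requests.get("https://example.com/download", stream=True)

# Parse the Content-Disposition header of the response.
cd = rfc6266.parse_requests_response(response)

print(cd.filename_unsafe)            # raw server-suggested name (may be None)
print(cd.filename_sanitized("dat"))  # sanitized variant of the name, given an extension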
Example no. 2
def get_attachment(url):
    url = utilities.fix_google_drive_download_url(url)
    url = utilities.fix_dropbox_download_url(url)

    response = requests.get(url)
    response.raise_for_status()
    mimetype = response.headers.get("content-type").lower().split(";")[0].strip()
    filename = rfc6266.parse_requests_response(response).filename_unsafe

    if "X-Auto-Login" in response.headers:
        raise Exception("Login needed for {0}".format(url))

    if not mimetype.startswith("text/") and not mimetype.startswith("image/"):
        raise Exception("Unhandled file type {0}, {1}, {2}".format(url, mimetype, filename))

    if mimetype.startswith("text/"):
        response.encoding = "utf-8"

    attachment = {
        "name": filename,
        "mime": mimetype,
        "url": response.url
    }

    if mimetype.startswith("text/"):
        attachment["content"] = response.text
    else:
        attachment["content"] = response.content

    return attachment
Example no. 3
    def download(self):
        if self.source_url:
            if self.is_google_doc():
                get_google_doc(self)
            else:
                r = requests.get(
                    self.source_url,
                    headers={'User-Agent': 'Cove (cove.opendataservice.coop)'})
                r.raise_for_status()
                content_type = r.headers.get('content-type',
                                             '').split(';')[0].lower()
                file_extension = CONTENT_TYPE_MAP.get(content_type)

                if not file_extension:
                    possible_extension = rfc6266.parse_requests_response(
                        r).filename_unsafe.split('.')[-1]
                    if possible_extension in CONTENT_TYPE_MAP.values():
                        file_extension = possible_extension

                file_name = r.url.split('/')[-1].split('?')[0][:100]
                if file_name == '':
                    file_name = 'file'
                if file_extension:
                    if not file_name.endswith(file_extension):
                        file_name = file_name + '.' + file_extension
                self.original_file.save(file_name, ContentFile(r.content))
        else:
            raise ValueError('No source_url specified.')
Example no. 5
 def filename_from_content_disposition(requests_response):
     """
     Parses the RFC6266 content-disposition header to determine the server-
     suggested filename for content.
     """
     components = urlparse(requests_response.url)
     head, tail = posixpath.split(components.path)
     expected_extension = posixpath.splitext(tail)[1]
     cd = rfc6266.parse_requests_response(requests_response)
     return cd.filename_sanitized(expected_extension.lstrip('.') or 'dat')
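A brief usage sketch for the helper above; the URL is hypothetical, and the call assumes the same imports (requests, posixpath, urlparse, rfc6266) are in scope and that the helper is reachable at module level.

# Hypothetical call: the ".pdf" suffix in the URL path is what the helper
# hands to filename_sanitized as the expected extension.
resp = requests.get("https://example.com/files/report.pdf")
print(filename_from_content_disposition(resp))  # e.g. "report.pdf"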
Example no. 6
def save_response(response):
    tmp = tempdir()
    filename = secure_filename(
        parse_requests_response(response).filename_unsafe)
    filepath = os.path.join(tmp, filename)

    with open(filepath, 'wb') as out:
        copyfileobj(response.raw, out)

    return filepath
Example no. 7
 def handleMatch(self, match):
     el = super(ImageDownloadPattern, self).handleMatch(match)
     urlparts = urllib.parse.urlparse(el.attrib["src"])
     if urlparts.netloc:
         response = requests.get(urlparts.geturl())
         response.raise_for_status()
         filename = rfc6266.parse_requests_response(response).filename_unsafe
         with open(os.path.join(settings.get("folder"), filename), "wb") as f:
             f.write(response.content)
             el.attrib["src"] = filename
         utilities.fix_image(os.path.join(settings.get("folder"), filename), settings.get("features")["width"])
     return el
Example no. 8
def download_originals() -> Generator:
    """
    Download the original images from IMGS_URL
    Return: the total of downloaded images.
    """
    r = requests.get(IMGS_URL)
    j = json.loads(r.text)

    for img in j['images']:
        r = requests.get(img['url'])
        stream = BytesIO(r.content)
        img = Image.open(stream)
        img.filename = rfc6266.parse_requests_response(r).filename_unsafe
        yield img
Example no. 9
def download_file(url,
                  dest=None,
                  chunk_size=1024,
                  replace="ask",
                  label="Downloading {dest_basename} ({size:.2f}MB)",
                  expected_extension=None):
    """Download a file from a given URL and display progress.

    :param dest: If the destination exists and is a directory, the filename
        will be guessed from the Content-Disposition header. If the destination
        is an existing file, the user will either be prompted to overwrite, or
        the file will be replaced (depending on the value of **replace**). If
        the destination does not exist, it will be used as the filename.
    :param int chunk_size: bytes read in at a time.
    :param replace: If `False`, an existing destination file will not be
        overwritten.
    :param label: a string which is formatted and displayed as the progress bar
        label. Variables provided include *dest_basename*, *dest*, and *size*.
    :param expected_extension: if set, the filename will be sanitized to ensure
        it has the given extension. The extension should not start with a dot
        (`.`).
    """
    dest = Path(dest or url.split("/")[-1])
    response = get(url, stream=True)
    if (dest.exists() and dest.is_dir()
            and "Content-Disposition" in response.headers):
        content_disposition = rfc6266.parse_requests_response(response)
        if expected_extension is not None:
            filename = content_disposition.filename_sanitized(
                expected_extension)
        else:
            # fall back to the server-suggested name; secure_filename() cleans it
            filename = content_disposition.filename_unsafe
        filename = secure_filename(filename)
        dest = dest / filename
    if dest.exists() and not dest.is_dir():
        if (replace is False or replace == "ask"
                and not click.confirm("Replace {}?".format(dest))):
            return str(dest)
    size = int(response.headers.get("content-length", 0))
    label = label.format(dest=dest,
                         dest_basename=dest.name,
                         size=size / 1024.0 / 1024)
    with click.open_file(str(dest), "wb") as f:
        content_iter = response.iter_content(chunk_size=chunk_size)
        with click.progressbar(content_iter, length=size / 1024,
                               label=label) as bar:
            for chunk in bar:
                if chunk:
                    f.write(chunk)
                    f.flush()
    return str(dest)
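A hedged usage sketch for download_file above. The URL is a placeholder; dest="." names an existing directory so the filename is taken from the Content-Disposition header, and expected_extension="zip" (no leading dot, per the docstring) is only an illustrative choice.

# Hypothetical call: download into the current directory, prompting before
# overwriting an existing file, and force a .zip extension on the name.
saved_path = download_file("https://example.com/archive",
                           dest=".",
                           replace="ask",
                           expected_extension="zip")
print(saved_path)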
Example no. 10
def get_filename_from_request(r):
    filename = rfc6266.parse_requests_response(r).filename_unsafe
    extension = re.search(r'\.\w\w\w$', filename)
    if extension:
        return filename
    else:
        try:
            value, params = cgi.parse_header(r.headers['Content-Disposition'])
            return params.get('filename') or params.get('filename*').replace(
                "UTF-8''", '')
        except KeyError:
            assert get_url_queries(r.url), 'No filename could be extracted'
            url = strip_url_queries(r.url)
            r = requests.get(url, stream=True)
            return get_filename_from_request(r)
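A short usage sketch for the helper above. The URL is a placeholder, and get_url_queries / strip_url_queries are project-specific helpers assumed to be importable alongside it.

# Hypothetical call: pass a requests response and read back the filename
# recovered from its Content-Disposition header (or, failing that, the URL).
r = requests.get("https://example.com/download?id=42", stream=True)
print(get_filename_from_request(r))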
Example no. 13
def fetch(url):

    response = requests.get(url)
    url = urlparse(response.url)  # Follow redirects

    if response.status_code not in range(200, 300):
        sys.exit("Bad response {}".format(response.status_code))

    # If Content-Type header is not found we assume MP3
    content_type = response.headers.get('content-type', 'audio/mpeg')
    if not content_type.startswith("audio/"):
        puts_err("Bad content-type")

    extension = mimetypes.guess_extension(content_type)
    content_disposition = parse_requests_response(response)
    filename = content_disposition.filename_sanitized(extension.lstrip('.'))

    expected_ittimes = None
    content_length = response.headers.get('content-length')
    iter_content = response.iter_content(CHUNK_SIZE)

    puts(colored.blue(u"Downloading {}\n".format(filename)))

    # Display progress bar if content-length is set
    if content_length is not None:
        expected_ittimes = int(content_length) / CHUNK_SIZE
        iter_content = progress.bar(iter_content, expected_size=expected_ittimes)

    with tempfile.NamedTemporaryFile(prefix='song-', suffix=extension, delete=False) \
             as fp:
        for chunk in iter_content:
            fp.write(chunk)

    proc = subprocess.Popen(['osascript', '-'],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    out, err = proc.communicate(add_to_itunes.format(path=fp.name))

    if proc.returncode:
        puts_err("Couldn't add to iTunes ({})".format(err))
        sys.exit(proc.returncode)

    match = re.match(_itunes_stdout_re, out)
    if match is not None:
        song, playlist, source = match.groups()
        puts(colored.cyan("Added {} to iTunes!".format(song)))
Example no. 14
def download(url, name=""):
    """
    Download from url, extracts filename from url or content disposition.

    Returns original fname, sanitized one and content of file
    Or None, None, None
    """
    resp = requests.get(url)

    if resp.status_code == 200:
        parsed = parse_requests_response(resp)
        if name:
            fname = name + os.path.splitext(parsed.filename_unsafe)[-1]
        else:
            fname = parsed.filename_sanitized(
                os.path.splitext(parsed.filename_unsafe)[-1].strip("."))

        return parsed.filename_unsafe, fname, resp.content
    else:
        return None, None, None
Example no. 15
def get_filename(response, expected_extension):
    """Get filename from content-disposition header"""
    try:
        content_disposition = rfc6266.parse_requests_response(response)
    except Exception as error:  # let's not depend on the dependencies of rfc6266
        logger.warning(
            'Failed to parse content_disposition header from %r, error: %r',
            response.url, error)
        # fall back on guessing the filename from the URL
        basename = requests.utils.unquote_unreserved(response.url).rsplit(
            '/', 1)[-1]
        return (basename or 'file') + '.' + expected_extension

    extension = content_disposition.filename_unsafe.rsplit('.', 1)[-1]
    # allow for some common file extension variations
    safe_aliases = (extension.lower(), extension.lower().replace(
        'jpeg', 'jpg'), extension.lower().replace('jpg', 'jpeg'))
    if expected_extension in safe_aliases:
        expected_extension = extension

    return content_disposition.filename_sanitized(expected_extension)
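A usage sketch for get_filename above; the URL is a placeholder and "pdf" is only an illustrative expected extension.

# Hypothetical call: if the Content-Disposition header cannot be parsed, the
# helper falls back to deriving a name from the URL and the given extension.
response = requests.get("https://example.com/documents/1234")
print(get_filename(response, "pdf"))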
Example no. 16
    def postDownloadCsv(self, marketCode, commodity, commodity2, setMon,
                        pcCode):
        print('.', end='')
        payload = {
            'captcha': '',
            'commodity_id2t': str(commodity2),
            'commodity_idt': str(commodity),
            'commodityId': str(commodity),
            'commodityId2': str(commodity2),
            'curpage': '1',
            'doQuery': '1',
            'doQueryPage': '',
            'marketcode': str(marketCode),
            'MarketCode': str(marketCode),
            'pccode': str(pcCode),
            'queryDate': str(self.QueryDate),
            'queryDateAh': str(self.QueryDateAh),
            'settlemon': str(setMon),
            'totalpage': ''
        }

        res = self.session.post(self.DownURL,
                                data=payload,
                                headers=self.header,
                                cookies=self.cookies)

        if res.status_code != requests.codes.ok:
            raise Exception("Post Download Failed")

        if res.headers.get('Content-Disposition') is None:
            print('Download Failed', marketCode, commodity, commodity2, setMon,
                  pcCode)
            return

        fileName = rfc6266.parse_requests_response(res).filename_unsafe

        with open(self.directory + fileName, 'wb') as fd:
            for chunk in res.iter_content(256):
                fd.write(chunk)
Example no. 17
File: utils.py Project: Ndersam/QDM
	def extract_downloadinfo(self):
		''' Extracts file information from the URL '''
		protocol = get_fileprotocol(self.url)
		if protocol == 'http' or protocol == 'https':
			file = File()
			try:
				proxies = http_proxies()
				r = requests_retry_session().get(self.url, stream=True, proxies=proxies)
				r.raise_for_status()
				name = rfc6266.parse_requests_response(r).filename_unsafe
				file.name = name
				file.url = self.url
				try:
					file.size = int(r.headers['Content-Length'])
				except:
					file.size = 0
					file.setResume(0)
				return file
			except Exception as e:
				print(e.args[0])
				return file
		else:
			print(protocol, 'not supported yet!')
			return None
Example no. 18
def main():
    for item in parse_cmake_dependencies():
        if isinstance(item, GitRepository):
            repo_url, tag = item

            repo_name = repo_url.rstrip("/").split("/")[-1].replace(".git", "")

            with TemporaryDirectory(prefix="AppImageKit-") as tempdir:
                log("Cloning Git repository: {}".format(repo_url))
                porcelain.clone(repo_url, tempdir)

                # TODO: replace with dulwich solution
                # maybe have a look at Pext source code
                version = subprocess.check_output(
                    [
                        "git",
                        "describe",
                        "--always",
                        "--tags",
                        tag,
                    ],
                    cwd=tempdir).decode().split("\n")[0]

                tarball_name = "{}-{}.tar.gz".format(repo_name, version)

                if os.path.exists(tarball_name):
                    log("Warning: {} exists, skipping".format(tarball_name))
                    continue

                tarball_path = os.path.join(tempdir, tarball_name)

                log("Creating tarball for tag/branch {}: {}".format(
                    tag, tarball_name))

                # TODO: replace with dulwich call to remove dependency on Git
                # binary
                subprocess.check_call([
                    "git",
                    "archive",
                    "--format",
                    "tar.gz",
                    "-o",
                    tarball_path,
                    "--prefix",
                    "{}/".format(repo_name),
                    tag,
                ],
                                      cwd=tempdir)

                destination = os.path.join(os.getcwd(), "sources")

                shutil.copyfile(tarball_path,
                                os.path.join(destination, tarball_name))

        elif isinstance(item, TarballURL) or isinstance(item, PatchURL):
            if isinstance(item, TarballURL):
                url, hash = item
            elif isinstance(item, PatchURL):
                url, hash = item[0], None
            else:
                url, hash = str(item), None

            log("Downloading URL: {}".format(url))

            digest = None
            hash_algorithm = hash_value = None

            if hash is not None:
                hash_algorithm, hash_value = hash

                if hash_algorithm not in hashlib.algorithms_available:
                    log("Warning: hashing algorithm {} not supported by "
                        "Python interpreter".format(hash_algorithm))
                    hash_algorithm = None

                else:
                    digest = hashlib.new(hash_algorithm.upper())

            response = requests.get(url, stream=True)
            response.raise_for_status()

            content_disposition = rfc6266.parse_requests_response(response)

            ext = os.path.splitext(content_disposition.filename_unsafe)[-1]
            filename = content_disposition.filename_sanitized(ext.strip("."))

            if isinstance(item, TarballURL):
                path = os.path.join("sources", filename)
            elif isinstance(item, PatchURL):
                path = os.path.join("patches", filename)
            else:
                path = filename

            try:
                total = int(response.headers.get("Content-Length", None))
            except (ValueError, TypeError):
                total = None

            if os.path.exists(path):
                # if a hash value is available, use that to verify whether
                # file on system is up to date
                if hash_algorithm is not None:
                    local_digest = hashlib.new(hash_algorithm)

                    with open(path, "rb") as f:
                        data = f.read(4096)

                        if not data:
                            break

                        local_digest.update(data)

                    if hash_value == local_digest.hexdigest():
                        log("Warning: file {} exists, "
                            "skipping download".format(path))
                        continue

                if total is None:
                    log("Warning: size of file {} unknown, overwriting local "
                        "file".format(path))
                else:
                    if os.path.getsize(path) == total:
                        log("Warning: file {} exists, "
                            "skipping download".format(path))
                        continue

            os.makedirs(os.path.dirname(path), exist_ok=True)

            with open(path, "wb") as f:
                with tqdm(total=total) as pbar:
                    for chunk in response.iter_content():
                        f.write(chunk)

                        if digest is not None:
                            digest.update(chunk)

                        pbar.update(len(chunk))

            if digest is not None:
                if hash_value != digest.hexdigest():
                    log("Warning: could not verify file integrity: "
                        "expected digest: {}, received: {} "
                        "".format(hash_value, digest.hexdigest()))
Example no. 19
def main(argv):
    parser = argparse.ArgumentParser(description='litres.ru backup tool')
    parser.add_argument("-u", "--user", help="Username")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument("-f",
                        "--format",
                        default="ios.epub",
                        help="Downloading format. 'list' for available")
    parser.add_argument("-d",
                        "--debug",
                        action="store_true",
                        help="Add debug output")
    parser.add_argument("-v",
                        "--verbosedebug",
                        action="store_true",
                        help="You really want to see what happens?")
    args = parser.parse_args()

    if args.format == 'list':
        for f in FORMATS:
            print f
        exit(0)
    else:
        if args.format not in FORMATS:
            print "I dont know this format: " + args.format
            exit(1)

    if str(args.user) == 'None' or str(args.password) == 'None':
        print "I cant work without username and passwords"
        exit(1)

    if args.debug:
        print "Will ask for downloading " + args.format
        print "Try to login to site as " + args.user

    r = requests.post(URL + "catalit_authorise/",
                      data={
                          'login': args.user,
                          'pwd': args.password
                      })
    if args.debug:
        print "Responce : ", r.status_code, r.reason
        print "Responce text : " + r.text

    root = ET.fromstring(r.content)

    if root.tag == "catalit-authorization-failed":
        print "Authorization failed"
        exit(1)

    sid = root.attrib['sid']
    if args.debug:
        print "Welcome, ", root.attrib['login'], "(", root.attrib['mail'], ")"
        print "Asking litres.ru for list of books (can take a some time)"
        print "sid ", sid

    r = requests.post(URL + "catalit_browser/",
                      data={
                          'sid': sid,
                          'my': "1",
                          'limit': "0,1000"
                      })

    if args.verbosedebug:
        print "Responce ", r.status_code, r.reason
        print "Responce text ", r.text

    root = ET.fromstring(r.content)

    count_total = root.attrib['records']
    if args.debug:
        print "Total books: ", count_total

    if args.verbosedebug:
        print root.tag, root.attrib

    count = 1

    for child in root:
        if args.verbosedebug:
            print child.tag, child.attrib
        hub_id = child.attrib['hub_id']
        file_size = 0

        for elem in child.iter():
            if elem.tag == 'file' and elem.attrib['type'] == args.format:
                file_size = elem.attrib['size']
            if args.verbosedebug:
                print elem.tag, elem.attrib, elem.text, file_size

        r = requests.post(URL + "catalit_download_book/",
                          data={
                              'sid': sid,
                              'art': hub_id,
                              'type': args.format
                          },
                          stream=True)

        if args.debug:
            print "Responce ", r.status_code, r.reason

        filename = rfc6266.parse_requests_response(r).filename_unsafe
        print "(", count, "/", count_total, ")", filename
        with open(filename, "wb") as handle:
            for data in tqdm(r.iter_content(), unit='b', total=int(file_size)):
                handle.write(data)
        time.sleep(1)  # do not DDoS litres.
        count = count + 1

    r = requests.get(URL_www + "/pages/my_books_fresh/", cookies={'SID': sid})

    items = ET.HTML(r.content).xpath("//div[contains(@class, 'art-item')]")

    for item in items:
        link = item.xpath(
            ".//a[contains(@class, 'art-buttons__read_purchased')]")
        info = item.xpath(".//div[@data-obj]")
        if len(link) != 1 or len(info) != 1:
            continue
        link = link[0]
        info = info[0]
        if args.verbosedebug:
            print "Book link", link.attrib['href']
            print "Book info", info.attrib['data-obj']
        data_obj = dict(demjson.decode(info.attrib['data-obj']))
        book_name = data_obj['author'] + '_' + data_obj['alt']
        fid = re.search(r"file=(\d+)&", link.attrib['href']).group(1)
        while len(fid) < 8:
            fid = "0" + fid
        m = re.match(r"(\d\d)(\d\d)(\d\d)(\d\d)", fid)
        r = requests.get(URL_www + "/static/pdfjs/" + m.group(1) + "/" +
                         m.group(2) + "/" + m.group(3) + "/" + fid + ".js",
                         cookies={'SID': sid})
        m = re.search(r"=\s(\{.+\});", r.text)
        js_obj = dict(demjson.decode(m.group(1)))
        max_w_index = 0
        for i, page in enumerate(js_obj['pages']):
            if page['p'][0]['w'] > js_obj['pages'][max_w_index]['p'][0]['w']:
                max_w_index = i
        pages = js_obj['pages'][max_w_index]['p']
        rt = js_obj['pages'][max_w_index]['rt']
        os.mkdir(TMP_DIR)
        imgs = []
        for i, page in enumerate(pages):
            r = requests.get(URL_www + "/pages/read_book_online/?file=" + fid +
                             "&page=" + str(i) + "&rt=" + rt + "&ft=" +
                             page['ext'],
                             cookies={'SID': sid})
            img = TMP_DIR + '/' + str(i) + '.' + str(page['ext'])
            with open(img, 'wb') as f:
                f.write(r.content)
            imgs.append(img)
            if i % 10 == 0:
                time.sleep(1)
        with open(book_name + '.pdf', "wb") as f:
            f.write(img2pdf.convert(imgs))
        shutil.rmtree(TMP_DIR)
Example no. 21
                    'datagetter (https://github.com/ThreeSixtyGiving/datagetter)'
                })
            r.raise_for_status()
        except:
            print("\n\nDownload failed for dataset {}\n".format(
                dataset['identifier']))
            traceback.print_exc()
            exit_status = 1
            metadata['downloads'] = False
        else:
            metadata['downloads'] = True
        content_type = r.headers.get('content-type', '').split(';')[0].lower()
        if content_type and content_type in CONTENT_TYPE_MAP:
            file_type = CONTENT_TYPE_MAP[content_type]
        elif 'content-disposition' in r.headers:
            file_type = rfc6266.parse_requests_response(
                r).filename_unsafe.split('.')[-1]
        else:
            file_type = url.split('.')[-1]
        if file_type not in CONTENT_TYPE_MAP.values():
            print("\n\nUnrecognised file type {}\n".format(file_type))
            continue
        metadata['file_type'] = file_type
        file_name = 'data/original/' + dataset['identifier'] + '.' + file_type
        with open(file_name, 'wb') as fp:
            fp.write(r.content)
    else:
        file_type = metadata['file_type']
        file_name = 'data/original/' + dataset['identifier'] + '.' + file_type

    json_file_name = 'data/json_all/{}.json'.format(dataset['identifier'])
Example no. 22
def fetch_and_convert(args, dataset):
    r = None

    metadata = dataset.get('datagetter_metadata', {})
    dataset['datagetter_metadata'] = metadata

    if not dataset['license'] in acceptable_licenses + unacceptable_licenses:
        raise ValueError('Unrecognised license ' + dataset['license'])

    url = dataset['distribution'][0]['downloadURL']

    if args.download:
        proxies = None
        metadata[
            'datetime_downloaded'] = strict_rfc3339.now_to_rfc3339_localoffset(
            )
        if args.socks5_proxy:
            proxies = {
                'http': args.socks5_proxy,
                'https': args.socks5_proxy,
            }

        try:
            print("Fetching %s" % url)
            r = requests.get(
                url,
                headers={
                    'User-Agent':
                    'datagetter (https://github.com/ThreeSixtyGiving/datagetter)'
                },
                proxies=proxies)
            r.raise_for_status()

            metadata['downloads'] = True
        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise

            print("\n\nDownload {} failed for dataset {}\n".format(
                url, dataset['identifier']))
            traceback.print_exc()
            metadata['downloads'] = False
            metadata['error'] = str(e)

            if not isinstance(e, requests.exceptions.HTTPError):
                return

        content_type = r.headers.get('content-type', '').split(';')[0].lower()
        if content_type and content_type in CONTENT_TYPE_MAP:
            file_type = CONTENT_TYPE_MAP[content_type]
        elif 'content-disposition' in r.headers:
            file_type = rfc6266.parse_requests_response(
                r).filename_unsafe.split('.')[-1]
        else:
            file_type = url.split('.')[-1]
        if file_type not in CONTENT_TYPE_MAP.values():
            print("\n\nUnrecognised file type {}\n".format(file_type))
            return

        # Check that the downloaded json file is valid json and not junk from the webserver
        # e.g. a 500 error being output without the proper status code.
        if file_type == "json":
            try:
                json.loads(r.text)
            except ValueError:
                print("\n\nJSON file provided by webserver is invalid")
                metadata['downloads'] = False
                metadata['error'] = "Invalid JSON file provided by webserver"
                return

        metadata['file_type'] = file_type

        file_name = args.data_dir + '/original/' + dataset[
            'identifier'] + '.' + file_type
        with open(file_name, 'wb') as fp:
            fp.write(r.content)
    else:
        # --no-download arg

        # We require the metadata to exist, it won't if the file failed to download correctly
        if metadata['downloads'] is False:
            print(
                "Skipping %s as it was not marked as successfully downloaded" %
                dataset['identifier'])
            return

        file_type = metadata['file_type']
        file_name = args.data_dir + '/original/' + dataset[
            'identifier'] + '.' + file_type

    json_file_name = '{}/json_all/{}.json'.format(args.data_dir,
                                                  dataset['identifier'])

    metadata['file_size'] = os.path.getsize(file_name)

    if args.convert and (args.convert_big_files
                         or metadata['file_size'] < 10 * 1024 * 1024):
        if file_type == 'json':
            os.link(file_name, json_file_name)
            metadata['json'] = json_file_name
        else:
            try:
                print("Running convert on %s to %s" %
                      (file_name, json_file_name))
                convert_spreadsheet(file_name, json_file_name, file_type)
            except KeyboardInterrupt:
                raise
            except Exception:
                print(
                    "\n\nUnflattening failed for file {}\n".format(file_name))
                traceback.print_exc()
                metadata['json'] = None
                metadata["valid"] = False
                metadata["error"] = "Could not unflatten file"
            else:
                metadata['json'] = json_file_name

    metadata['acceptable_license'] = dataset['license'] in acceptable_licenses

    # We can only do anything with the JSON if it did successfully convert.
    if metadata.get('json'):
        format_checker = FormatChecker()
        if args.validate:
            try:
                with open(json_file_name, 'r') as fp:
                    validate(json.load(fp),
                             schema,
                             format_checker=format_checker)
            except (ValidationError, ValueError):
                metadata['valid'] = False
            else:
                metadata['valid'] = True

        if metadata['valid']:
            os.link(
                json_file_name,
                '{}/json_valid/{}.json'.format(args.data_dir,
                                               dataset['identifier']))
            data_valid.append(dataset)
            if metadata['acceptable_license']:
                os.link(
                    json_file_name,
                    '{}/json_acceptable_license_valid/{}.json'.format(
                        args.data_dir, dataset['identifier']))
                data_acceptable_license_valid.append(dataset)

        if metadata['acceptable_license']:
            os.link(
                json_file_name, '{}/json_acceptable_license/{}.json'.format(
                    args.data_dir, dataset['identifier']))
            data_acceptable_license.append(dataset)
Example no. 23
    def download(self, **state):

        # Load the status file.
        split = urlsplit(self.url)
        url_status_file = '{}.json'.format(
            os.path.join(state['archives_dir'], 'status', split.netloc,
                         split.path[1:]))
        url_status = get_status(url_status_file)

        # Get the headers for the URL
        if 'downloaded' not in url_status or not url_status['downloaded']:
            req = requests.get(self.url, allow_redirects=True, stream=True)
            headers = req.headers

            # Extract a filename
            filename = rfc6266.parse_requests_response(req).filename_unsafe

            # Work out our output path
            output_file = os.path.join(state['archives_dir'], filename)

            status_file = os.path.join(state['status_dir'],
                                       '{}.json'.format(filename))

            # Load the status file.
            status = get_status(status_file)

            # If our file exists compare the last modified of our file vs the one on the server
            if os.path.exists(output_file):

                # Check if we have a last modified value in our header
                if 'Last-Modified' in headers:

                    # Get when the local and remote files were last modified
                    l_modified = os.path.getmtime(output_file)
                    r_modified = time.mktime(
                        parser.parse(headers['Last-Modified']).timetuple())

                    # If we were modified after we don't need to download again
                    if l_modified > r_modified:
                        cprint(indent(
                            'URL {} not modified... Skipping...'.format(
                                filename), 8),
                               'yellow',
                               attrs=['bold'])

                        url_status = update_status(url_status_file, {
                            'downloaded': True,
                            'archive': output_file
                        })
                        return {'archive': output_file}

                # If there is an etag we can use we can check that hasn't changed
                elif 'Etag' in headers:
                    if 'download_etag' not in status or status[
                            'download_etag'] != headers['Etag']:
                        status = update_status(
                            status_file, {'download_etag': headers['Etag']})
                    else:
                        cprint(indent(
                            'URL {} not modified... Skipping...'.format(
                                filename), 8),
                               'yellow',
                               attrs=['bold'])

                        url_status = update_status(url_status_file, {
                            'downloaded': True,
                            'archive': output_file
                        })
                        return {'archive': output_file}

            cprint(indent('Downloading {}'.format(filename), 8),
                   'green',
                   attrs=['bold'])

            # Total size in bytes.
            total_size = int(headers.get('content-length', 0))

            # Get the file
            with open(output_file, 'wb') as f:
                with tqdm(total=total_size, unit='B',
                          unit_scale=True) as progress:
                    for data in req.iter_content(32 * 1024):
                        f.write(data)
                        progress.update(len(data))

            url_status = update_status(url_status_file, {
                'downloaded': True,
                'archive': output_file
            })

            # Return our updates to state
            return {'archive': output_file}

        else:
            cprint(indent(
                'URL {} not modified... Skipping...'.format(
                    os.path.basename(url_status['archive'])), 8),
                   'yellow',
                   attrs=['bold'])
            return {'archive': url_status['archive']}
Example no. 24
def download(id: int, file: int = typer.Argument(None)):
    """Download a dataset by passing in the dataset id and optionally the file number. """
    datasets = get_datasets(API_URL)

    # if dataset id exists
    if id in dataset_ids(datasets):
        # loop through each dataset
        for row in datasets:
            # find the dataset with given id
            if int(row["id"]) == id:

                # make directory in the format of id_DatasetName
                id = row["id"]
                name = row["Name"]
                dest_folder = f"{id}_{name.replace(' ','')}"
                if not os.path.exists(dest_folder):
                    os.makedirs(dest_folder)

                # if optional argument is passed to download specified file
                if not (file is None):
                    # making sure that the file passed exists for the specified dataset
                    file_index = (
                        file - 1
                    )  # converting user input into proper python indexing
                    if file_index >= len(row["Datasets"]) or file_index < 0:
                        typer.echo(
                            "The dataset you selected does not have that file."
                        )
                    else:
                        # getting download URL
                        url = row["Datasets"][file_index]["URL"]
                        r = requests.get(url,
                                         allow_redirects=True,
                                         stream=True)

                        # getting individual file's name
                        filename = rfc6266.parse_requests_response(
                            r).filename_unsafe
                        file_path = os.path.join(dest_folder, filename)

                        typer.echo("Downloading file now!")
                        # loading bar code from stack overflow
                        # https://stackoverflow.com/questions/37573483/progress-bar-while-download-file-over-http-with-requests/37573701#37573701
                        total_size_in_bytes = int(
                            r.headers.get("content-length", 0))
                        block_size = 1024  # 1 Kibibyte
                        progress_bar = tqdm(total=total_size_in_bytes,
                                            unit="iB",
                                            unit_scale=True)

                        # saving file
                        with open(file_path, "wb") as fid:
                            for data in r.iter_content(block_size):
                                progress_bar.update(len(data))
                                fid.write(data)
                        progress_bar.close()
                # if no file is specified, download all files
                else:
                    urls = [data["URL"] for data in row["Datasets"]]
                    typer.echo("Downloading files now!")
                    for i, url in enumerate(urls):
                        r = requests.get(url,
                                         allow_redirects=True,
                                         stream=True)
                        filename = rfc6266.parse_requests_response(
                            r).filename_unsafe
                        file_path = os.path.join(dest_folder, filename)
                        total_size_in_bytes = int(
                            r.headers.get("content-length", 0))
                        block_size = 1024  # 1 Kibibyte
                        progress_bar = tqdm(total=total_size_in_bytes,
                                            unit="iB",
                                            unit_scale=True)
                        with open(file_path, "wb") as fid:
                            for data in r.iter_content(block_size):
                                progress_bar.update(len(data))
                                fid.write(data)
                        progress_bar.close()
    # if dataset id doesn't exist
    else:
        typer.echo(
            "That dataset doesn't exist or you've made a typo in the id.")
        typer.echo(
            "Use the 'see all datasets' command to view the available datasets."
        )
Example no. 25
def test_requests(httpserver):
    requests = pytest.importorskip('requests')
    httpserver.serve_content('eep', headers={
        'Content-Disposition': 'attachment; filename="a b="'})
    resp = requests.get(httpserver.url)
    assert parse_requests_response(resp).filename_unsafe == 'a b='