Ejemplo n.º 1
0
    def fetch(self, url, pairtree=False, meta='short'):
        ''' Fetch url and store the response body as a file below self._path.

        pairtree = False, the file path is utils.sha256str(url) joined onto
        self._path
        pairtree = True, the file path is
        utils.pairtree(utils.sha256str(url)) joined onto self._path
        (presumably a nested directory layout -- see utils.pairtree)

        meta = 'short', returns url, url.sha256, status.code and, when the
        response status is 200, file.path
        meta = 'detail' (the spelling 'details', which this docstring has
        always advertised, is accepted as well), additionally returns
        url.pairtree and, when the response status is 200, file.sha256 and
        file.pairtree

        The body is written only when the status code is exactly 200; for
        any other status the result carries no file.* keys.  Exceptions
        raised by requests.get are not caught here.
        '''
        # Hash the URL once; it is used for the path and for the result.
        sha256url = utils.sha256str(url)
        relpath = utils.pairtree(sha256url) if pairtree else sha256url
        filepath = os.path.join(self._path, relpath)

        # The code used to match only 'detail' while the docstring said
        # 'details'; accept both so documented callers get what they ask for.
        detailed = meta in ('detail', 'details')

        result = {'url': url, 'url.sha256': sha256url, 'status.code': None}

        response = requests.get(url=url,
                                headers={'User-Agent': self._user_agent},
                                stream=True)
        try:
            if response.status_code == 200:
                fo = FileObject(filepath)
                fo.write(response.raw)

                filemeta = fo.meta
                result['file.path'] = filepath
                if detailed:
                    result['file.sha256'] = filemeta['content.sha256']
                    result['file.pairtree'] = filemeta['content.pairtree']

            if detailed:
                result['url.pairtree'] = utils.pairtree(sha256url)
            result['status.code'] = response.status_code
        finally:
            # With stream=True the connection stays checked out until the
            # body is consumed or the response is closed; always release it.
            response.close()

        return result
Ejemplo n.º 2
0
def fetch(ctx, **opts):
    """Fetch the requested URLs and print one JSON metadata record per URL.

    Options read from ``opts``:

    * ``url`` / ``urls`` -- the URLs to fetch: ``url`` is an iterable of
      strings, ``urls`` a readable file object with one URL per line.
      ``url`` wins when both are given; when neither is given the help
      text is shown and nothing is fetched.
    * ``index`` -- optional file object holding one JSON record per line.
      Records with ``status.code == 200`` mark URLs that are skipped; every
      new result is appended to it.
    * ``path``, ``user_agent`` -- passed to ``fetch.Fetcher``.
    * ``pairtree``, ``meta`` -- passed to ``Fetcher.fetch``.

    A URL is skipped when it is blank, when the index already lists it as
    successfully fetched, or when it was already processed earlier in the
    same run.
    """
    if not opts['url'] and not opts['urls']:
        show_help(ctx)
        # If show_help returns instead of exiting, there is nothing to do;
        # falling through would hit an unbound ``urls`` below.
        return

    import fetch

    metadata_index = dict()
    if opts['index']:
        for line in opts['index'].readlines():
            # Handle each line on its own so one malformed or incomplete
            # record does not discard every record that follows it.
            try:
                rec = json.loads(line)
            except ValueError:
                continue
            if not isinstance(rec, dict) or rec.get('status.code') != 200:
                continue
            sha256url = rec.pop('url.sha256', None)
            if sha256url is not None:
                metadata_index[sha256url] = rec

    if opts['url']:
        urls = opts['url']
    else:
        urls = opts['urls'].readlines()

    fetcher = fetch.Fetcher(opts['path'], user_agent=opts['user_agent'])
    for url in urls:
        url = url.strip()
        # Membership is tested on the dict itself (hash lookup) and inside
        # the loop, so URLs repeated within this run are fetched only once.
        if not url or utils.sha256str(url) in metadata_index:
            continue

        res = fetcher.fetch(url, pairtree=opts['pairtree'], meta=opts['meta'])
        json_res = json.dumps(res)
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(json_res)
        if opts['index']:
            opts['index'].write("%s\n" % json_res)
        sha256url = res.pop('url.sha256')
        metadata_index[sha256url] = res