def fetch(self, url, pairtree=False, meta='short'):
    '''
    Fetch ``url`` and store the response body as a file below ``self._path``.

    The file location is derived from the sha256 checksum of the URL string
    (``utils.sha256str(url)``):

    * ``pairtree=False`` -- the checksum itself is the file name, with no
      subdirectories.
    * ``pairtree=True`` -- the checksum is passed through ``utils.pairtree``
      and the result is used as the path relative to ``self._path``.

    ``meta`` selects how much metadata is returned:

    * ``'short'`` -- ``url``, ``url.sha256``, ``status.code`` and, for a
      200 response, ``file.path``.
    * ``'detail'`` (``'details'`` is accepted as an alias) -- additionally
      ``url.pairtree`` and, for a 200 response, ``file.sha256`` and
      ``file.pairtree``.

    The body is written to disk only when the status code is 200; for any
    other status the result carries no ``file.*`` keys.

    Returns a dict with the keys described above. Exceptions raised by
    ``requests.get`` or by the file write are not caught here.
    '''
    # Hash the URL once; it names the file and is reported back to the caller.
    sha256url = utils.sha256str(url)
    relative_path = utils.pairtree(sha256url) if pairtree else sha256url
    filepath = os.path.join(self._path, relative_path)

    # The original docstring advertised 'details' while the code tested for
    # 'detail'; accept both so neither spelling silently degrades to 'short'.
    detailed = meta in ('detail', 'details')

    result = {
        'url': url,
        'url.sha256': sha256url,
        'status.code': None,
    }

    response = requests.get(url=url,
                            headers={'User-Agent': self._user_agent},
                            stream=True)
    try:
        if response.status_code == 200:
            fo = FileObject(filepath)
            # NOTE(review): response.raw is the undecoded stream, so a
            # gzip/deflate Content-Encoding is presumably stored as-is --
            # confirm that this is intended.
            fo.write(response.raw)
            filemeta = fo.meta
            result['file.path'] = filepath
            if detailed:
                result['file.sha256'] = filemeta['content.sha256']
                result['file.pairtree'] = filemeta['content.pairtree']
        if detailed:
            result['url.pairtree'] = utils.pairtree(sha256url)
        result['status.code'] = response.status_code
    finally:
        # With stream=True the connection stays checked out until the body
        # is fully consumed or the response is closed; always release it,
        # including for non-200 responses whose body is never read.
        response.close()
    return result
def fetch(ctx, **opts):
    """ fetch operations """
    # The docstring above is left untouched because a CLI framework may emit
    # it verbatim as help text; the documentation lives in comments instead.
    #
    # Options read from ``opts``:
    #   url        -- URL(s) given directly
    #   urls       -- readable file object with one URL per line
    #   index      -- optional file object holding one JSON record per line
    #                 from earlier runs; new records are written to it
    #   path, user_agent     -- passed to fetch.Fetcher
    #   pairtree, meta       -- passed to Fetcher.fetch
    #
    # Every fetch result is printed to stdout as one JSON document per line.
    if not opts['url'] and not opts['urls']:
        show_help(ctx)
        # Nothing to do; without this return ``urls`` below would be unbound
        # if show_help() hands control back instead of exiting.
        return

    import fetch

    # Records of successful (status 200) earlier fetches, keyed by the
    # sha256 of the URL, used to skip URLs that were already retrieved.
    metadata_index = {}
    if opts['index']:
        for line in opts['index'].readlines():
            # Parse line by line: a single blank or malformed line must not
            # discard every record that follows it.
            try:
                rec = json.loads(line)
            except ValueError:
                continue
            if not isinstance(rec, dict) or rec.get('status.code') != 200:
                continue
            sha256url = rec.pop('url.sha256', None)
            if sha256url is not None:
                metadata_index[sha256url] = rec

    if opts['url']:
        urls = opts['url']
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str
        # A bare string would otherwise be iterated character by character.
        if isinstance(urls, string_types):
            urls = [urls]
    else:
        urls = opts['urls'].readlines()

    # Drop empty lines and URLs already present in the index. Membership is
    # tested on the dict itself rather than on a freshly built key list.
    stripped = (u.strip() for u in urls)
    urls = [u for u in stripped
            if u and utils.sha256str(u) not in metadata_index]

    fetcher = fetch.Fetcher(opts['path'], user_agent=opts['user_agent'])
    for url in urls:
        res = fetcher.fetch(url, pairtree=opts['pairtree'], meta=opts['meta'])
        json_res = json.dumps(res)
        # Parenthesised form prints the same on Python 2 and is valid on 3.
        print(json_res)
        if opts['index']:
            opts['index'].write("%s\n" % json_res)
        # Keep the in-memory index in step with what was just fetched; it is
        # not consulted again within this function.
        sha256url = res.pop('url.sha256')
        metadata_index[sha256url] = res