def get_links(content, session):
    """Collect attachment links from an HTML fragment, mirroring each file.

    Every ``<a href=...>`` in ``content`` is examined. Site-relative hrefs
    (starting with ``/``) are prefixed with ``SITE_URL``; anything that does
    not end up starting with ``http`` is ignored, as are links whose last
    path segment is empty or ends with ``.html`` / ``.aspx``.

    For each remaining link the file is stored in object storage under
    ``government_decisions/<filename>`` (downloaded through ``session`` only
    when the object does not already exist), and the returned href points at
    the stored copy.

    Args:
        content: HTML text to scan.
        session: object exposing ``get(url)`` that returns a response with
            ``status_code`` and ``content`` (presumably a
            ``requests.Session`` -- confirm against the caller).

    Returns:
        A list of ``{'href': <object storage URL>, 'title': <anchor text>}``
        dicts, one per distinct accepted source URL, in document order.
    """
    # Function-scope import: only this block needs it, and the module-level
    # imports are not visible from here.
    import logging

    links = []
    if '<a' not in content:
        # Cheap pre-check that avoids parsing anchor-free content.
        return links

    # Source URLs already turned into a link. The previous code tested
    # ``href in links``, but ``links`` holds dicts, so that test could never
    # match and duplicate anchors were emitted (and re-processed) every time.
    seen = set()

    for link in pq(content)('a'):
        if 'href' not in link.attrib:
            continue
        href = link.attrib['href']
        if href.startswith('/'):
            href = SITE_URL + href
        if not href.startswith('http'):
            continue
        if href in seen:
            continue

        filename = href.rpartition('/')[2]
        if not filename or filename.endswith(('.html', '.aspx')):
            continue

        source_href = href
        s3_object_name = 'government_decisions/' + filename
        if object_storage.exists(s3_object_name):
            href = object_storage.urlfor(s3_object_name)
        else:
            try:
                conn = session.get(source_href)
                if conn.status_code != requests.codes.ok:
                    continue
                href = object_storage.write(s3_object_name,
                                            data=conn.content,
                                            public_bucket=True,
                                            create_bucket=True)
            except Exception:
                # Best effort: one failing attachment must not abort the
                # whole page, but leave a trace instead of hiding it.
                logging.exception('Failed to mirror %s', source_href)
                continue

        seen.add(source_href)
        links.append(dict(href=href, title=pq(link).text()))
    return links
def process_row(row, *_):
    """Fetch the page at ``row['url']`` and store it in object storage.

    The response body is written under ``row['s3_object_name']`` as
    ``text/html`` with the charset reported by ``get_charset``.

    Args:
        row: mapping with at least the keys ``'url'`` and
            ``'s3_object_name'``.
        *_: extra positional arguments, ignored.

    Returns:
        ``row`` unchanged when the page was fetched and stored, or ``None``
        when the response status is not OK.
    """
    s3_object_name = row['s3_object_name']
    url = row['url']
    conn = session.get(url)
    # Pause after every request, successful or not (presumably to throttle
    # requests against the remote site -- confirm).
    time.sleep(3)
    if conn.status_code != requests.codes.ok:
        return None
    charset = get_charset(conn)
    # The response's text codec lives in ``encoding``; the previous
    # ``conn.encode = charset`` only created an unused attribute. The bytes
    # stored below come from ``conn.content`` and are not affected.
    conn.encoding = charset
    object_storage.write(s3_object_name,
                         data=conn.content,
                         public_bucket=True,
                         create_bucket=True,
                         content_type="text/html; charset={}".format(charset))
    return row
def write_to_object_storage(self, object_name, data):
    """Return the URL of ``object_name``, uploading ``data`` if it is missing.

    Args:
        object_name: key of the object in object storage.
        data: payload written when the object does not exist yet.

    Returns:
        Whatever ``object_storage.write`` returns for a fresh upload, or
        ``object_storage.urlfor(object_name)`` when the object already exists.
    """
    # NOTE(review): this trace message is emitted at ERROR level -- confirm
    # that is intentional before lowering it.
    logging.error('write_to_object_storage %s', object_name)
    if object_storage.exists(object_name):
        # Already stored: do not upload again, just resolve its URL.
        return object_storage.urlfor(object_name)
    return object_storage.write(object_name,
                                data=data,
                                public_bucket=True,
                                create_bucket=True)
obj_name = os.path.join('spending-reports', obj_name) if not object_storage.exists(obj_name): tmp = tempfile.NamedTemporaryFile() try: stream = requests.get(url_to_use, stream=True, verify=False).raw except: logging.exception('Failed to load data from %s', url_to_use) stream.read = functools.partial(stream.read, decode_content=True) shutil.copyfileobj(stream, tmp) tmp.flush() url_to_use = object_storage.write(obj_name, file_name=tmp.name, create_bucket=False) tmp.close() del tmp else: url_to_use = object_storage.urlfor(obj_name) report['report-sheets'] = 0 report['report-headers-row'] = None report['report-rows'] = None report['report-bad-rows'] = None report['load-error'] = None with tempfile.NamedTemporaryFile( suffix=os.path.splitext(url_to_use)[1]) as tmp: if url_to_use.startswith('http'):