def get_murs(from_mur_no):
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    if from_mur_no is None:
        start_mur_serial = 0
    else:
        start_mur_serial = int(MUR_NO_REGEX.match(from_mur_no).group('serial'))
    with db.engine.connect() as conn:
        rs = conn.execute(ALL_MURS, start_mur_serial)
        for row in rs:
            case_id = row['case_id']
            sort1, sort2 = get_sort_fields(row['case_no'])
            mur = {
                'doc_id': 'mur_%s' % row['case_no'],
                'no': row['case_no'],
                'name': row['name'],
                'mur_type': 'current',
                'sort1': sort1,
                'sort2': sort2,
            }
            mur['subjects'] = get_subjects(case_id)
            mur['election_cycles'] = get_election_cycles(case_id)
            participants = get_participants(case_id)
            mur['participants'] = list(participants.values())
            mur['respondents'] = get_sorted_respondents(mur['participants'])
            mur['commission_votes'] = get_commission_votes(case_id)
            mur['dispositions'] = get_dispositions(case_id)
            mur['documents'] = get_documents(case_id, bucket, bucket_name)
            mur['open_date'], mur['close_date'] = get_open_and_close_dates(case_id)
            mur['url'] = '/legal/matter-under-review/%s/' % row['case_no']
            yield mur
def add_caching_headers(response):
    max_age = env.get_credential('FEC_CACHE_AGE')
    cache_all_requests = env.get_credential('CACHE_ALL_REQUESTS', False)
    status_code = response.status_code
    if max_age is not None:
        response.headers.add('Cache-Control', 'public, max-age={}'.format(max_age))
    if cache_all_requests and status_code == 200:
        try:
            # convert the results to JSON
            json_data = utils.get_json_data(response)
            # format the URL by removing the api_key and special characters
            formatted_url = utils.format_url(request.url)
            # get the S3 bucket from the environment
            s3_bucket = utils.get_bucket()
            cached_url = "s3://{0}/cached-calls/{1}.json".format(s3_bucket.name, formatted_url)
            s3_key = utils.get_s3_key(cached_url)
            # upload the cached response to the S3 bucket
            with smart_open(s3_key, 'wb') as cached_file:
                cached_file.write(json_data)
            logger.info('The following request has been cached and uploaded successfully: %s', cached_url)
        except Exception:
            logger.error('Cache upload failed')
    return response
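# Hedged usage sketch (assumption, not from the source module): add_caching_headers
# takes a response and returns it, which is the shape of a Flask after-request hook,
# so it would typically be registered on the application object. `app` here is a
# hypothetical Flask app.
def register_caching_hook(app):
    # Run add_caching_headers on every outgoing response.
    app.after_request(add_caching_headers)
    return app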
def load_advisory_opinions_into_s3():
    if legal_loaded():
        docs_in_db = set([str(r[0]) for r in db.engine.execute(
            "select document_id from document").fetchall()])
        bucket = get_bucket()
        docs_in_s3 = set([re.match(r"legal/aos/([0-9]+)\.pdf", obj.key).group(1)
                          for obj in bucket.objects.filter(Prefix="legal/aos")])
        new_docs = docs_in_db.difference(docs_in_s3)
        if new_docs:
            query = "select document_id, fileimage from document \
                     where document_id in (%s)" % ','.join(new_docs)
            result = db.engine.connect().execution_options(stream_results=True)\
                .execute(query)
            bucket_name = env.get_credential('bucket')
            for i, (document_id, fileimage) in enumerate(result):
                key = "legal/aos/%s.pdf" % document_id
                bucket.put_object(Key=key, Body=bytes(fileimage),
                                  ContentType='application/pdf', ACL='public-read')
                url = "https://%s.s3.amazonaws.com/%s" % (bucket_name, key)
                print("pdf written to %s" % url)
                print("%d of %d advisory opinions written to s3" % (i + 1, len(new_docs)))
        else:
            print("No new advisory opinions found.")
def get_single_mur(mur_no):
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    with db.engine.connect() as conn:
        rs = conn.execute(SINGLE_MUR, mur_no)
        row = rs.fetchone()
        case_id = row['case_id']
        sort1, sort2 = get_sort_fields(row['case_no'])
        mur = {
            'doc_id': 'mur_%s' % row['case_no'],
            'no': row['case_no'],
            'name': row['name'],
            'mur_type': 'current',
            'sort1': sort1,
            'sort2': sort2,
        }
        mur['subjects'] = get_subjects(case_id)
        mur['election_cycles'] = get_election_cycles(case_id)
        participants = get_participants(case_id)
        mur['participants'] = list(participants.values())
        mur['respondents'] = get_sorted_respondents(mur['participants'])
        mur['commission_votes'] = get_commission_votes(case_id)
        mur['dispositions'] = get_dispositions(case_id)
        mur['documents'] = get_documents(case_id, bucket, bucket_name)
        mur['open_date'], mur['close_date'] = get_open_and_close_dates(case_id)
        mur['url'] = '/legal/matter-under-review/%s/' % row['case_no']
        return mur
def delete_murs_from_s3():
    """
    Deletes all MUR documents from S3.
    """
    bucket = get_bucket()
    for obj in bucket.objects.filter(Prefix="legal/murs"):
        obj.delete()
def load_current_murs():
    es = get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    with db.engine.connect() as conn:
        rs = conn.execute(ALL_MURS)
        for row in rs:
            case_id = row['case_id']
            mur = {
                'doc_id': 'mur_%s' % row['case_no'],
                'no': row['case_no'],
                'name': row['name'],
                'mur_type': 'current',
            }
            mur['subject'] = {"text": get_subjects(case_id)}
            participants = get_participants(case_id)
            mur['participants'] = list(participants.values())
            mur['disposition'] = get_disposition(case_id)
            mur['text'], mur['documents'] = get_documents(case_id, bucket, bucket_name)
            mur['open_date'], mur['close_date'] = get_open_and_close_dates(case_id)
            mur['url'] = '/legal/matter-under-review/%s/' % row['case_no']
            es.index('docs', 'murs', mur, id=mur['doc_id'])
def clear_bucket():
    permanent_dir = (
        'legal',
        'bulk-downloads',
    )
    for obj in task_utils.get_bucket().objects.all():
        if not obj.key.startswith(permanent_dir):
            obj.delete()
def process_mur(mur):
    logger.info("processing mur %d of %d" % (mur[0], mur[1]))
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td) \
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("processing mur %s" % mur_no)
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    if [k for k in bucket.objects.filter(Prefix=pdf_key)]:
        logger.info('already processed %s' % pdf_key)
        return
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = generate_aws_s3_url(bucket_name, pdf_key)
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
def delete_cached_calls_from_s3():
    """
    Deletes all files and folders under the cached-calls folder from S3.
    """
    bucket = utils.get_bucket()
    for obj in bucket.objects.filter(Prefix='cached-calls/'):
        obj.delete()
    slack_message = 'Successfully deleted the contents of the `cached-calls` folder in {0} from S3'.format(
        env.space)
    web_utils.post_to_slack(slack_message, '#bots')
    logger.info(slack_message)
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td) \
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1, mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index(DOCS_INDEX, 'murs', doc, id=doc['doc_id'])
def load_archived_murs():
    table_text = requests.get('http://www.fec.gov/MUR/MURData.do').text
    rows = re.findall("<tr [^>]*>(.*?)</tr>", table_text, re.S)[1:]
    bucket = get_bucket()
    murs_completed = set([re.match("legal/murs/([0-9_A-Z]+).pdf", o.key).group(1)
                          for o in bucket.objects.filter(Prefix="legal/murs")
                          if re.match("legal/murs/([0-9_A-Z]+).pdf", o.key)])
    rows = [r for r in rows
            if re.search(r'/disclosure_data/mur/([0-9_A-Z]+)\.pdf', r, re.M).group(1)
            not in murs_completed]
    shuffle(rows)
    murs = zip(range(len(rows)), [len(rows)] * len(rows), rows)
    with Pool(processes=1, maxtasksperchild=1) as pool:
        pool.map(process_mur, murs, chunksize=1)
def process_murs(raw_mur_tr_element_list):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    for index, raw_mur_tr_element in enumerate(raw_mur_tr_element_list):
        (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td) \
            = re.findall("<td[^>]*>(.*?)</td>", raw_mur_tr_element, re.S)
        mur_no = re.search(r"/disclosure_data/mur/([0-9]+)(?:_[A-H])*\.pdf", mur_no_td).group(1)
        logger.info("Loading archived MUR %s: %s of %s",
                    mur_no, index + 1, len(raw_mur_tr_element_list))
        open_date, close_date = (None, None)
        if open_date_td:
            open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
        if close_date_td:
            close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
        parties = re.findall("(.*?)<br>", parties_td)
        complainants = []
        respondents = []
        for party in parties:
            match = re.match(r"\(([RC])\) - (.*)", party)
            name = match.group(2).strip().title()
            if match.group(1) == 'C':
                complainants.append(name)
            if match.group(1) == 'R':
                respondents.append(name)
        mur_name = mur_names.get(mur_no, '')
        mur = {
            'doc_id': 'mur_%s' % mur_no,
            'no': mur_no,
            'name': mur_name,
            'mur_type': 'archived',
            'open_date': open_date,
            'close_date': close_date,
            'complainants': complainants,
            'respondents': respondents,
            'url': '/legal/matter-under-review/{0}/'.format(mur_no)
        }
        mur['subject'] = get_subject_tree(subject_td)
        mur['citations'] = get_citations(re.findall("(.*?)<br>", citations_td))
        mur['documents'] = get_documents(mur_no_td, bucket)
        es.index('archived_murs_index', 'murs', mur, id=mur['doc_id'])
def process_mur(mur):
    es = utils.get_elasticsearch_connection()
    bucket = get_bucket()
    mur_names = get_mur_names()
    (mur_no_td, open_date_td, close_date_td, parties_td, subject_td, citations_td) \
        = re.findall("<td[^>]*>(.*?)</td>", mur[2], re.S)
    mur_no = re.search(r"/disclosure_data/mur/([0-9_A-Z]+)\.pdf", mur_no_td).group(1)
    logger.info("Loading archived MUR %s: %s of %s", mur_no, mur[0] + 1, mur[1])
    pdf_key = 'legal/murs/%s.pdf' % mur_no
    text, pdf_size, pdf_pages = process_mur_pdf(mur_no, pdf_key, bucket)
    pdf_url = '/files/' + pdf_key
    open_date, close_date = (None, None)
    if open_date_td:
        open_date = datetime.strptime(open_date_td, '%m/%d/%Y').isoformat()
    if close_date_td:
        close_date = datetime.strptime(close_date_td, '%m/%d/%Y').isoformat()
    parties = re.findall("(.*?)<br>", parties_td)
    complainants = []
    respondents = []
    for party in parties:
        match = re.match(r"\(([RC])\) - (.*)", party)
        name = match.group(2).strip().title()
        if match.group(1) == 'C':
            complainants.append(name)
        if match.group(1) == 'R':
            respondents.append(name)
    subject = get_subject_tree(subject_td)
    citations = get_citations(re.findall("(.*?)<br>", citations_td))
    mur_digits = re.match("([0-9]+)", mur_no).group(1)
    name = mur_names[mur_digits] if mur_digits in mur_names else ''
    doc = {
        'doc_id': 'mur_%s' % mur_no,
        'no': mur_no,
        'name': name,
        'text': text,
        'mur_type': 'archived',
        'pdf_size': pdf_size,
        'pdf_pages': pdf_pages,
        'open_date': open_date,
        'close_date': close_date,
        'complainants': complainants,
        'respondents': respondents,
        'subject': subject,
        'citations': citations,
        'url': pdf_url
    }
    es.index('archived_murs_index', 'murs', doc, id=doc['doc_id'])
def get_advisory_opinions(from_ao_no):
    bucket = get_bucket()
    ao_names = get_ao_names()
    ao_no_to_component_map = {a: tuple(map(int, a.split('-'))) for a in ao_names}
    citations = get_citations(ao_names)
    if from_ao_no is None:
        start_ao_year, start_ao_serial = 0, 0
    else:
        start_ao_year, start_ao_serial = tuple(map(int, from_ao_no.split('-')))
    with db.engine.connect() as conn:
        rs = conn.execute(ALL_AOS, (start_ao_year, start_ao_serial, start_ao_year))
        for row in rs:
            ao_id = row["ao_id"]
            year, serial = ao_no_to_component_map[row["ao_no"]]
            ao = {
                "no": row["ao_no"],
                "name": row["name"],
                "summary": row["summary"],
                "request_date": row["req_date"],
                "issue_date": row["issue_date"],
                "is_pending": ao_stage_to_pending(row["stage"]),
                "status": ao_stage_to_status(row["ao_no"], row["stage"]),
                "ao_citations": citations[row["ao_no"]]["ao"],
                "aos_cited_by": citations[row["ao_no"]]["aos_cited_by"],
                "statutory_citations": citations[row["ao_no"]]["statutes"],
                "regulatory_citations": citations[row["ao_no"]]["regulations"],
                "sort1": -year,
                "sort2": -serial,
            }
            ao["documents"] = get_documents(ao_id, bucket)
            (
                ao["requestor_names"],
                ao["requestor_types"],
                ao["commenter_names"],
                ao["representative_names"],
                ao["entities"],
            ) = get_entities(ao_id)
            yield ao
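# Hedged usage sketch (assumption, not from the source): get_advisory_opinions is a
# generator, so a caller presumably iterates it and indexes each AO, mirroring the MUR
# loaders in this module. The index and doc_type names below are illustrative only.
def index_advisory_opinions(from_ao_no=None):
    es = get_elasticsearch_connection()
    for ao in get_advisory_opinions(from_ao_no):
        # Use the AO number as the Elasticsearch document id.
        es.index('docs', 'advisory_opinions', ao, id=ao['no'])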
def cache_all_requests(json_str, formatted_url):
    try:
        cached_url = 'cached-calls/{}'.format(formatted_url)
        bucket = utils.get_bucket()
        bucket.put_object(Key=cached_url,
                          Body=json_str,
                          ContentType='application/json',
                          Expires=get_cache_expiration())
        logger.info(
            'The following request has been uploaded to S3 successfully: {}'.format(cached_url))
    except Exception as e:
        logger.error(
            'An exception occurred while uploading the cached request to S3: {}'.format(e))
def get_single_case(case_type, case_no):
    bucket = get_bucket()
    bucket_name = env.get_credential('bucket')
    with db.engine.connect() as conn:
        rs = conn.execute(SINGLE_CASE, case_type, case_no)
        row = rs.first()
        if row is not None:
            case_id = row['case_id']
            sort1, sort2 = get_sort_fields(row['case_no'])
            case = {
                'doc_id': '{0}_{1}'.format(case_type.lower(), row['case_no']),
                'no': row['case_no'],
                'name': row['name'],
                'published_flg': row['published_flg'],
                'sort1': sort1,
                'sort2': sort2,
            }
            case['commission_votes'] = get_commission_votes(case_type, case_id)
            case['documents'] = get_documents(case_id, bucket, bucket_name)
            case['url'] = '/legal/{0}/{1}/'.format(get_full_name(case_type), row['case_no'])
            if case_type == 'AF':
                case = extend(case, get_af_specific_fields(case_id))
                return case
            if case_type == 'MUR':
                case['mur_type'] = 'current'
            case['subjects'] = get_subjects(case_id)
            case['election_cycles'] = get_election_cycles(case_id)
            participants = get_participants(case_id)
            case['participants'] = list(participants.values())
            case['respondents'] = get_sorted_respondents(case['participants'])
            case['dispositions'] = get_dispositions(case_id)
            case['open_date'], case['close_date'] = get_open_and_close_dates(case_id)
            return case
        else:
            # The else branch was empty in the source; assumed minimal handling:
            # no matching case was found, so log it and return nothing.
            logger.info('Case %s %s not found', case_type, case_no)
            return None
def load_archived_murs():
    """
    Reads data for archived MURs from http://www.fec.gov/MUR, assembles a JSON
    document corresponding to each MUR, and indexes it in Elasticsearch in the
    index `docs_index` with a doc_type of `murs`. In addition, the MUR document
    is uploaded to an S3 bucket under the _directory_ `legal/murs/`.
    """
    table_text = requests.get('http://www.fec.gov/MUR/MURData.do').text
    rows = re.findall("<tr [^>]*>(.*?)</tr>", table_text, re.S)[1:]
    bucket = get_bucket()
    murs_completed = set([
        re.match("legal/murs/([0-9_A-Z]+).pdf", o.key).group(1)
        for o in bucket.objects.filter(Prefix="legal/murs")
        if re.match("legal/murs/([0-9_A-Z]+).pdf", o.key)
    ])
    rows = [
        r for r in rows
        if re.search(r'/disclosure_data/mur/([0-9_A-Z]+)\.pdf', r, re.M).group(1)
        not in murs_completed
    ]
    shuffle(rows)
    murs = zip(range(len(rows)), [len(rows)] * len(rows), rows)
    with Pool(processes=1, maxtasksperchild=1) as pool:
        pool.map(process_mur, murs, chunksize=1)
def handle_exception(exception):
    wrapped = ResponseException(str(exception), ErrorCode.INTERNAL_ERROR, type(exception))
    logger.info(
        'An API error occurred with the status code of {status} ({exception}).'.format(
            status=wrapped.status, exception=wrapped.wrappedException))
    if is_retrievable_from_cache(wrapped.status, request.path):
        logger.info('Attempting to retrieve the cached request from S3...')
        # Retrieve the information needed to construct a URL for the S3 bucket
        # where the cached API responses live.
        formatted_url = utils.format_url(request.url)
        s3_bucket = utils.get_bucket()
        bucket_region = env.get_credential('region')
        cached_url = "http://s3-{0}.amazonaws.com/{1}/cached-calls/{2}".format(
            bucket_region, s3_bucket.name, formatted_url)
        # Attempt to retrieve the cached data from S3.
        cached_data = utils.get_cached_request(cached_url)
        # If the cached data was returned, we can return that to the client.
        # Otherwise, log the error and raise an API error.
        if cached_data is not None:
            logger.info('Successfully retrieved cached request from S3.')
            return cached_data
        else:
            logger.error('An error occurred while retrieving the cached file from S3.')
            raise exceptions.ApiError(
                'The requested URL {} could not be found.'.format(request.url),
                status_code=http.client.NOT_FOUND)
    else:
        raise exceptions.ApiError(
            'The requested URL {} could not be found.'.format(request.url),
            status_code=http.client.NOT_FOUND)
def delete_advisory_opinions_from_s3():
    for obj in get_bucket().objects.filter(Prefix="legal/aos"):
        obj.delete()
def upload_s3(key, body):
    task_utils.get_bucket().put_object(Key=key, Body=body)
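# Hedged usage sketch (assumption): upload_s3 is a thin wrapper around Bucket.put_object,
# so callers pass an S3 object key and a bytes or str body. The key below is illustrative;
# 'bulk-downloads' is one of the permanent prefixes referenced by clear_bucket.
def upload_example_report():
    upload_s3('bulk-downloads/example/report.csv', b'col_a,col_b\n1,2\n')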
def clear_bucket():
    for obj in task_utils.get_bucket().objects.all():
        if not obj.key.startswith('legal'):
            obj.delete()
def delete_murs_from_s3():
    bucket = get_bucket()
    for obj in bucket.objects.filter(Prefix="legal/murs"):
        obj.delete()
def clear_bucket():
    for key in task_utils.get_bucket().objects.all():
        key.delete()