def _v3_make_view(format):
    """Build a View for one RDGv3 'fileFormats' URL.

    The view's type is parsed out of the URL by V3_FORMAT_PARSER and its
    object_id is the crockford hash of the URL itself.
    """
    parsed = V3_FORMAT_PARSER.match(format).groupdict()
    return View(
        object_id=crockford_hash(format),
        url=format,
        type=parsed['type'],
    )
def parse_sirt_docket(docket_record):
    """Scrape a CFTC SIRT docket page into a docket dict with a 'comments' list.

    Loads the docket URL, then re-posts the page's ASP.NET form with the
    "show all" checkbox ticked so the full (unpaginated) listing comes back.
    Raises ExtractionFailed when the page is an error page.
    """
    # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything...
    # which is arduous and stupid because it's a yucky ASP app.
    # A cookie jar is required so the ASP.NET session survives between the
    # initial GET and the postback below.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    initial = pq(opener.open(docket_record['url']).read())
    # The site signals a bad URL with an apology in an <h4>, not an HTTP error.
    error_header = initial("h4")
    if len(error_header) and "sorry" in error_header.text().lower():
        raise ExtractionFailed("This URL doesn't work.")
    # Simulate the browser postback: echo __VIEWSTATE/__EVENTVALIDATION from
    # the first response and target the "show all" checkbox control.
    formdata = urllib.urlencode(
        (('__EVENTTARGET', 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'),
         ('__EVENTARGUMENT', ''),
         ('__LASTFOCUS', ''),
         ('__VIEWSTATE', initial('#__VIEWSTATE').val()),
         ('__EVENTVALIDATION', initial('#__EVENTVALIDATION').val()),
         ('ctl00$masterScriptManager', ''),
         ('ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox', 'on')))
    page = pq(opener.open(docket_record['url'], data=formdata).read())
    docket = dict(docket_record)
    # The h5.QueryTitle block is "Label: value" pairs separated by <br> tags;
    # split them into a details dict.
    details = dict([
        re.split(r"\s*:\s*", row.strip())
        for row in re.split(r"<br ?/?>", page('h5.QueryTitle').html())
        if row.strip()
    ])
    if 'details' not in docket:
        docket['details'] = {}
    if 'Filing Description' in details:
        docket['title'] = details['Filing Description']
    if 'Organization' in details:
        docket['details']['Organization Name'] = details['Organization']
    if 'Status' in details:
        docket['details']['Status'] = details['Status']
    docket['comments'] = []
    # Each row link in the results table is one submitted document.
    for link in page('.gradient-style tr td a').items():
        doc = {
            'url': urlparse.urljoin(docket_record['url'], link.attr('href')),
            'title': fix_spaces(link.text().strip()),
            'details': {},
        }
        # Classification is heuristic: anything whose title mentions
        # "comment" is treated as a public submission.
        doc['doctype'] = 'public_submission' if 'comment' in doc['title'].lower() else 'other'
        doc['id'] = crockford_hash(doc['url'])
        docket['comments'].append(doc)
    return docket
def parse_sirt_docket(docket_record):
    """Scrape a CFTC SIRT docket page into a docket dict with a 'comments' list.

    Performs the ASP.NET "show all" postback (echoing the viewstate from the
    first response) so the complete, unpaginated listing is returned.
    Raises ExtractionFailed for the site's soft error page.
    """
    # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything...
    # which is arduous and stupid because it's a yucky ASP app.
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    first_page = pq(opener.open(docket_record['url']).read())

    # Bad URLs come back as a normal page with an apology in an <h4>.
    header = first_page("h4")
    if len(header) and "sorry" in header.text().lower():
        raise ExtractionFailed("This URL doesn't work.")

    # Re-post the form with the "show all" checkbox ticked.
    checkbox = 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'
    postback = urllib.urlencode((
        ('__EVENTTARGET', checkbox),
        ('__EVENTARGUMENT', ''),
        ('__LASTFOCUS', ''),
        ('__VIEWSTATE', first_page('#__VIEWSTATE').val()),
        ('__EVENTVALIDATION', first_page('#__EVENTVALIDATION').val()),
        ('ctl00$masterScriptManager', ''),
        (checkbox, 'on'),
    ))
    full_page = pq(opener.open(docket_record['url'], data=postback).read())

    docket = dict(docket_record)

    # h5.QueryTitle holds "Label: value" rows separated by <br> tags.
    details = {}
    for row in re.split(r"<br ?/?>", full_page('h5.QueryTitle').html()):
        if row.strip():
            label, value = re.split(r"\s*:\s*", row.strip())
            details[label] = value

    if 'details' not in docket:
        docket['details'] = {}
    if 'Filing Description' in details:
        docket['title'] = details['Filing Description']
    for src, dest in (('Organization', 'Organization Name'), ('Status', 'Status')):
        if src in details:
            docket['details'][dest] = details[src]

    comments = []
    for anchor in full_page('.gradient-style tr td a').items():
        target = urlparse.urljoin(docket_record['url'], anchor.attr('href'))
        title = fix_spaces(anchor.text().strip())
        comments.append({
            'url': target,
            'title': title,
            'details': {},
            # heuristic: titles mentioning "comment" are public submissions
            'doctype': 'public_submission' if 'comment' in title.lower() else 'other',
            'id': crockford_hash(target),
        })
    docket['comments'] = comments
    return docket
def get_spotlight_files():
    """Collect SEC spotlight (non-rulemaking) docket records, keyed by id.

    Fetches the Dodd-Frank and JOBS Act spotlight pages and turns every
    comment-listing link into a docket dict tagged with its spotlight type.
    """
    dockets = {}
    sources = (
        ("Dodd-Frank Act", "http://www.sec.gov/spotlight/regreformcomments.shtml"),
        ("JOBS Act", "https://www.sec.gov/spotlight/jobsactcomments.shtml"),
    )
    for label, listing_url in sources:
        markup = urllib2.urlopen(listing_url).read()
        document = pq(etree.fromstring(markup, parser))
        anchors = document('a[href*="comments/df"],a[href*="comments/other"],a[href*="comments/jobs"]')
        for anchor in anchors.items():
            target = urlparse.urljoin(listing_url, anchor.attr('href'))
            record = {
                'url': target,
                # short id: first 5 chars of the URL hash
                'id': crockford_hash(target)[:5],
                'type': 'nonrulemaking',
                'subtype': label,
            }
            dockets[record['id']] = record
    return dockets
def flatten_docket(in_docket):
    """Flatten a docket's grouped comment listings into one 'comments' list.

    Each comment gets a doctype ('other' for the "Other" heading, otherwise
    'public_submission' with a subtype of 'comment' or 'exparte'), the
    group's file number when present, and an id — extracted from the URL's
    trailing "NN-MM.ext" pattern when possible, otherwise hashed from the URL.

    Returns a copy of in_docket with 'comment_groups' replaced by 'comments';
    the first group title found becomes the docket title. The input comment
    dicts are mutated in place (original behavior, preserved).
    """
    out_cmts = []
    docket = dict(in_docket)
    for group in in_docket['comment_groups']:
        # .items() instead of Py2-only .iteritems(): identical iteration on
        # Python 2, and keeps the function runnable on Python 3.
        for heading, listing in group['comments'].items():
            for comment in listing:
                if heading == "Other":
                    comment['doctype'] = 'other'
                else:
                    comment['doctype'] = 'public_submission'
                    if 'Comments' in heading:
                        comment['subtype'] = 'comment'
                    elif 'Meetings' in heading:
                        comment['subtype'] = 'exparte'
                    else:
                        assert False, 'unrecognized header type'
                # .get() so a group without a 'details' dict no longer raises
                # KeyError here (it was only ever read, never required).
                if 'File No.' in group.get('details', {}):
                    comment['file'] = group['details']['File No.']
                # assign an ID if there isn't one
                if 'id' not in comment and 'url' in comment and comment['url']:
                    # raw string, and the extension dot is now escaped — the
                    # bare '.' previously matched any character.
                    id_matches = re.findall(r"/[a-z]?\d+-(\d+)\.[a-z]+$", comment['url'])
                    if id_matches:
                        comment['id'] = id_matches[-1]
                    else:
                        comment['id'] = crockford_hash(comment['url'])
                out_cmts.append(comment)
    titles = [group['title'] for group in in_docket['comment_groups'] if 'title' in group]
    if titles:
        docket['title'] = titles[0]
    del docket['comment_groups']
    docket['comments'] = out_cmts
    return docket
def flatten_docket(in_docket):
    """Flatten a docket's grouped comment listings into one 'comments' list.

    Comments under the "Other" heading are typed 'other'; all others are
    'public_submission' with subtype 'comment' or 'exparte' by heading.
    Each comment also picks up the group's 'File No.' (when present) and an
    id, derived from its URL. Returns a copy of the docket with
    'comment_groups' replaced by the flat 'comments' list; comment dicts are
    mutated in place, as before.
    """
    docket = dict(in_docket)
    flattened = []
    for group in in_docket['comment_groups']:
        for heading, listing in group['comments'].iteritems():
            is_other = heading == "Other"
            for comment in listing:
                if is_other:
                    comment['doctype'] = 'other'
                else:
                    comment['doctype'] = 'public_submission'
                    if 'Comments' in heading:
                        comment['subtype'] = 'comment'
                    elif 'Meetings' in heading:
                        comment['subtype'] = 'exparte'
                    else:
                        assert False, 'unrecognized header type'
                if 'File No.' in group['details']:
                    comment['file'] = group['details']['File No.']
                # assign an ID if there isn't one
                if 'id' not in comment and comment.get('url'):
                    found = re.findall("/[a-z]?\d+-(\d+).[a-z]+$", comment['url'])
                    comment['id'] = found[-1] if found else crockford_hash(comment['url'])
                flattened.append(comment)
    # first group that carries a title names the docket
    for group in in_docket['comment_groups']:
        if 'title' in group:
            docket['title'] = group['title']
            break
    del docket['comment_groups']
    docket['comments'] = flattened
    return docket
def file_obj_from_url(url, existing_files=None):
    """Classify a CFTC comment-listing URL into a file record dict.

    Recognizes three URL generations and returns a dict with 'url', 'id' and
    'strategy' ('current', 'sirt', or 'old'). SIRT records may also carry a
    'parent' pointing at the first non-SIRT record in existing_files.
    Returns None for sirt.cftc.gov URLs that can't be parsed (known-broken
    input); otherwise an unrecognized URL trips the final assert.

    Fix: the dots in all three patterns are now escaped — previously
    'CommentList.aspx', 'sirt.aspx' and 'www.cftc.gov' used a bare '.' that
    matched any character; the old-style pattern is also a raw string now.
    """
    # current style
    matches = re.findall(r"CommentList\.aspx\?id=(\d+)", url)
    if matches:
        return {
            'url': url,
            'id': matches[0],
            'strategy': 'current'
        }

    # SIRT
    matches = re.match(r".*sirt\.aspx\?.*Topic=(?P<topic>[A-Za-z0-9]+).*&Key=(?P<key>\d+).*", url)
    if matches:
        gd = matches.groupdict()
        out = {
            # normalize to a canonical SIRT URL
            'url': 'http://sirt.cftc.gov/sirt/sirt.aspx?Topic=%s&Key=%s' % (gd['topic'], gd['key']),
            'id': "SIRT-%s-%s" % (crockford_hash(gd['topic'])[:4], gd['key']),
            'strategy': 'sirt'
        }
        if existing_files:
            # the first non-SIRT sibling, if any, is treated as the parent
            non_sirt = [f for f in existing_files if f['strategy'] != 'sirt']
            if non_sirt:
                out['parent'] = non_sirt[0]['id']
        return out
    elif 'sirt.cftc.gov' in url:
        # this is broken input, but there's nothing we can do about it
        return None

    # old style
    matches = re.findall(r"http://www\.cftc\.gov/LawRegulation/PublicComments/([A-Z0-9-]+)", url)
    if matches:
        return {
            'url': url,
            'id': "OS-%s" % matches[0],
            'strategy': 'old'
        }
    # NOTE(review): under `python -O` this assert is stripped and the function
    # silently returns None; kept as-is so the exception type callers may
    # expect (AssertionError) is unchanged.
    assert matches, "no ID found: %s" % url
def _v2v3_scrape_document(id, cpool=None):
    """Fetch a document from the RDGv3 API and convert it into a v2-style Doc.

    Translates the v3 JSON layout (label/value detail pairs, fileFormats,
    commentOnDoc, attachments) into the older schema this codebase stores.
    Raises DoesNotExist when the API returns an error payload.

    NOTE(review): parameter `id` shadows the builtin; left unchanged because
    callers may pass it by keyword.
    """
    doc3 = json.load(_v3_get_document(id, cpool))
    # the v3 API reports errors as a JSON body containing a 'code' field
    if 'code' in doc3:
        raise DoesNotExist

    # pull out what used to be called 'details'
    details = {}
    special = {}
    # v3 detail fields are dicts of exactly {'label': ..., 'value': ...}
    detail_template = set(['label', 'value'])
    for key, contents in doc3.iteritems():
        if type(contents) is dict and set(contents.keys()) == detail_template:
            if key in DOC_DETAILS_SPECIAL:
                # fields handled specially below (title, ids, etc.)
                special[key] = contents['value']
            else:
                # map to the old detail name, or synthesize one from the label
                detail_name = DOC_DETAIL_NAMES.get(key, NON_LETTERS.sub('_', contents['label']))
                details[detail_name] = contents['value']

    # deal with submitter name
    if 'submitterName' in special:
        parsed = IndividualNameCleaver(special['submitterName']).parse()
        if parsed.first is not None:
            details['First_Name'] = parsed.first
        if parsed.last is not None:
            details['Last_Name'] = parsed.last
        if parsed.middle is not None:
            # a single-letter middle (after stripping punctuation) is an initial
            middle = NON_LETTERS.sub('', parsed.middle)
            details['Middle_Name' if len(middle) > 1 else 'Middle_Initial'] = parsed.middle

    # deal with date types
    for new_label, old_label in (('commentDueDate', 'Comment_Due_Date'),
                                 ('commentStartDate', 'Comment_Start_Date'),
                                 ('postedDate', 'Date_Posted'),
                                 ('receivedDate', 'Received_Date'),
                                 ('effectiveDate', 'Effective_Date'),
                                 ('postMarkDate', 'Post_Mark_Date')):
        if new_label in doc3 and doc3[new_label]:
            details[old_label] = dateutil.parser.parse(doc3[new_label])

    # a couple of special cases
    if 'status' in doc3:
        details['Status'] = doc3['status']

    out = {
        # basic metadata
        'id': special['documentId'],
        'title': unicode(special.get('title', '')),
        'agency': special.get('agencyAcronym', ''),
        'docket_id': special.get('docketId', ''),
        'type': INCONSISTENT_DOC_TYPES[special['documentType']],
        'topics': doc3.get('topics', []),
        'scraped': 'yes',
        'deleted': False,

        # details
        'details': details,

        # views
        'views': [_v3_make_view(format) for format in doc3['fileFormats']] if 'fileFormats' in doc3 and doc3['fileFormats'] else []
    }
    out['fr_doc'] = out['type'] in set(('rule', 'proposed_rule', 'notice'))

    if 'comment' in special and special['comment']:
        out['abstract'] = unicode(special['comment'])

        # fake a view containing the contents of the comment field if there aren't any views, to deal with a behavior change in the RDGv3 API
        if not out['views']:
            view_data = {
                'url': "http://api.data.gov/regulations/v3/document.json?documentId=%s" % out['id'],
                'type': "txt",
                'downloaded': 'yes',
                'extracted': 'yes'
            }
            view_data['object_id'] = crockford_hash(view_data['url'])
            out['views'] = [View(**view_data)]
            # persist the comment text as the fake view's contents
            out['views'][0].write_on_save(out['abstract'].encode('utf8'))

    # the document inherits its object_id from its first view
    if out['views']:
        out['object_id'] = out['views'][0].object_id

    # conditional fields
    if 'commentOnDoc' in doc3 and doc3['commentOnDoc'] and \
            'documentId' in doc3['commentOnDoc'] and doc3['commentOnDoc']['documentId'] and \
            'documentType' in doc3['commentOnDoc'] and doc3['commentOnDoc']['documentType']:
        out['comment_on'] = {
            # agency acronym is the document-id prefix, e.g. "EPA-..."
            'agency': doc3['commentOnDoc']['documentId'].split('-')[0],
            'title': unicode(doc3['commentOnDoc']['title']),
            'type': INCONSISTENT_DOC_TYPES[doc3['commentOnDoc']['documentType']],
            'document_id': doc3['commentOnDoc']['documentId']
        }
        out['comment_on']['fr_doc'] = out['comment_on']['type'] in set(('rule', 'proposed_rule', 'notice'))

    if 'attachments' in doc3 and doc3['attachments']:
        attachments = []
        for attachment in doc3['attachments']:
            # rebind loop variable from the raw dict to an Attachment model
            attachment = Attachment(**{
                'title': unicode(attachment.get('title', '')),
                'abstract': unicode(attachment.get('abstract', '')),
                'views': [_v3_make_view(format) for format in attachment['fileFormats']] if 'fileFormats' in attachment and attachment['fileFormats'] else []
            })
            if attachment.views:
                attachment.object_id = attachment.views[0].object_id
            attachments.append(attachment)
        out['attachments'] = attachments

    if 'rin' in special and special['rin']:
        out['rin'] = special['rin']

    return Doc(**out)