def _v3_make_view(format):
    """Build a View for one RDGv3 'fileFormats' URL.

    The view's type is parsed out of the URL by V3_FORMAT_PARSER and its
    object_id is the crockford hash of the URL itself.
    """
    parsed = V3_FORMAT_PARSER.match(format).groupdict()
    return View(
        object_id=crockford_hash(format),
        url=format,
        type=parsed['type'],
    )
def parse_sirt_docket(docket_record):
    """Scrape a CFTC SIRT docket page into a docket dict with a 'comments' list.

    Loads the docket URL, then re-posts the page's ASP.NET form with the
    "show all" checkbox ticked so the full (unpaginated) listing comes back.
    Raises ExtractionFailed when the page is an error page.
    """
    # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything...
    # which is arduous and stupid because it's a yucky ASP app.
    # A cookie jar is required so the ASP.NET session survives between the
    # initial GET and the postback below.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    initial = pq(opener.open(docket_record['url']).read())
    # The site signals a bad URL with an apology in an <h4>, not an HTTP error.
    error_header = initial("h4")
    if len(error_header) and "sorry" in error_header.text().lower():
        raise ExtractionFailed("This URL doesn't work.")
    # Simulate the browser postback: echo __VIEWSTATE/__EVENTVALIDATION from
    # the first response and target the "show all" checkbox control.
    formdata = urllib.urlencode(
        (('__EVENTTARGET', 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'),
         ('__EVENTARGUMENT', ''),
         ('__LASTFOCUS', ''),
         ('__VIEWSTATE', initial('#__VIEWSTATE').val()),
         ('__EVENTVALIDATION', initial('#__EVENTVALIDATION').val()),
         ('ctl00$masterScriptManager', ''),
         ('ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox', 'on')))
    page = pq(opener.open(docket_record['url'], data=formdata).read())
    docket = dict(docket_record)
    # The h5.QueryTitle block is "Label: value" pairs separated by <br> tags;
    # split them into a details dict.
    details = dict([
        re.split(r"\s*:\s*", row.strip())
        for row in re.split(r"<br ?/?>", page('h5.QueryTitle').html())
        if row.strip()
    ])
    if 'details' not in docket:
        docket['details'] = {}
    if 'Filing Description' in details:
        docket['title'] = details['Filing Description']
    if 'Organization' in details:
        docket['details']['Organization Name'] = details['Organization']
    if 'Status' in details:
        docket['details']['Status'] = details['Status']
    docket['comments'] = []
    # Each row link in the results table is one submitted document.
    for link in page('.gradient-style tr td a').items():
        doc = {
            'url': urlparse.urljoin(docket_record['url'], link.attr('href')),
            'title': fix_spaces(link.text().strip()),
            'details': {},
        }
        # Classification is heuristic: anything whose title mentions
        # "comment" is treated as a public submission.
        doc['doctype'] = 'public_submission' if 'comment' in doc['title'].lower() else 'other'
        doc['id'] = crockford_hash(doc['url'])
        docket['comments'].append(doc)
    return docket
def parse_sirt_docket(docket_record):
    """Scrape a CFTC SIRT docket page into a docket dict with a 'comments' list.

    Performs the ASP.NET "show all" postback (echoing the viewstate from the
    first response) so the complete, unpaginated listing is returned.
    Raises ExtractionFailed for the site's soft error page.
    """
    # okay, this one requires loading a paginated version, then checking a box that says "show all" to get everything...
    # which is arduous and stupid because it's a yucky ASP app.
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    first_page = pq(opener.open(docket_record['url']).read())

    # Bad URLs come back as a normal page with an apology in an <h4>.
    header = first_page("h4")
    if len(header) and "sorry" in header.text().lower():
        raise ExtractionFailed("This URL doesn't work.")

    # Re-post the form with the "show all" checkbox ticked.
    checkbox = 'ctl00$cphContentMain$GenericWebUserControl$ShowAllCheckBox'
    postback = urllib.urlencode((
        ('__EVENTTARGET', checkbox),
        ('__EVENTARGUMENT', ''),
        ('__LASTFOCUS', ''),
        ('__VIEWSTATE', first_page('#__VIEWSTATE').val()),
        ('__EVENTVALIDATION', first_page('#__EVENTVALIDATION').val()),
        ('ctl00$masterScriptManager', ''),
        (checkbox, 'on'),
    ))
    full_page = pq(opener.open(docket_record['url'], data=postback).read())

    docket = dict(docket_record)

    # h5.QueryTitle holds "Label: value" rows separated by <br> tags.
    details = {}
    for row in re.split(r"<br ?/?>", full_page('h5.QueryTitle').html()):
        if row.strip():
            label, value = re.split(r"\s*:\s*", row.strip())
            details[label] = value

    if 'details' not in docket:
        docket['details'] = {}
    if 'Filing Description' in details:
        docket['title'] = details['Filing Description']
    for src, dest in (('Organization', 'Organization Name'), ('Status', 'Status')):
        if src in details:
            docket['details'][dest] = details[src]

    comments = []
    for anchor in full_page('.gradient-style tr td a').items():
        target = urlparse.urljoin(docket_record['url'], anchor.attr('href'))
        title = fix_spaces(anchor.text().strip())
        comments.append({
            'url': target,
            'title': title,
            'details': {},
            # heuristic: titles mentioning "comment" are public submissions
            'doctype': 'public_submission' if 'comment' in title.lower() else 'other',
            'id': crockford_hash(target),
        })
    docket['comments'] = comments
    return docket
def get_spotlight_files():
    """Collect SEC spotlight (non-rulemaking) docket records, keyed by id.

    Fetches the Dodd-Frank and JOBS Act spotlight pages and turns every
    comment-listing link into a docket dict tagged with its spotlight type.
    """
    dockets = {}
    sources = (
        ("Dodd-Frank Act", "http://www.sec.gov/spotlight/regreformcomments.shtml"),
        ("JOBS Act", "https://www.sec.gov/spotlight/jobsactcomments.shtml"),
    )
    for label, listing_url in sources:
        markup = urllib2.urlopen(listing_url).read()
        document = pq(etree.fromstring(markup, parser))
        anchors = document('a[href*="comments/df"],a[href*="comments/other"],a[href*="comments/jobs"]')
        for anchor in anchors.items():
            target = urlparse.urljoin(listing_url, anchor.attr('href'))
            record = {
                'url': target,
                # short id: first 5 chars of the URL hash
                'id': crockford_hash(target)[:5],
                'type': 'nonrulemaking',
                'subtype': label,
            }
            dockets[record['id']] = record
    return dockets
def flatten_docket(in_docket):
    """Flatten a docket's grouped comment listings into one 'comments' list.

    Each comment gets a doctype ('other' for the "Other" heading, otherwise
    'public_submission' with a subtype of 'comment' or 'exparte'), the
    group's file number when present, and an id — extracted from the URL's
    trailing "NN-MM.ext" pattern when possible, otherwise hashed from the URL.

    Returns a copy of in_docket with 'comment_groups' replaced by 'comments';
    the first group title found becomes the docket title. The input comment
    dicts are mutated in place (original behavior, preserved).
    """
    out_cmts = []
    docket = dict(in_docket)
    for group in in_docket['comment_groups']:
        # .items() instead of Py2-only .iteritems(): identical iteration on
        # Python 2, and keeps the function runnable on Python 3.
        for heading, listing in group['comments'].items():
            for comment in listing:
                if heading == "Other":
                    comment['doctype'] = 'other'
                else:
                    comment['doctype'] = 'public_submission'
                    if 'Comments' in heading:
                        comment['subtype'] = 'comment'
                    elif 'Meetings' in heading:
                        comment['subtype'] = 'exparte'
                    else:
                        assert False, 'unrecognized header type'
                # .get() so a group without a 'details' dict no longer raises
                # KeyError here (it was only ever read, never required).
                if 'File No.' in group.get('details', {}):
                    comment['file'] = group['details']['File No.']
                # assign an ID if there isn't one
                if 'id' not in comment and 'url' in comment and comment['url']:
                    # raw string, and the extension dot is now escaped — the
                    # bare '.' previously matched any character.
                    id_matches = re.findall(r"/[a-z]?\d+-(\d+)\.[a-z]+$", comment['url'])
                    if id_matches:
                        comment['id'] = id_matches[-1]
                    else:
                        comment['id'] = crockford_hash(comment['url'])
                out_cmts.append(comment)
    titles = [group['title'] for group in in_docket['comment_groups'] if 'title' in group]
    if titles:
        docket['title'] = titles[0]
    del docket['comment_groups']
    docket['comments'] = out_cmts
    return docket
def flatten_docket(in_docket):
    """Flatten a docket's grouped comment listings into one 'comments' list.

    Comments under the "Other" heading are typed 'other'; all others are
    'public_submission' with subtype 'comment' or 'exparte' by heading.
    Each comment also picks up the group's 'File No.' (when present) and an
    id, derived from its URL. Returns a copy of the docket with
    'comment_groups' replaced by the flat 'comments' list; comment dicts are
    mutated in place, as before.
    """
    docket = dict(in_docket)
    flattened = []
    for group in in_docket['comment_groups']:
        for heading, listing in group['comments'].iteritems():
            is_other = heading == "Other"
            for comment in listing:
                if is_other:
                    comment['doctype'] = 'other'
                else:
                    comment['doctype'] = 'public_submission'
                    if 'Comments' in heading:
                        comment['subtype'] = 'comment'
                    elif 'Meetings' in heading:
                        comment['subtype'] = 'exparte'
                    else:
                        assert False, 'unrecognized header type'
                if 'File No.' in group['details']:
                    comment['file'] = group['details']['File No.']
                # assign an ID if there isn't one
                if 'id' not in comment and comment.get('url'):
                    found = re.findall("/[a-z]?\d+-(\d+).[a-z]+$", comment['url'])
                    comment['id'] = found[-1] if found else crockford_hash(comment['url'])
                flattened.append(comment)
    # first group that carries a title names the docket
    for group in in_docket['comment_groups']:
        if 'title' in group:
            docket['title'] = group['title']
            break
    del docket['comment_groups']
    docket['comments'] = flattened
    return docket
def file_obj_from_url(url, existing_files=None):
    """Classify a CFTC comment-listing URL into a file record dict.

    Recognizes three URL generations and returns a dict with 'url', 'id' and
    'strategy' ('current', 'sirt', or 'old'). SIRT records may also carry a
    'parent' pointing at the first non-SIRT record in existing_files.
    Returns None for sirt.cftc.gov URLs that can't be parsed (known-broken
    input); otherwise an unrecognized URL trips the final assert.

    Fix: the dots in all three patterns are now escaped — previously
    'CommentList.aspx', 'sirt.aspx' and 'www.cftc.gov' used a bare '.' that
    matched any character; the old-style pattern is also a raw string now.
    """
    # current style
    matches = re.findall(r"CommentList\.aspx\?id=(\d+)", url)
    if matches:
        return {
            'url': url,
            'id': matches[0],
            'strategy': 'current'
        }

    # SIRT
    matches = re.match(r".*sirt\.aspx\?.*Topic=(?P<topic>[A-Za-z0-9]+).*&Key=(?P<key>\d+).*", url)
    if matches:
        gd = matches.groupdict()
        out = {
            # normalize to a canonical SIRT URL
            'url': 'http://sirt.cftc.gov/sirt/sirt.aspx?Topic=%s&Key=%s' % (gd['topic'], gd['key']),
            'id': "SIRT-%s-%s" % (crockford_hash(gd['topic'])[:4], gd['key']),
            'strategy': 'sirt'
        }
        if existing_files:
            # the first non-SIRT sibling, if any, is treated as the parent
            non_sirt = [f for f in existing_files if f['strategy'] != 'sirt']
            if non_sirt:
                out['parent'] = non_sirt[0]['id']
        return out
    elif 'sirt.cftc.gov' in url:
        # this is broken input, but there's nothing we can do about it
        return None

    # old style
    matches = re.findall(r"http://www\.cftc\.gov/LawRegulation/PublicComments/([A-Z0-9-]+)", url)
    if matches:
        return {
            'url': url,
            'id': "OS-%s" % matches[0],
            'strategy': 'old'
        }
    # NOTE(review): under `python -O` this assert is stripped and the function
    # silently returns None; kept as-is so the exception type callers may
    # expect (AssertionError) is unchanged.
    assert matches, "no ID found: %s" % url
def _v2v3_scrape_document(id, cpool=None):
    """Fetch a document from the RDGv3 API and convert it into a v2-style Doc.

    Translates the v3 JSON layout (label/value detail pairs, fileFormats,
    commentOnDoc, attachments) into the older schema this codebase stores.
    Raises DoesNotExist when the API returns an error payload.

    NOTE(review): parameter `id` shadows the builtin; left unchanged because
    callers may pass it by keyword.
    """
    doc3 = json.load(_v3_get_document(id, cpool))
    # the v3 API reports errors as a JSON body containing a 'code' field
    if 'code' in doc3:
        raise DoesNotExist

    # pull out what used to be called 'details'
    details = {}
    special = {}
    # v3 detail fields are dicts of exactly {'label': ..., 'value': ...}
    detail_template = set(['label', 'value'])
    for key, contents in doc3.iteritems():
        if type(contents) is dict and set(contents.keys()) == detail_template:
            if key in DOC_DETAILS_SPECIAL:
                # fields handled specially below (title, ids, etc.)
                special[key] = contents['value']
            else:
                # map to the old detail name, or synthesize one from the label
                detail_name = DOC_DETAIL_NAMES.get(key, NON_LETTERS.sub('_', contents['label']))
                details[detail_name] = contents['value']

    # deal with submitter name
    if 'submitterName' in special:
        parsed = IndividualNameCleaver(special['submitterName']).parse()
        if parsed.first is not None:
            details['First_Name'] = parsed.first
        if parsed.last is not None:
            details['Last_Name'] = parsed.last
        if parsed.middle is not None:
            # a single-letter middle (after stripping punctuation) is an initial
            middle = NON_LETTERS.sub('', parsed.middle)
            details['Middle_Name' if len(middle) > 1 else 'Middle_Initial'] = parsed.middle

    # deal with date types
    for new_label, old_label in (('commentDueDate', 'Comment_Due_Date'),
                                 ('commentStartDate', 'Comment_Start_Date'),
                                 ('postedDate', 'Date_Posted'),
                                 ('receivedDate', 'Received_Date'),
                                 ('effectiveDate', 'Effective_Date'),
                                 ('postMarkDate', 'Post_Mark_Date')):
        if new_label in doc3 and doc3[new_label]:
            details[old_label] = dateutil.parser.parse(doc3[new_label])

    # a couple of special cases
    if 'status' in doc3:
        details['Status'] = doc3['status']

    out = {
        # basic metadata
        'id': special['documentId'],
        'title': unicode(special.get('title', '')),
        'agency': special.get('agencyAcronym', ''),
        'docket_id': special.get('docketId', ''),
        'type': INCONSISTENT_DOC_TYPES[special['documentType']],
        'topics': doc3.get('topics', []),
        'scraped': 'yes',
        'deleted': False,

        # details
        'details': details,

        # views
        'views': [_v3_make_view(format) for format in doc3['fileFormats']] if 'fileFormats' in doc3 and doc3['fileFormats'] else []
    }
    out['fr_doc'] = out['type'] in set(('rule', 'proposed_rule', 'notice'))

    if 'comment' in special and special['comment']:
        out['abstract'] = unicode(special['comment'])

        # fake a view containing the contents of the comment field if there aren't any views, to deal with a behavior change in the RDGv3 API
        if not out['views']:
            view_data = {
                'url': "http://api.data.gov/regulations/v3/document.json?documentId=%s" % out['id'],
                'type': "txt",
                'downloaded': 'yes',
                'extracted': 'yes'
            }
            view_data['object_id'] = crockford_hash(view_data['url'])
            out['views'] = [View(**view_data)]
            # persist the comment text as the fake view's contents
            out['views'][0].write_on_save(out['abstract'].encode('utf8'))

    # the document inherits its object_id from its first view
    if out['views']:
        out['object_id'] = out['views'][0].object_id

    # conditional fields
    if 'commentOnDoc' in doc3 and doc3['commentOnDoc'] and \
            'documentId' in doc3['commentOnDoc'] and doc3['commentOnDoc']['documentId'] and \
            'documentType' in doc3['commentOnDoc'] and doc3['commentOnDoc']['documentType']:
        out['comment_on'] = {
            # agency acronym is the document-id prefix, e.g. "EPA-..."
            'agency': doc3['commentOnDoc']['documentId'].split('-')[0],
            'title': unicode(doc3['commentOnDoc']['title']),
            'type': INCONSISTENT_DOC_TYPES[doc3['commentOnDoc']['documentType']],
            'document_id': doc3['commentOnDoc']['documentId']
        }
        out['comment_on']['fr_doc'] = out['comment_on']['type'] in set(('rule', 'proposed_rule', 'notice'))

    if 'attachments' in doc3 and doc3['attachments']:
        attachments = []
        for attachment in doc3['attachments']:
            # rebind loop variable from the raw dict to an Attachment model
            attachment = Attachment(**{
                'title': unicode(attachment.get('title', '')),
                'abstract': unicode(attachment.get('abstract', '')),
                'views': [_v3_make_view(format) for format in attachment['fileFormats']] if 'fileFormats' in attachment and attachment['fileFormats'] else []
            })
            if attachment.views:
                attachment.object_id = attachment.views[0].object_id
            attachments.append(attachment)
        out['attachments'] = attachments

    if 'rin' in special and special['rin']:
        out['rin'] = special['rin']

    return Doc(**out)