def run():
    from regs_models import Entity

    print "Updating entity search index..."

    # mark the ones that should be searchable, but aren't, as searchable
    Entity.objects(__raw__={
        'td_type': 'organization',
        'stats.count': {'$gt': 0},
        'searchable': False
    }).update(set__searchable=True, multi=True)

    # mark the ones that are searchable, but shouldn't be, as unsearchable
    Entity.objects(__raw__={
        '$or': [
            {'td_type': {'$ne': 'organization'}},
            {'stats.count': {'$not': {'$gt': 0}}}
        ],
        'searchable': True
    }).update(set__searchable=False, multi=True)

    print "Update complete."
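A note on the second query: the {'$not': {'$gt': 0}} condition also matches entities whose stats.count field is missing entirely, which a plain {'$lte': 0} would not, so organizations that have never accumulated stats get swept back to unsearchable as well. Below is a minimal pymongo sketch of the same two updates; the database and collection names are assumptions, not taken from this codebase.

# Hypothetical pymongo equivalent of run() above; "regulations" and
# "entity" are assumed names for the database and collection.
from pymongo import MongoClient

db = MongoClient().regulations

# flag organizations with at least one mention as searchable
db.entity.update_many(
    {'td_type': 'organization', 'stats.count': {'$gt': 0}, 'searchable': False},
    {'$set': {'searchable': True}})

# un-flag everything else; $not also catches documents missing stats.count
db.entity.update_many(
    {'$or': [{'td_type': {'$ne': 'organization'}},
             {'stats.count': {'$not': {'$gt': 0}}}],
     'searchable': True},
    {'$set': {'searchable': False}})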
Example #2
    def get(self, request, entity_id, docket_id, document_type, entity_type):
        dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
        ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))
        if not dkt_results or not ent_results:
            raise Http404('Not found.')

        docket = dkt_results[0]
        entity = ent_results[0]

        if document_type == 'mentions':
            docs_q = Doc.objects(Q(attachments__views__entities=entity_id) | Q(views__entities=entity_id), docket_id=docket_id)
        else:
            docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

        docs_q = docs_q.only('type', 'title', 'id', 'views', 'attachments.views', 'details.Date_Posted', 'deleted').hint([("docket_id", 1)])
        docs = filter(lambda d: not d.deleted, sorted(list(docs_q), key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900,1,1)), reverse=True))

        get_views = lambda doc: [{
            'object_id': view.object_id,
            'file_type': view.type,
            'url': view.url.replace('inline', 'attachment')
        } for view in doc.views if entity_id in view.entities]

        out_docs = []
        for doc in docs[:10]:
            out_doc = {
                'title': doc.title,
                'id': doc.id,
                'date_posted': doc.details['Date_Posted'],
                'type': doc.type,
                'url': '/document/' + doc.id
            }
            if document_type == 'mentions':
                out_doc['files'] = get_views(doc) + list(itertools.chain.from_iterable([get_views(attachment) for attachment in doc.attachments]))

            out_docs.append(out_doc)

        return Response({
            'documents': out_docs,
            'has_more': len(docs) > 10,
            'count': len(docs),
            'document_search_url': "/search-document/" + \
                url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
                url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
            'docket': {
                'id': docket.id,
                'title': docket.title,
            },
            'entity': {
                'id': entity.id,
                'name': entity.aliases[0]
            },
            'filter_type': document_type
        })
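One portability note on this handler: it is Python 2 code, where filter() returns a list, so the later docs[:10] slice and len(docs) calls work. Under Python 3, filter() returns a lazy iterator and both would fail; a sketch of the equivalent list comprehension, assuming the same surrounding docs_q queryset:

        # Python 3-safe version of the sort-then-filter line above
        docs = [d for d in sorted(docs_q,
                                  key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900, 1, 1)),
                                  reverse=True)
                if not d.deleted]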
Example #4
    def get(self, request):
        entities = Entity.objects(__raw__={
            'td_type': 'organization',
            '$or': [
                {'stats.submitter_mentions.count': {'$gte': 1}},
                {'stats.text_mentions.count': {'$gte': 1}}
            ]
        }).only('id')
        return Response({'entities': [e.id for e in entities]})
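The .only('id') projection keeps this endpoint cheap: MongoDB sends back just the _id of every organization with at least one submitter or text mention. A sketch of the same query through raw pymongo, reusing the hypothetical db handle from the run() sketch above:

# projection {'_id': 1} mirrors MongoEngine's .only('id')
ids = [doc['_id'] for doc in db.entity.find(
    {'td_type': 'organization',
     '$or': [{'stats.submitter_mentions.count': {'$gte': 1}},
             {'stats.text_mentions.count': {'$gte': 1}}]},
    {'_id': 1})]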
Example #6
    def get(self, request, *args, **kwargs):
        "Access aggregate information about entities as they occur in regulations.gov data."
        results = Entity.objects(id=kwargs['entity_id'])
        if not results:
            raise Http404('Entity not found.')

        entity = results[0]

        # basic entity metadata
        out = {
            'name': entity.aliases[0],
            'url': reverse('entity-view', args=args, kwargs=kwargs),
            'id': entity.id,
            'type': entity.td_type,
            'stats': entity.stats
        }

        stats = entity.stats
        if stats:
            # cleanup, plus stitch on some additional data
            now = datetime.datetime.now().date()
            for mention_type in ["text_mentions", "submitter_mentions"]:
                if stats[mention_type]['months']:
                    months = prettify_months(stats[mention_type]['months'])
                    stats[mention_type]['months'] = [month for month in months if month['date_range'][0] <= now]
                else:
                    stats[mention_type]['months'] = []

                # limit ourselves to the top ten of each match type, and grab their extra metadata
                agencies = sorted(stats[mention_type]['agencies'].items(), key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_agencies'] = [{
                    'id': item[0],
                    'count': item[1],
                    'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
                } for item in agencies]
                del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

                docket_list = stats[mention_type]['dockets'].items()
                years = request.GET.get('years', None)
                if years:
                    year_set = set(years.split(","))
                    docket_list = [item for item in docket_list if get_docket_year(item[0]) in year_set]
                dockets = sorted(docket_list, key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_dockets'] = [{
                    'id': item[0],
                    'count': item[1]
                } for item in dockets]

                stats[mention_type]['docket_count'] = len(docket_list)
                del stats[mention_type]['dockets']

                stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(":".join([
                    "mentioned" if mention_type == "text_mentions" else "submitter",
                    entity.id,
                    '"%s"' % entity.aliases[0]
                ]))

            # grab additional docket metadata
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_dockets']] + [record['id'] for record in stats['text_mentions']['top_dockets']]))
            dockets_search = Docket.objects(id__in=ids).only('id', 'title', 'year', 'details.dk_type', 'agency', 'stats.date_range')
            dockets = dict([(docket.id, docket) for docket in dockets_search])

            # stitch this back onto the main records
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for docket in stats[mention_type]['top_dockets']:
                    rdocket = dockets[docket['id']]
                    docket.update({
                        'title': rdocket.title,
                        'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                        'year': rdocket.year if rdocket.year else (getattr(rdocket.stats['date_range'][0], 'year', None) if 'date_range' in rdocket.stats else None),
                        'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                        'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                    })

            # repeat for agencies
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_agencies']] + [record['id'] for record in stats['text_mentions']['top_agencies']]))
            agencies_search = Agency.objects(id__in=ids).only('id', 'name')
            agencies = dict([(agency.id, agency) for agency in agencies_search])

            # ...and stitch
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for agency in stats[mention_type]['top_agencies']:
                    ragency = agencies.get(agency['id'], None)
                    agency.update({
                        'name': ragency.name if ragency else agency['id'],
                        'url': '/agency/%s' % agency['id']
                    })

            # and for comments
            recent_comments = []
            if 'recent_comments' in stats['submitter_mentions']:
                recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['submitter_mentions']['recent_comments']]).only('id', 'title', 'details')
                for comment in recent_comments_search:
                    comment_item = {
                        'title': comment.title,
                        'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                        'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                        'organization': comment.details.get('Organization_Name', ''),
                        'url': '/document/' + comment.id
                    }
                    comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                    recent_comments.append(comment_item)

            stats['submitter_mentions']['recent_comments'] = recent_comments

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0}

        return Response(out)
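Both the agency and docket blocks above follow the same pattern: sort a plain {id: count} mapping by count, keep the top ten, then batch-fetch metadata for the survivors with a single id__in query rather than one query per row. A hypothetical helper that captures the ranking step:

def top_n(counter, n=10):
    # counter is a {id: count} dict, as stored in entity.stats;
    # returns [{'id': ..., 'count': ...}] in descending count order
    ranked = sorted(counter.items(), key=lambda pair: pair[1], reverse=True)
    return [{'id': key, 'count': count} for key, count in ranked[:n]]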
Example #7
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov dockets."

        results = list(self.aggregation_class.objects(id=kwargs[self.aggregation_field]))
        if not results:
            raise Http404('%s not found.' % self.aggregation_level.title())

        self.item = item = results[0]

        # basic docket metadata
        out = {
            'url': reverse('%s-view' % self.aggregation_level, kwargs=kwargs),
            'id': item.id,
            'type': self.aggregation_level
        }
        for label in ['name', 'title', 'year']:
            if hasattr(item, label):
                out[label] = getattr(item, label)
        rulemaking_field = getattr(item, 'details', {}).get('Type', None)
        if rulemaking_field:
            out['rulemaking'] = rulemaking_field.lower() == 'rulemaking'

        if 'replaced_by' in getattr(item, 'suppression', {}):
            new_kwargs = dict(kwargs)
            new_kwargs[self.aggregation_field] = item.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('%s-view' % self.aggregation_level, kwargs=new_kwargs)

        stats = item.stats
        if stats:
            # cleanup, plus stitch on some additional data
            stats["type_breakdown"] = dict([(doc_type, stats["type_breakdown"].get(doc_type, 0)) for doc_type in Doc.type.choices])

            if 'weeks' in stats and len(stats['weeks']) != 0:
                now = datetime.datetime.now().date()
                stats['weeks'] = [week for week in prettify_weeks(stats['weeks']) if week['date_range'][0] <= now]

            if 'months' in stats and len(stats['months']) != 0:
                now = datetime.datetime.now().date()
                stats['months'] = [month for month in prettify_months(stats['months']) if month['date_range'][0] <= now]

            # limit ourselves to the top five of each match type, and grab their extra metadata
            for label, items in [('top_text_entities', stats['text_entities'].items()), ('top_submitter_entities', stats['submitter_entities'].items())]:
                stats[label] = [{
                    'id': i[0],
                    'count': i[1]
                } for i in sorted(items, key=lambda x: x[1], reverse=True)[:10]]
                stats[label[4:] + "_count"] = len(items)
            del stats['text_entities'], stats['submitter_entities']

            # grab additional info about these ones from the database
            ids = list(set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']]))
            entities_search = Entity.objects(id__in=ids).only('id', 'td_type', 'aliases')
            entities = dict([(entity.id, entity) for entity in entities_search])

            # stitch this back onto the main records
            for label in ['top_text_entities', 'top_submitter_entities']:
                filtered_entities = []
                for entity in stats[label]:
                    if not entities[entity['id']].td_type or entities[entity['id']].td_type != 'organization':
                        continue
                    
                    entity['type'] = entities[entity['id']].td_type
                    entity['name'] = entities[entity['id']].aliases[0]
                    entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])
                    filtered_entities.append(entity)
                stats[label] = filtered_entities[:5]

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0, 'type_breakdown': dict([(doc_type, 0) for doc_type in Doc.type.choices])}

        return Response(out)
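This handler is deliberately generic: it reads self.aggregation_class, self.aggregation_field, and self.aggregation_level, so one implementation can serve both dockets and agencies. The concrete subclasses are not shown in these examples; a plausible configuration, with the base class name invented here and the attribute values inferred from how the handler uses them, might be:

class DocketView(AggregatedView):  # "AggregatedView" is an assumed name
    aggregation_class = Docket
    aggregation_level = 'docket'
    aggregation_field = 'docket_id'

class AgencyView(AggregatedView):
    aggregation_class = Agency
    aggregation_level = 'agency'
    aggregation_field = 'agency'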
Example #8
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov documents."
        results = list(Doc.objects(id=kwargs['document_id']))
        if not results or results[0].deleted:
            raise Http404('Document not found.')

        document = results[0]

        # basic document metadata
        out = {
            'title': document.title,
            'url': reverse('document-view', kwargs=kwargs),
            'id': document.id,

            'agency': {
                'id': document.agency,
                'url': reverse('agency-view', kwargs={'agency': document.agency}),
                'name': Agency.objects(id=document.agency).only("name")[0].name
            },
            'date': document.details.get('Date_Posted', None),
            'type': document.type,
            'views': [],
            'attachments': [],
            'details': document.details if document.details else {}
        }

        # inter-dataset suppression
        if 'replaced_by' in document.suppression:
            new_kwargs = dict(kwargs)
            new_kwargs['document_id'] = document.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

        # comment-on metadata
        if document.comment_on:
            # if we don't have all the data built in, grab it from its original record
            comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(id=document.comment_on['document_id']).to_mongo()
            out['comment_on'] = {
                "fr_doc": comment_on_doc.get('fr_doc', False),  
                "type": comment_on_doc.get('type', None), 
                "id": document.comment_on['document_id'],
                'url': reverse('document-view', kwargs={'document_id': document.comment_on['document_id']}),
                "title": comment_on_doc['title']
            }
            if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
                out['comment_on']['agency'] = out['agency']
            else:
                out['comment_on']['agency'] = {
                    'id': comment_on_doc['agency'],
                    'url': reverse('agency-view', kwargs={'agency': comment_on_doc['agency']}),
                    'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
                }
        else:
            out['comment_on'] = {}

        # docket metadata
        docket = Docket.objects(id=document.docket_id)[0]
        out['docket'] = {
            'id': document.docket_id,
            'url': reverse('docket-view', kwargs={'docket_id': document.docket_id}),
            'title': docket.title,
            'weeks': [],
            'fr_docs': []
        }
        if docket.stats:
            out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
            out['docket']['fr_docs'] = docket.stats['doc_info'].get('fr_docs', [])

        if out['date']:
            out['date'] = out['date'].isoformat()

        text_entities = set()
        submitter_entities = set(document.submitter_entities if document.submitter_entities else [])
        
        # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
        for view in (document.views[i] for i in xrange(len(document.views))):
            # hack to deal with documents whose scrapes failed but still got extracted
            object_id = document.object_id if document.object_id else view.file_path.split('/')[-1].split('.')[0]
            out['views'].append({
                'object_id': object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'view'}) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)

        for attachment in (document.attachments[i] for i in xrange(len(document.attachments))):
            a = {
                'title': attachment.title,
                'views': []
            }
            for view in (attachment.views[i] for i in xrange(len(attachment.views))):
                a['views'].append({
                    'object_id': attachment.object_id,
                    'file_type': view.type,
                    'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                    'extracted': view.extracted == 'yes',
                    'url': view.download_url,
                    'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'object_id': attachment.object_id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'attachment'}) if view.extracted == 'yes' else None
                })

                for entity in view.entities:
                    text_entities.add(entity)
            out['attachments'].append(a)

        # stats for FR docs
        stats = document.stats if document.stats else {'count': 0}
        # limit ourselves to the top five of each match type, and grab their extra metadata
        for label in ['text_entities', 'submitter_entities']:
            stats['top_' + label] = [{
                'id': i[0],
                'count': i[1]
            } for i in sorted(stats.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:5]]
            if label in stats:
                del stats[label]
        top_entities = set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']])

        entities_search = Entity.objects(id__in=list(submitter_entities.union(text_entities, top_entities))).only('id', 'td_type', 'aliases')
        entities = dict([(entity.id, entity) for entity in entities_search])

        for label, items in [('submitter_entities', sorted(list(submitter_entities))), ('text_entities', sorted(list(text_entities)))]:
            out[label] = [{
                'id': item,
                'type': entities[item].td_type,
                'name': entities[item].aliases[0],
                'url': '/%s/%s/%s' % (entities[item].td_type, slugify(entities[item].aliases[0]), item)
            } for item in items]

        for label in ['top_text_entities', 'top_submitter_entities']:
            for entity in stats[label]:
                if not entities[entity['id']].td_type:
                    continue
                
                entity['type'] = entities[entity['id']].td_type
                entity['name'] = entities[entity['id']].aliases[0]
                entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])

        if 'weeks' in stats:
            stats['weeks'] = prettify_weeks(stats['weeks'])

        recent_comments = []
        if 'recent_comments' in stats:
            recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['recent_comments']]).only('id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)

        stats['recent_comments'] = recent_comments

        out['comment_stats'] = stats

        # links upstream
        out['source'] = document.source
        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            for replaced in document.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        # cleaned-up details
        details = out['details'].copy()
        dp = lambda key, default=None: details.pop(key, default)
        out['clean_details'] = dtls(
            ('Submitter Information', dtls(
                ('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
                ('Organization', dp('Organization_Name')),
                ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
                ('Email Address', dp('Email_Address')),
                ('Phone Number', dp('Phone_Number')),
                ('Fax Number', dp('Fax_Number')),
                ("Submitter's Representative", dp('Submitter_s_Representative'))
            )),

            ('Dates and Times', dtls(
                ('Document Date', dp('Document_Date')), # rarely-used
                ('Date Received', dp('Received_Date')),
                ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
                ('Date Posted', dp('Date_Posted')),
                (None, dp('Date')), # Swallow this one, since it's always the same as Date_Posted,
                ('Comment Period', combine(
                    short_date(force_date(dp('Comment_Start_Date'))),
                    short_date(force_date(dp('Comment_Due_Date'))),
                    sep="&ndash;"
                )),

                # all the other dates -- don't even know what most of these are
                ("File Date", dp("File_Date")),
                ("Answer Date", dp("Answer_Date")),
                ("Author Date", dp("Author_Date")),
                ("Author Document Date", dp("Author_Document_Date")),
                ("Effective Date", dp("Effective_Date")),
                ("Implementation Date", dp("Implementation_Date")),
                ("Implementation Service Date", dp("Implementation_Service_Date"))
            )),
            
            ('Citations and References', dtls(
                ("RIN", document.rin if document.rin else None),
                ("Federal Register No.", dp("Federal_Register_Number")),
                ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "&ndash;")),
                (None, dp("Page_Count")), # who cares?
                (None, dp("Page_Start")), # who cares?
                ("Federal Register Citation", dp("Federal_Register_Citation")),
                ("CFR Section(s)", dp("CFR")),
                ("Related RINs", dp("Related_RIN_s_")),
            )),
            
            ('Additional Details', dtls(*details.items()))
        )

        return Response(out)
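The dtls and combine helpers used in the clean_details block come from elsewhere in the codebase and are not shown in these examples. Judging from their call sites, dtls builds an ordered label-to-value mapping while discarding empty values (a (None, value) pair pops a detail without displaying it), and combine joins its non-empty arguments with a separator. A plausible reconstruction, offered only as a sketch:

from collections import OrderedDict

def dtls(*pairs):
    # drop unlabelled pairs and empty values so the output only carries
    # details that are actually present on the document
    return OrderedDict((label, value) for label, value in pairs
                       if label is not None and value not in (None, '', OrderedDict()))

def combine(*parts, **kwargs):
    # join the truthy parts with sep, defaulting to a single space;
    # returns None when nothing survives, so dtls will drop the row
    sep = kwargs.get('sep', ' ')
    return sep.join(str(part) for part in parts if part) or None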