Example #1
    def handle(self, **options):        
        if options.get('docket'):
            dockets = Docket.objects(id=options['docket'])
        elif options.get('agency'):
            dockets = Docket.objects(agency=options['agency'])
        else:
            dockets = Docket.objects()

        print "Enumerating dockets..."
        docket_list = list(dockets.only('id', 'agency'))
        docket_count = len(docket_list)
        counter = 0

        print "Beginning loading %s dockets at %s..." % (docket_count, datetime.now())

        if options['fork']:
            print "Using forking strategy..."
            import multiprocessing
            for docket in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                p = multiprocessing.Process(target=process_docket, args=[docket, options])
                p.start()
                p.join()
        else:
            print "Using single-process strategy..."
            for docket in docket_list:
                counter += 1
                print "Docket #%s / %s" % (counter, docket_count)

                process_docket(docket, options)

        print "Done."
Example #2
    def get(self, request, *args, **kwargs):
        out = super(AgencyView, self).get(request, *args, **kwargs).data

        agency = self.item.id

        for label, order in [('recent_dockets', '-stats.date_range.0'),
                             ('popular_dockets', '-stats.count')]:
            docket_query = {'agency': agency}

            # for SEC and CFTC, ignore Regulations.gov dockets here because they're often broken
            if agency in ('SEC', 'CFTC'):
                docket_query['source'] = 'sec_cftc'

            dockets = Docket.objects(**docket_query).order_by(order).only(
                'title', 'stats.date_range', 'stats.type_breakdown',
                'stats.count').limit(5)
            out[label] = [{
                'date_range': [
                    docket.stats['date_range'][0],
                    min(datetime.datetime.now(), docket.stats['date_range'][1])
                ],
                'count':
                docket.stats['count'],
                'comment_count':
                docket.stats['type_breakdown'].get('public_submission', 0),
                'title':
                docket.title,
                'id':
                docket.id
            } for docket in dockets]

        return Response(out)
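
The min(datetime.datetime.now(), ...) above clamps a docket's recorded end date so it never lies in the future. The same clamp as a standalone helper, assuming naive datetimes throughout (a sketch, not part of the original code):

import datetime

def clamp_date_range(date_range, now=None):
    # truncate [start, end] so the end never extends past the present
    now = now or datetime.datetime.now()
    return [date_range[0], min(now, date_range[1])]

print clamp_date_range([datetime.datetime(2012, 1, 1),
                        datetime.datetime(2099, 1, 1)])
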
Example #3
def ingest_single_parse(docket_id, deletions, insertions, parser):
    if parser not in ('sentence', '4-gram'):
        raise "Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser

    corpora = get_corpora_by_metadata('docket_id', docket_id)

    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) == 0:
        dockets = list(Docket.objects(id=docket_id).only('agency'))
        docket = dockets[0] if dockets else Docket()
        c = Corpus(metadata=dict(docket_id=docket_id, agency_id=docket.agency if docket.agency else docket_id.split("-")[0], parser=parser))
        print "Created new corpus #%s for %s parse." % (c.id, parser)
    
    elif len(parsed_corpora) == 1:
        c = parsed_corpora[0]
        print "Updating existing corpus #%s for %s parse." % (c.id, parser)
        
        print "Deleting documents at %s..." % datetime.now()
        c.delete_by_metadata('document_id', deletions + [d['metadata']['document_id'] for d in insertions])
    
    else:
        raise "More than one sentence parse for docket %s found. Shouldn't happen--will need ot manually remove extra corpora." % docket_id
    
    print "Inserting documents at %s..." % datetime.now()
    if parser == 'sentence':
        i = DocumentIngester(c, parser=sentence_parse, compute_similarities=False)
    elif parser == '4-gram':
        i = DocumentIngester(c, parser=ngram_parser(4), compute_similarities=True)
    i.ingest(insertions)

    print "Removing hierarchy, if cached, at %s..." % datetime.now()
    c.delete_hierarchy_cache()
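
A hedged usage sketch for ingest_single_parse: the docket and document ids are hypothetical, and only the metadata.document_id field of each insertion is confirmed by the code above (it is what delete_by_metadata reads); the text field is a guess at the payload shape.

insertions = [{
    'metadata': {'document_id': 'EPA-HQ-2012-0001-0002'},  # hypothetical id
    'text': 'Comment text to parse and ingest.'  # assumed payload field
}]
ingest_single_parse('EPA-HQ-2012-0001',  # hypothetical docket id
                    deletions=['EPA-HQ-2012-0001-0001'],
                    insertions=insertions,
                    parser='sentence')
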
Example #4
def ingest_single_parse(docket_id, deletions, insertions, parser):
    if parser not in ('sentence', '4-gram'):
        raise "Parser must be one of 'sentence' or '4-gram'. Got '%s'." % parser

    corpora = get_corpora_by_metadata('docket_id', docket_id)

    parsed_corpora = [c for c in corpora if c.metadata.get('parser') == parser]

    if len(parsed_corpora) == 0:
        dockets = list(Docket.objects(id=docket_id).only('agency'))
        docket = dockets[0] if dockets else Docket()
        c = Corpus(metadata=dict(docket_id=docket_id,
                                 agency_id=docket.agency if docket.agency
                                 else docket_id.split("-")[0],
                                 parser=parser))
        print "Created new corpus #%s for %s parse." % (c.id, parser)

    elif len(parsed_corpora) == 1:
        c = parsed_corpora[0]
        print "Updating existing corpus #%s for %s parse." % (c.id, parser)

        print "Deleting documents at %s..." % datetime.now()
        c.delete_by_metadata(
            'document_id',
            deletions + [d['metadata']['document_id'] for d in insertions])

    else:
        raise "More than one sentence parse for docket %s found. Shouldn't happen--will need ot manually remove extra corpora." % docket_id

    print "Inserting documents at %s..." % datetime.now()
    if parser == 'sentence':
        i = DocumentIngester(c,
                             parser=sentence_parse,
                             compute_similarities=False)
    elif parser == '4-gram':
        i = DocumentIngester(c,
                             parser=ngram_parser(4),
                             compute_similarities=True)
    i.ingest(insertions)

    print "Removing hierarchy, if cached, at %s..." % datetime.now()
    c.delete_hierarchy_cache()
Example #5
    def get(self, request, entity_id, docket_id, document_type, entity_type):
        dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
        ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))
        if not dkt_results or not ent_results:
            raise Http404('Not found.')

        docket = dkt_results[0]
        entity = ent_results[0]

        if document_type == 'mentions':
            docs_q = Doc.objects(Q(attachments__views__entities=entity_id) | Q(views__entities=entity_id), docket_id=docket_id)
        else:
            docs_q = Doc.objects(submitter_entities=entity_id, docket_id=docket_id)

        docs_q = docs_q.only('type', 'title', 'id', 'views', 'attachments.views', 'details.Date_Posted', 'deleted').hint([("docket_id", 1)])
        docs = filter(lambda d: not d.deleted, sorted(list(docs_q), key=lambda doc: doc.details.get('Date_Posted', datetime.datetime(1900,1,1)), reverse=True))

        get_views = lambda doc: [{
            'object_id': view.object_id,
            'file_type': view.type,
            'url': view.url.replace('inline', 'attachment')
        } for view in doc.views if entity_id in view.entities]

        out_docs = []
        for doc in docs[:10]:
            out_doc = {
                'title': doc.title,
                'id': doc.id,
                'date_posted': doc.details['Date_Posted'],
                'type': doc.type,
                'url': '/document/' + doc.id
            }
            if document_type == 'mentions':
                out_doc['files'] = get_views(doc) + list(itertools.chain.from_iterable([get_views(attachment) for attachment in doc.attachments]))

            out_docs.append(out_doc)

        return Response({
            'documents': out_docs,
            'has_more': len(docs) > 10,
            'count': len(docs),
            'document_search_url': "/search-document/" + \
                url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
                url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
            'docket': {
                'id': docket.id,
                'title': docket.title,
            },
            'entity': {
                'id': entity.id,
                'name': entity.aliases[0]
            },
            'filter_type': document_type
        })
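
The slice-plus-flag pagination above (docs[:10] together with has_more) generalizes to a small helper. A self-contained sketch of the same pattern:

def first_page(items, page_size=10):
    # materialize once, return the first page plus whether more remain
    items = list(items)
    return {'documents': items[:page_size],
            'has_more': len(items) > page_size,
            'count': len(items)}

print first_page(range(25))  # 10 documents, has_more True, count 25
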
Example #6
    def get(self, request, *args, **kwargs):
        out = super(AgencyView, self).get(request, *args, **kwargs).data

        agency = self.item.id

        for label, order in [('recent_dockets', '-stats.date_range.0'), ('popular_dockets', '-stats.count')]:
            dockets = Docket.objects(agency=agency).order_by(order).only('title', 'stats.date_range', 'stats.type_breakdown', 'stats.count').limit(5)
            out[label] = [{
                'date_range': docket.stats['date_range'],
                'count': docket.stats['count'],
                'comment_count': docket.stats['type_breakdown'].get('public_submission', 0),
                'title': docket.title,
                'id': docket.id
            } for docket in dockets]

        return Response(out)
Example #7
    def get(self, request, *args, **kwargs):
        out = super(AgencyView, self).get(request, *args, **kwargs).data

        agency = self.item.id

        for label, order in [('recent_dockets', '-stats.date_range.0'), ('popular_dockets', '-stats.count')]:
            docket_query = {'agency': agency}

            # for SEC and CFTC, ignore Regulations.gov dockets here because they're often broken
            if agency in ('SEC', 'CFTC'):
                docket_query['source'] = 'sec_cftc'

            dockets = Docket.objects(**docket_query).order_by(order).only('title', 'stats.date_range', 'stats.type_breakdown', 'stats.count').limit(5)
            out[label] = [{
                'date_range': [docket.stats['date_range'][0], min(datetime.datetime.now(), docket.stats['date_range'][1])],
                'count': docket.stats['count'],
                'comment_count': docket.stats['type_breakdown'].get('public_submission', 0),
                'title': docket.title,
                'id': docket.id
            } for docket in dockets]

        return Response(out)
Example #8
    def get(self, request, *args, **kwargs):
        "Access aggregate information about entities as they occur in regulations.gov data."
        results = Entity.objects(id=kwargs['entity_id'])
        if not results:
            raise Http404('Entity not found.')

        entity = results[0]

        # basic entity metadata
        out = {
            'name': entity.aliases[0],
            'url': reverse('entity-view', args=args, kwargs=kwargs),
            'id': entity.id,
            'type': entity.td_type,
            'stats': entity.stats
        }

        stats = entity.stats
        if stats:
            # cleanup, plus stitch on some additional data
            now = datetime.datetime.now().date()
            for mention_type in ["text_mentions", "submitter_mentions"]:
                stats[mention_type].update({
                    'months': [month for month in prettify_months(stats[mention_type]['months']) if month['date_range'][0] <= now] if stats[mention_type]['months'] else [],
                })

                # limit ourselves to the top ten of each match type, and grab their extra metadata
                agencies = sorted(stats[mention_type]['agencies'].items(), key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_agencies'] = [{
                    'id': item[0],
                    'count': item[1],
                    'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
                } for item in agencies]
                del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

                docket_list = stats[mention_type]['dockets'].items()
                years = request.GET.get('years', None)
                if years:
                    year_set = set(years.split(","))
                    docket_list = [item for item in docket_list if get_docket_year(item[0]) in year_set]
                dockets = sorted(docket_list, key=lambda x: x[1], reverse=True)[:10]

                stats[mention_type]['top_dockets'] = [{
                    'id': item[0],
                    'count': item[1]
                } for item in dockets]

                stats[mention_type]['docket_count'] = len(docket_list)
                del stats[mention_type]['dockets']

                stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(":".join(["mentioned" if mention_type == "text_mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]]))

            # grab additional docket metadata
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_dockets']] + [record['id'] for record in stats['text_mentions']['top_dockets']]))
            dockets_search = Docket.objects(id__in=ids).only('id', 'title', 'year', 'details.dk_type', 'agency', 'stats.date_range')
            dockets = dict([(docket.id, docket) for docket in dockets_search])

            # stitch this back onto the main records
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for docket in stats[mention_type]['top_dockets']:
                    rdocket = dockets[docket['id']]
                    docket.update({
                        'title': rdocket.title,
                        'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                        'year': rdocket.year if rdocket.year else (getattr(rdocket.stats['date_range'][0], 'year', None) if 'date_range' in rdocket.stats else None),
                        'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                        'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                    })

            # repeat for agencies
            ids = list(set([record['id'] for record in stats['submitter_mentions']['top_agencies']] + [record['id'] for record in stats['text_mentions']['top_agencies']]))
            agencies_search = Agency.objects(id__in=ids).only('id', 'name')
            agencies = dict([(agency.id, agency) for agency in agencies_search])

            # ...and stitch
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for agency in stats[mention_type]['top_agencies']:
                    ragency = agencies.get(agency['id'], None)
                    agency.update({
                        'name': ragency.name if ragency else agency['id'],
                        'url': '/agency/%s' % agency['id']
                    })

            # and for comments
            recent_comments = []
            if 'recent_comments' in stats['submitter_mentions']:
                recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['submitter_mentions']['recent_comments']]).only('id', 'title', 'details')
                for comment in recent_comments_search:
                    comment_item = {
                        'title': comment.title,
                        'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                        'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                        'organization': comment.details.get('Organization_Name', ''),
                        'url': '/document/' + comment.id
                    }
                    comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                    recent_comments.append(comment_item)

            stats['submitter_mentions']['recent_comments'] = recent_comments

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0}

        return Response(out)
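
Example #8 takes its top-ten slices with sorted(..., key=lambda x: x[1], reverse=True)[:10]. An equivalent that avoids fully sorting large dicts is heapq.nlargest; a sketch with hypothetical counts:

import heapq

counts = {'EPA': 42, 'SEC': 17, 'CFTC': 9, 'FCC': 30}  # hypothetical counts
top_two = heapq.nlargest(2, counts.items(), key=lambda item: item[1])
print top_two  # [('EPA', 42), ('FCC', 30)]
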
Example #9
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov documents."
        results = list(Doc.objects(id=kwargs['document_id']))
        if not results or results[0].deleted:
            raise Http404('Document not found.')

        document = results[0]

        # basic document metadata
        out = {
            'title': document.title,
            'url': reverse('document-view', kwargs=kwargs),
            'id': document.id,

            'agency': {
                'id': document.agency,
                'url': reverse('agency-view', kwargs={'agency': document.agency}),
                'name': Agency.objects(id=document.agency).only("name")[0].name
            },
            'date': document.details.get('Date_Posted', None),
            'type': document.type,
            'views': [],
            'attachments': [],
            'details': document.details if document.details else {}
        }

        # inter-dataset suppression
        if 'replaced_by' in document.suppression:
            new_kwargs = dict(kwargs)
            new_kwargs['document_id'] = document.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

        # comment-on metadata
        if document.comment_on:
            # if we don't have all the data built in, grab it from its original record
            comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(id=document.comment_on['document_id']).to_mongo()
            out['comment_on'] = {
                "fr_doc": comment_on_doc.get('fr_doc', False),  
                "type": comment_on_doc.get('type', None), 
                "id": document.comment_on['document_id'],
                'url': reverse('document-view', kwargs={'document_id': document.comment_on['document_id']}),
                "title": comment_on_doc['title']
            }
            if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
                out['comment_on']['agency'] = out['agency']
            else:
                out['comment_on']['agency'] = {
                    'id': comment_on_doc['agency'],
                    'url': reverse('agency-view', kwargs={'agency': comment_on_doc['agency']}),
                    'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
                }
        else:
            out['comment_on'] = {}

        # docket metadata
        docket = Docket.objects(id=document.docket_id)[0]
        out['docket'] = {
            'id': document.docket_id,
            'url': reverse('docket-view', kwargs={'docket_id': document.docket_id}),
            'title': docket.title,
            'weeks': [],
            'fr_docs': []
        }
        if docket.stats:
            out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
            out['docket']['fr_docs'] = docket.stats['doc_info'].get('fr_docs', [])

        if out['date']:
            out['date'] = out['date'].isoformat()

        text_entities = set()
        submitter_entities = set(document.submitter_entities if document.submitter_entities else [])
        
        # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
        for view in (document.views[i] for i in xrange(len(document.views))):
            # hack to deal with documents whose scrapes failed but still got extracted
            object_id = document.object_id if document.object_id else view.file_path.split('/')[-1].split('.')[0]
            out['views'].append({
                'object_id': object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'view'}) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)

        for attachment in (document.attachments[i] for i in xrange(len(document.attachments))):
            a = {
                'title': attachment.title,
                'views': []
            }
            for view in (attachment.views[i] for i in xrange(len(attachment.views))):
                a['views'].append({
                    'object_id': attachment.object_id,
                    'file_type': view.type,
                    'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                    'extracted': view.extracted == 'yes',
                    'url': view.download_url,
                    'html': reverse('raw-text-view', kwargs={'document_id': document.id, 'object_id': attachment.object_id, 'file_type': view.type, 'output_format': 'html', 'view_type': 'attachment'}) if view.extracted == 'yes' else None
                })

                for entity in view.entities:
                    text_entities.add(entity)
            out['attachments'].append(a)

        # stats for FR docs
        stats = document.stats if document.stats else {'count': 0}
        # limit ourselves to the top five of each match type, and grab their extra metadata
        for label in ['text_entities', 'submitter_entities']:
            stats['top_' + label] = [{
                'id': i[0],
                'count': i[1]
            } for i in sorted(stats.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:5]]
            if label in stats:
                del stats[label]
        top_entities = set([record['id'] for record in stats['top_text_entities']] + [record['id'] for record in stats['top_submitter_entities']])

        entities_search = Entity.objects(id__in=list(submitter_entities.union(text_entities, top_entities))).only('id', 'td_type', 'aliases')
        entities = dict([(entity.id, entity) for entity in entities_search])

        for label, items in [('submitter_entities', sorted(list(submitter_entities))), ('text_entities', sorted(list(text_entities)))]:
            out[label] = [{
                'id': item,
                'type': entities[item].td_type,
                'name': entities[item].aliases[0],
                'url': '/%s/%s/%s' % (entities[item].td_type, slugify(entities[item].aliases[0]), item)
            } for item in items]

        for label in ['top_text_entities', 'top_submitter_entities']:
            for entity in stats[label]:
                if not entities[entity['id']].td_type:
                    continue
                
                entity['type'] = entities[entity['id']].td_type
                entity['name'] = entities[entity['id']].aliases[0]
                entity['url'] = '/%s/%s/%s' % (entity['type'], slugify(entity['name']), entity['id'])

        if 'weeks' in stats:
            stats['weeks'] = prettify_weeks(stats['weeks'])

        recent_comments = []
        if 'recent_comments' in stats:
            recent_comments_search = Doc.objects(id__in=[doc['id'] for doc in stats['recent_comments']]).only('id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat() if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''), comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)

        stats['recent_comments'] = recent_comments

        out['comment_stats'] = stats

        # links upstream
        out['source'] = document.source
        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            for replaced in document.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        # cleaned-up details
        details = out['details'].copy()
        dp = lambda key, default=None: details.pop(key, default)
        out['clean_details'] = dtls(
            ('Submitter Information', dtls(
                ('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
                ('Organization', dp('Organization_Name')),
                ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
                ('Email Address', dp('Email_Address')),
                ('Phone Number', dp('Phone_Number')),
                ('Fax Number', dp('Fax_Number')),
                ("Submitter's Representative", dp('Submitter_s_Representative'))
            )),

            ('Dates and Times', dtls(
                ('Document Date', dp('Document_Date')), # rarely-used
                ('Date Received', dp('Received_Date')),
                ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
                ('Date Posted', dp('Date_Posted')),
                (None, dp('Date')), # Swallow this one, since it's always the same as Date_Posted
                ('Comment Period', combine(
                    short_date(force_date(dp('Comment_Start_Date'))),
                    short_date(force_date(dp('Comment_Due_Date'))),
                    sep="&ndash;"
                )),

                # all the other dates -- don't even know what most of these are
                ("File Date", dp("File_Date")),
                ("Answer Date", dp("Answer_Date")),
                ("Author Date", dp("Author_Date")),
                ("Author Document Date", dp("Author_Document_Date")),
                ("Effective Date", dp("Effective_Date")),
                ("Implementation Date", dp("Implementation_Date")),
                ("Implementation Service Date", dp("Implementation_Service_Date"))
            )),
            
            ('Citations and References', dtls(
                ("RIN", document.rin if document.rin else None),
                ("Federal Register No.", dp("Federal_Register_Number")),
                ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "&ndash;")),
                (None, dp("Page_Count")), # who cares?
                (None, dp("Page_Start")), # who cares?
                ("Federal Register Citation", dp("Federal_Register_Citation")),
                ("CFR Section(s)", dp("CFR")),
                ("Related RINs", dp("Related_RIN_s_")),
            )),
            
            ('Additional Details', dtls(*details.items()))
        )

        return Response(out)
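
The dp helper in the "cleaned-up details" block pops fields out of a copy of the raw details dict as it goes, so anything no dp() call claimed falls through to "Additional Details". The pop-and-collect-remainder idiom in isolation (field names hypothetical; plain lists stand in for the dtls/combine helpers):

details = {'First_Name': 'Jane', 'Last_Name': 'Doe', 'Mystery_Field': 'x'}
dp = lambda key, default=None: details.pop(key, default)

clean = [
    ('Name', ' '.join(filter(None, [dp('First_Name'), dp('Last_Name')]))),
    ('Additional Details', details.items()),  # only unclaimed keys remain
]
print clean  # [('Name', 'Jane Doe'), ('Additional Details', [('Mystery_Field', 'x')])]
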
Example #10
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        out['source'] = self.item.source
        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            if 'Source_URL' in self.item.details:
                out['upstream_urls'].append({
                    'url': self.item.details['Source_URL'],
                    'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
                })
            for replaced in self.item.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })


        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}
                        
                        if fr_doc.annotations.get('fr_data', None):
                            doc['summary'] = fr_doc.annotations['fr_data'].get('abstract', None)

                        if not doc.get('summary', None):
                            doc['summary'] = fr_doc.get_summary()

                        doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now()

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        doc['stats'] = {'count': 0, 'comments_open': False}
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                    stats['similar_dockets'] = [{
                        'id': docket,
                        'title': sd[docket]
                    } for docket in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]
        
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None
        
        if not agency:
            out['agency'] = None

        return Response(out)
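
The "remove duplicates, if any" loop that Examples #10, #14, and #15 share is an order-preserving dedup keyed on 'id'. As a standalone helper (sketch):

def dedup_by_id(docs):
    # keep the first occurrence of each id, preserving input order
    seen = set()
    out = []
    for doc in docs:
        if doc['id'] not in seen:
            out.append(doc)
            seen.add(doc['id'])
    return out

print dedup_by_id([{'id': 'a'}, {'id': 'b'}, {'id': 'a'}])  # keeps a, b
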
Example #11
    def get(self, request, entity_id, docket_id, document_type, entity_type):
        dkt_results = list(Docket.objects(id=docket_id).only('id', 'title'))
        ent_results = list(Entity.objects(id=entity_id).only('id', 'aliases'))
        if not dkt_results or not ent_results:
            raise Http404('Not found.')

        docket = dkt_results[0]
        entity = ent_results[0]

        if document_type == 'mentions':
            docs_q = Doc.objects(Q(attachments__views__entities=entity_id)
                                 | Q(views__entities=entity_id),
                                 docket_id=docket_id)
        else:
            docs_q = Doc.objects(submitter_entities=entity_id,
                                 docket_id=docket_id)

        docs_q = docs_q.only('type', 'title', 'id', 'views',
                             'attachments.views', 'details.Date_Posted',
                             'deleted').hint([("docket_id", 1)])
        docs = filter(
            lambda d: not d.deleted,
            sorted(list(docs_q),
                   key=lambda doc: doc.details.get(
                       'Date_Posted', datetime.datetime(1900, 1, 1)),
                   reverse=True))

        get_views = lambda doc: [
            {
                'object_id': view.object_id,
                'file_type': view.type,
                'url': view.url.replace('inline', 'attachment')
            } for view in doc.views if entity_id in view.entities
        ]

        out_docs = []
        for doc in docs[:10]:
            out_doc = {
                'title': doc.title,
                'id': doc.id,
                'date_posted': doc.details['Date_Posted'],
                'type': doc.type,
                'url': '/document/' + doc.id
            }
            if document_type == 'mentions':
                out_doc['files'] = get_views(doc) + list(
                    itertools.chain.from_iterable([
                        get_views(attachment) for attachment in doc.attachments
                    ]))

            out_docs.append(out_doc)

        return Response({
            'documents': out_docs,
            'has_more': len(docs) > 10,
            'count': len(docs),
            'document_search_url': "/search-document/" + \
                url_quote(":".join(["mentioned" if document_type == "mentions" else "submitter", entity.id, '"%s"' % entity.aliases[0]])) + \
                url_quote(":".join(["docket", docket.id, '"%s"' % docket.title])),
            'docket': {
                'id': docket.id,
                'title': docket.title,
            },
            'entity': {
                'id': entity.id,
                'name': entity.aliases[0]
            },
            'filter_type': document_type
        })
Example #12
    def get(self, request, *args, **kwargs):
        "Access aggregate information about entities as they occur in regulations.gov data."
        results = Entity.objects(id=kwargs['entity_id'])
        if not results:
            raise Http404('Entity not found.')

        entity = results[0]

        # basic entity metadata
        out = {
            'name': entity.aliases[0],
            'url': reverse('entity-view', args=args, kwargs=kwargs),
            'id': entity.id,
            'type': entity.td_type,
            'stats': entity.stats
        }

        stats = entity.stats
        if stats:
            # cleanup, plus stitch on some additional data
            now = datetime.datetime.now().date()
            for mention_type in ["text_mentions", "submitter_mentions"]:
                stats[mention_type].update({
                    'months': [month for month in prettify_months(stats[mention_type]['months'])
                               if month['date_range'][0] <= now]
                              if stats[mention_type]['months'] else [],
                })

                # limit ourselves to the top ten of each match type, and grab their extra metadata
                agencies = sorted(stats[mention_type]['agencies'].items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:10]

                stats[mention_type]['top_agencies'] = [{
                    'id': item[0],
                    'count': item[1],
                    'months': prettify_months(stats[mention_type]['agencies_by_month'][item[0]])
                } for item in agencies]
                del stats[mention_type]['agencies'], stats[mention_type]['agencies_by_month']

                docket_list = stats[mention_type]['dockets'].items()
                years = request.GET.get('years', None)
                if years:
                    year_set = set(years.split(","))
                    docket_list = [
                        item for item in docket_list
                        if get_docket_year(item[0]) in year_set
                    ]
                dockets = sorted(docket_list, key=lambda x: x[1],
                                 reverse=True)[:10]

                stats[mention_type]['top_dockets'] = [{
                    'id': item[0],
                    'count': item[1]
                } for item in dockets]

                stats[mention_type]['docket_count'] = len(docket_list)
                del stats[mention_type]['dockets']

                stats[mention_type]['docket_search_url'] = "/search-docket/" + url_quote(
                    ":".join(["mentioned" if mention_type == "text_mentions" else "submitter",
                              entity.id, '"%s"' % entity.aliases[0]]))

            # grab additional docket metadata
            ids = list(
                set([
                    record['id']
                    for record in stats['submitter_mentions']['top_dockets']
                ] + [
                    record['id']
                    for record in stats['text_mentions']['top_dockets']
                ]))
            dockets_search = Docket.objects(id__in=ids).only(
                'id', 'title', 'year', 'details.dk_type', 'agency',
                'stats.date_range')
            dockets = dict([(docket.id, docket) for docket in dockets_search])

            # stitch this back onto the main records
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for docket in stats[mention_type]['top_dockets']:
                    rdocket = dockets[docket['id']]
                    docket.update({
                        'title': rdocket.title,
                        'url': reverse('docket-view', kwargs={'docket_id': rdocket.id}),
                        'year': rdocket.year if rdocket.year else
                                (getattr(rdocket.stats['date_range'][0], 'year', None)
                                 if 'date_range' in rdocket.stats else None),
                        'rulemaking': rdocket.details.get('Type', 'Nonrulemaking').lower() == 'rulemaking',
                        'agency': rdocket.agency if rdocket.agency else re.split("[-_]", rdocket.id)[0]
                    })

            # repeat for agencies
            ids = list(
                set([
                    record['id']
                    for record in stats['submitter_mentions']['top_agencies']
                ] + [
                    record['id']
                    for record in stats['text_mentions']['top_agencies']
                ]))
            agencies_search = Agency.objects(id__in=ids).only('id', 'name')
            agencies = dict([(agency.id, agency)
                             for agency in agencies_search])

            # ...and stitch
            for mention_type in ['text_mentions', 'submitter_mentions']:
                for agency in stats[mention_type]['top_agencies']:
                    ragency = agencies.get(agency['id'], None)
                    agency.update({
                        'name': ragency.name if ragency else agency['id'],
                        'url': '/agency/%s' % agency['id']
                    })

            # and for comments
            recent_comments = []
            if 'recent_comments' in stats['submitter_mentions']:
                recent_comments_search = Doc.objects(id__in=[
                    doc['id']
                    for doc in stats['submitter_mentions']['recent_comments']
                ]).only('id', 'title', 'details')
                for comment in recent_comments_search:
                    comment_item = {
                        'title': comment.title,
                        'date': comment.details['Date_Posted'].date().isoformat()
                                if 'Date_Posted' in comment.details else None,
                        'author': " ".join([comment.details.get('First_Name', ''),
                                            comment.details.get('Last_Name', '')]).strip(),
                        'organization': comment.details.get('Organization_Name', ''),
                        'url': '/document/' + comment.id
                    }
                    comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                    recent_comments.append(comment_item)

            stats['submitter_mentions']['recent_comments'] = recent_comments

            out['stats'] = stats
        else:
            out['stats'] = {'count': 0}

        return Response(out)
Example #13
    def get(self, request, *args, **kwargs):
        "Access basic metadata about regulations.gov documents."
        results = list(Doc.objects(id=kwargs['document_id']))
        if not results or results[0].deleted:
            raise Http404('Document not found.')

        document = results[0]

        # basic document metadata
        out = {
            'title': document.title,
            'url': reverse('document-view', kwargs=kwargs),
            'id': document.id,
            'agency': {
                'id': document.agency,
                'url': reverse('agency-view',
                               kwargs={'agency': document.agency}),
                'name': Agency.objects(id=document.agency).only("name")[0].name
            },
            'date': document.details.get('Date_Posted', None),
            'type': document.type,
            'views': [],
            'attachments': [],
            'details': document.details if document.details else {}
        }

        # inter-dataset suppression
        if 'replaced_by' in document.suppression:
            new_kwargs = dict(kwargs)
            new_kwargs['document_id'] = document.suppression['replaced_by'][0]
            out['redirect_to'] = reverse('document-view', kwargs=new_kwargs)

        # comment-on metadata
        if document.comment_on:
            # if we don't have all the data built in, grab it from its original record
            comment_on_doc = document.comment_on if 'title' in document.comment_on else Doc.objects.get(
                id=document.comment_on['document_id']).to_mongo()
            out['comment_on'] = {
                "fr_doc": comment_on_doc.get('fr_doc', False),
                "type": comment_on_doc.get('type', None),
                "id": document.comment_on['document_id'],
                'url': reverse('document-view',
                               kwargs={'document_id': document.comment_on['document_id']}),
                "title": comment_on_doc['title']
            }
            if comment_on_doc['agency'] == out['agency']['id'] or not comment_on_doc['agency']:
                out['comment_on']['agency'] = out['agency']
            else:
                out['comment_on']['agency'] = {
                    'id': comment_on_doc['agency'],
                    'url': reverse('agency-view',
                                   kwargs={'agency': comment_on_doc['agency']}),
                    'name': Agency.objects(id=comment_on_doc['agency']).only("name")[0].name
                }
        else:
            out['comment_on'] = {}

        # docket metadata
        docket = Docket.objects(id=document.docket_id)[0]
        out['docket'] = {
            'id': document.docket_id,
            'url': reverse('docket-view',
                           kwargs={'docket_id': document.docket_id}),
            'title': docket.title,
            'weeks': [],
            'fr_docs': []
        }
        if docket.stats:
            out['docket']['weeks'] = prettify_weeks(docket.stats['weeks'])
            out['docket']['fr_docs'] = docket.stats['doc_info'].get(
                'fr_docs', [])

        if out['date']:
            out['date'] = out['date'].isoformat()

        text_entities = set()
        submitter_entities = set(
            document.submitter_entities if document.submitter_entities else [])

        # a weird thing happens with iterating over mongoengine lists where they lose references to their parent instances, so do this weird generator thing
        for view in (document.views[i] for i in xrange(len(document.views))):
            # hack to deal with documents whose scrapes failed but still got extracted
            object_id = document.object_id if document.object_id else view.file_path.split(
                '/')[-1].split('.')[0]
            out['views'].append({
                'object_id': object_id,
                'file_type': view.type,
                'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                'extracted': view.extracted == 'yes',
                'url': view.download_url,
                'html': reverse('raw-text-view',
                                kwargs={
                                    'document_id': document.id,
                                    'file_type': view.type,
                                    'output_format': 'html',
                                    'view_type': 'view'
                                }) if view.extracted == 'yes' else None
            })

            for entity in view.entities:
                text_entities.add(entity)

        for attachment in (document.attachments[i]
                           for i in xrange(len(document.attachments))):
            a = {'title': attachment.title, 'views': []}
            for view in (attachment.views[i]
                         for i in xrange(len(attachment.views))):
                a['views'].append({
                    'object_id': attachment.object_id,
                    'file_type': view.type,
                    'file_type_label': TYPE_LABELS.get(view.type, view.type.upper()),
                    'extracted': view.extracted == 'yes',
                    'url': view.download_url,
                    'html': reverse('raw-text-view',
                                    kwargs={
                                        'document_id': document.id,
                                        'object_id': attachment.object_id,
                                        'file_type': view.type,
                                        'output_format': 'html',
                                        'view_type': 'attachment'
                                    }) if view.extracted == 'yes' else None
                })

                for entity in view.entities:
                    text_entities.add(entity)
            out['attachments'].append(a)

        # stats for FR docs
        stats = document.stats if document.stats else {'count': 0}
        # limit ourselves to the top five of each match type, and grab their extra metadata
        for label in ['text_entities', 'submitter_entities']:
            stats['top_' + label] = [{
                'id': i[0],
                'count': i[1]
            } for i in sorted(stats.get(label, {}).items(),
                              key=lambda x: x[1],
                              reverse=True)[:5]]
            if label in stats:
                del stats[label]
        top_entities = set(
            [record['id'] for record in stats['top_text_entities']] +
            [record['id'] for record in stats['top_submitter_entities']])

        entities_search = Entity.objects(id__in=list(
            submitter_entities.union(text_entities, top_entities))).only(
                'id', 'td_type', 'aliases')
        entities = dict([(entity.id, entity) for entity in entities_search])

        for label, items in [('submitter_entities',
                              sorted(list(submitter_entities))),
                             ('text_entities', sorted(list(text_entities)))]:
            out[label] = [{
                'id': item,
                'type': entities[item].td_type,
                'name': entities[item].aliases[0],
                'url': '/%s/%s/%s' % (entities[item].td_type,
                                      slugify(entities[item].aliases[0]), item)
            } for item in items]

        for label in ['top_text_entities', 'top_submitter_entities']:
            for entity in stats[label]:
                if not entities[entity['id']].td_type:
                    continue

                entity['type'] = entities[entity['id']].td_type
                entity['name'] = entities[entity['id']].aliases[0]
                entity['url'] = '/%s/%s/%s' % (
                    entity['type'], slugify(entity['name']), entity['id'])

        if 'weeks' in stats:
            stats['weeks'] = prettify_weeks(stats['weeks'])

        recent_comments = []
        if 'recent_comments' in stats:
            recent_comments_search = Doc.objects(
                id__in=[doc['id'] for doc in stats['recent_comments']]).only(
                    'id', 'title', 'details')
            for comment in recent_comments_search:
                comment_item = {
                    'title': comment.title,
                    'date': comment.details['Date_Posted'].date().isoformat()
                            if 'Date_Posted' in comment.details else None,
                    'author': " ".join([comment.details.get('First_Name', ''),
                                        comment.details.get('Last_Name', '')]).strip(),
                    'organization': comment.details.get('Organization_Name', ''),
                    'url': '/document/' + comment.id
                }
                comment_item['author'] = comment_item['author'] if comment_item['author'] else None
                recent_comments.append(comment_item)

        stats['recent_comments'] = recent_comments

        out['comment_stats'] = stats

        # links upstream
        out['source'] = document.source
        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!documentDetail;D=' + document.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            for replaced in document.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!documentDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        # cleaned-up details
        details = out['details'].copy()
        dp = lambda key, default=None: details.pop(key, default)
        out['clean_details'] = dtls(
            ('Submitter Information', dtls(
                ('Name', combine(dp('First_Name'), dp('Middle_Name'), dp('Last_Name'))),
                ('Organization', dp('Organization_Name')),
                ('Location', combine(dp('Mailing_Address'), dp('Mailing_Address_'), dp('City'), expand_state(dp('State_or_Province')), dp('Postal_Code'), dp('Country'), sep=", ")),
                ('Email Address', dp('Email_Address')),
                ('Phone Number', dp('Phone_Number')),
                ('Fax Number', dp('Fax_Number')),
                ("Submitter's Representative", dp('Submitter_s_Representative'))
            )),

            ('Dates and Times', dtls(
                ('Document Date', dp('Document_Date')), # rarely-used
                ('Date Received', dp('Received_Date')),
                ('Postmark Date', dp('Postmark_Date', dp('Post_Mark_Date'))),
                ('Date Posted', dp('Date_Posted')),
                (None, dp('Date')), # swallow this one, since it's always the same as Date_Posted
                ('Comment Period', combine(
                    short_date(force_date(dp('Comment_Start_Date'))),
                    short_date(force_date(dp('Comment_Due_Date'))),
                    sep="&ndash;"
                )),

                # all the other dates -- don't even know what most of these are
                ("File Date", dp("File_Date")),
                ("Answer Date", dp("Answer_Date")),
                ("Author Date", dp("Author_Date")),
                ("Author Document Date", dp("Author_Document_Date")),
                ("Effective Date", dp("Effective_Date")),
                ("Implementation Date", dp("Implementation_Date")),
                ("Implementation Service Date", dp("Implementation_Service_Date"))
            )),

            ('Citations and References', dtls(
                ("RIN", document.rin if document.rin else None),
                ("Federal Register No.", dp("Federal_Register_Number")),
                ("Federal Register Pages", dp("Start_End_Page", "").replace(" - ", "&ndash;")),
                (None, dp("Page_Count")), # who cares?
                (None, dp("Page_Start")), # who cares?
                ("Federal Register Citation", dp("Federal_Register_Citation")),
                ("CFR Section(s)", dp("CFR")),
                ("Related RINs", dp("Related_RIN_s_")),
            )),

            ('Additional Details', dtls(*details.items()))
        )

        return Response(out)
Example #14
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        out['source'] = self.item.source
        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        out['upstream_urls'] = []
        if out['source'] == 'regulations.gov':
            out['upstream_urls'].append({
                'url': 'http://www.regulations.gov/#!docketDetail;D=' + self.item.id,
                'label': 'Regulations.gov'
            })
        elif out['source'] == 'sec_cftc':
            if 'Source_URL' in self.item.details:
                out['upstream_urls'].append({
                    'url': self.item.details['Source_URL'],
                    'label': 'SEC.gov' if self.item.agency == 'SEC' else 'CFTC.gov'
                })
            for replaced in self.item.suppression.get('replaces', []):
                out['upstream_urls'].append({
                    'url': 'http://www.regulations.gov/#!docketDetail;D=' + replaced,
                    'label': 'Regulations.gov'
                })

        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}

                        if fr_doc.annotations.get('fr_data', None):
                            doc['summary'] = fr_doc.annotations['fr_data'].get(
                                'abstract', None)

                        if not doc.get('summary', None):
                            doc['summary'] = fr_doc.get_summary()

                        doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and force_date(fr_doc.details['Comment_Due_Date']) > datetime.datetime.now()

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        doc['stats'] = {'count': 0, 'comments_open': False}
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(
                    summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title)
                               for docket in Docket.objects(
                                   id__in=similar_dockets).only('id', 'title')
                               ])
                    stats['similar_dockets'] = [{
                        'id': docket,
                        'title': sd[docket]
                    } for docket in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]

        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None

        if not agency:
            out['agency'] = None

        return Response(out)
Example #15
    def get(self, request, *args, **kwargs):
        out = super(DocketView, self).get(request, *args, **kwargs).data

        stats = out['stats']
        stats['similar_dockets'] = []
        summaries = []

        if stats['count'] > 0:
            # do a similar thing with FR documents
            if stats.get('doc_info', {}).get('fr_docs', None):
                fr_doc_ids = [doc['id'] for doc in stats['doc_info']['fr_docs']]
                fr_search = Doc.objects(id__in=fr_doc_ids)
                fr_docs = dict([(fr_doc.id, fr_doc) for fr_doc in fr_search])

                for doc in stats['doc_info']['fr_docs']:
                    if doc['id'] in fr_docs:
                        fr_doc = fr_docs[doc['id']]
                        doc['stats'] = {
                            'date_range': fr_doc.stats['date_range'],
                            'count': fr_doc.stats['count']
                        } if fr_doc.stats else {'count': 0}
                        doc['summary'] = fr_doc.get_summary()
                        doc['comments_open'] = 'Comment_Due_Date' in fr_doc.details and fr_doc.details['Comment_Due_Date'] > datetime.datetime.now()

                        if doc['summary']:
                            summaries.append(doc['summary'])
                    else:
                        doc['stats'] = {'count': 0, 'comments_open': False}
                        doc['summary'] = None

                # remove duplicates, if any
                tmp = stats['doc_info']['fr_docs']
                included = set()
                stats['doc_info']['fr_docs'] = []
                for doc in tmp:
                    if doc['id'] not in included:
                        stats['doc_info']['fr_docs'].append(doc)
                        included.add(doc['id'])

            summary_text = "\n".join(summaries)
            if summary_text:
                similar_dockets = get_similar_dockets(summary_text, kwargs[self.aggregation_field])[:3]
                if similar_dockets:
                    sd = dict([(docket.id, docket.title) for docket in Docket.objects(id__in=similar_dockets).only('id', 'title')])
                    stats['similar_dockets'] = [{
                        'id': docket,
                        'title': sd[docket]
                    } for docket in similar_dockets]

        agency = self.item.agency
        if not agency:
            agency = re.split("[-_]", self.item.id)[0]
        
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                agency = None
        
        if not agency:
            out['agency'] = None

        return Response(out)