コード例 #1
0
ファイル: clustering.py プロジェクト: pombredanne/sparerib
    def get(self, request, item_id, item_type="docket"):
        """Return cluster-count teasers for a docket or a single document.

        ``item_id`` is a docket id unless ``item_type`` is "document", in
        which case the document's docket is looked up first.  The response
        always carries ``docket_teaser`` (cluster counts at the 0.5 and 0.8
        cutoffs); for documents it also carries ``document_teaser``, which is
        ``None`` when the corpus has no entry for the document and otherwise
        a dict with the size of the document's cluster at each cutoff where
        one was found.
        """
        is_document = item_type == "document"

        # Record the docket id on the view before self.corpus is touched.
        # NOTE(review): presumably the corpus lookup reads it -- confirm.
        if is_document:
            docket_id = Doc.objects.only("docket_id").get(id=item_id).docket_id
        else:
            docket_id = item_id
        self.kwargs['docket_id'] = docket_id

        hierarchy = self.corpus.hierarchy()

        docket_teaser = {}
        for label, cutoff in (('0.5', 0.5), ('0.8', 0.8)):
            docket_teaser[label] = {
                'count': self._count_clusters(hierarchy, cutoff)
            }
        out = {'docket_teaser': docket_teaser}

        if not is_document:
            return Response(out)

        docs = self.corpus.docs_by_metadata('document_id', item_id)
        if not docs:
            out['document_teaser'] = None
            return Response(out)

        doc_id = docs[0]
        teaser = {}
        loose = find_doc_in_hierarchy(hierarchy, doc_id, 0.5)
        if loose:
            teaser['0.5'] = {'count': loose['size'], 'id': doc_id}
            # The 0.8 cutoff is only consulted once a 0.5 cluster was found.
            tight = find_doc_in_hierarchy(hierarchy, doc_id, 0.8)
            if tight:
                teaser['0.8'] = {'count': tight['size'], 'id': doc_id}
        out['document_teaser'] = teaser

        return Response(out)
コード例 #2
0
    def get(self, request, docket_id):
        """Return the cluster hierarchy and summary statistics for a docket.

        Query parameters read from ``request.GET``:

        * ``require_summaries`` -- forwarded to ``self.corpus.hierarchy`` as
          a boolean; true only for the (case-insensitive) string "true".
        * ``prepopulate_document`` -- optional integer document id to
          prepopulate.  A missing, non-numeric or negative value is treated
          as "not requested".

        The response contains ``cluster_hierarchy`` (top-level clusters,
        largest first), ``stats`` and ``prepopulate`` (``None`` when nothing
        is clustered).
        """
        docket = Docket.objects.get(id=docket_id)

        require_summaries = (
            request.GET.get('require_summaries', "").lower() == "true")
        hierarchy = self.corpus.hierarchy(require_summaries)
        total_clustered = sum(cluster['size'] for cluster in hierarchy)

        stats = docket.stats
        out = {
            'cluster_hierarchy':
            sorted(hierarchy, key=lambda x: x['size'], reverse=True),
            'stats': {
                'clustered': total_clustered,
                # Only computable when the docket records a total count.
                'unclustered': (stats['count'] - total_clustered
                                if 'count' in stats else None),
                'date_range': (stats['date_range']
                               if 'date_range' in stats else None),
            },
            'prepopulate': None,
        }

        # populate agency info; fall back to the prefix of the docket id
        # when the docket has no explicit agency
        agency = docket.agency or docket.id.split("-")[0]
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['stats']['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                out['stats']['agency'] = None

        # The query string is client-controlled: a non-numeric value must not
        # raise (and surface as a server error), so it falls back to the same
        # sentinel as an absent parameter.
        raw_prepop = request.GET.get('prepopulate_document')
        try:
            prepop = int(raw_prepop) if raw_prepop is not None else -1
        except ValueError:
            prepop = -1

        if prepop > -1:
            pp_cluster = find_doc_in_hierarchy(hierarchy, prepop, self.cutoff)
            if pp_cluster:
                out['prepopulate'] = {
                    'document': prepop,
                    'cluster': pp_cluster['name'],
                    'cutoff': self.cutoff
                }

        # choose a cluster and document to prepopulate if one hasn't been
        # requested (or the requested one was not found): use the largest
        # top-level cluster and its first member
        if not out['prepopulate'] and out['stats']['clustered'] > 0:
            largest = out['cluster_hierarchy'][0]
            pp_cluster = find_doc_in_hierarchy(
                hierarchy, largest['name'], largest['cutoff'])
            out['prepopulate'] = {
                'document': pp_cluster['members'][0],
                'cluster': pp_cluster['name'],
                'cutoff': largest['cutoff']
            }

        # Runs last, after prepopulation has read 'members'.
        # NOTE(review): presumably strips member lists from the payload --
        # helper not shown here.
        remove_members(out['cluster_hierarchy'])

        return Response(out)
コード例 #3
0
ファイル: clustering.py プロジェクト: pombredanne/sparerib
    def get(self, request, docket_id):
        """Return the cluster hierarchy and summary statistics for a docket.

        Query parameters read from ``request.GET``:

        * ``require_summaries`` -- forwarded to ``self.corpus.hierarchy`` as
          a boolean; true only for the (case-insensitive) string "true".
        * ``prepopulate_document`` -- optional integer document id to
          prepopulate.  A missing, non-numeric or negative value is treated
          as "not requested".

        The response contains ``cluster_hierarchy`` (top-level clusters,
        largest first), ``stats`` and ``prepopulate`` (``None`` when nothing
        is clustered).  ``stats['agency']`` is only present when the docket
        has an agency set.
        """
        docket = Docket.objects.get(id=docket_id)

        require_summaries = request.GET.get('require_summaries', "").lower() == "true"
        hierarchy = self.corpus.hierarchy(require_summaries)
        total_clustered = sum(cluster['size'] for cluster in hierarchy)

        stats = docket.stats
        out = {
            'cluster_hierarchy': sorted(hierarchy, key=lambda x: x['size'], reverse=True),
            'stats': {
                'clustered': total_clustered,
                # Only computable when the docket records a total count.
                'unclustered': stats['count'] - total_clustered if 'count' in stats else None,
                'date_range': stats['date_range'] if 'date_range' in stats else None
            },
            'prepopulate': None
        }

        # populate agency info
        agency = docket.agency
        if agency:
            agency_meta = list(Agency.objects(id=agency).only("name"))
            if agency_meta:
                out['stats']['agency'] = {
                    'id': agency,
                    'name': agency_meta[0].name,
                    'url': '/agency/%s' % agency
                }
            else:
                out['stats']['agency'] = None

        # The query string is client-controlled: a non-numeric value must not
        # raise (and surface as a server error), so it falls back to the same
        # sentinel as an absent parameter.
        raw_prepop = request.GET.get('prepopulate_document')
        try:
            prepop = int(raw_prepop) if raw_prepop is not None else -1
        except ValueError:
            prepop = -1

        if prepop > -1:
            pp_cluster = find_doc_in_hierarchy(hierarchy, prepop, self.cutoff)
            if pp_cluster:
                out['prepopulate'] = {
                    'document': prepop,
                    'cluster': pp_cluster['name'],
                    'cutoff': self.cutoff
                }

        # choose a cluster and document to prepopulate if one hasn't been
        # requested (or the requested one was not found): use the largest
        # top-level cluster and its first member
        if not out['prepopulate'] and out['stats']['clustered'] > 0:
            largest = out['cluster_hierarchy'][0]
            pp_cluster = find_doc_in_hierarchy(hierarchy, largest['name'], largest['cutoff'])
            out['prepopulate'] = {
                'document': pp_cluster['members'][0],
                'cluster': pp_cluster['name'],
                'cutoff': largest['cutoff']
            }

        # Runs last, after prepopulation has read 'members'.
        # NOTE(review): presumably strips member lists from the payload --
        # helper not shown here.
        remove_members(out['cluster_hierarchy'])

        return Response(out)
コード例 #4
0
ファイル: clustering.py プロジェクト: apendleton/sparerib
    def get(self, request, docket_id, cluster_id):
        """List the documents belonging to one cluster of a docket.

        ``cluster_id`` is converted to ``int`` and resolved with
        ``find_doc_in_hierarchy`` at ``self.cutoff``.  The response carries
        the cluster's ``id`` (its ``name``) and, for every member, the
        document id, its title (``None`` when the metadata has no title) and
        a comma-separated submitter string built from whichever of
        ``submitter_name`` / ``submitter_organization`` are present and
        non-empty.
        """
        cluster_id = int(cluster_id)

        h = self.corpus.hierarchy()
        cluster = find_doc_in_hierarchy(h, cluster_id, self.cutoff)

        # consider caching for very large clusters
        members = tuple(cluster['members'])
        if len(members) > 1000:
            # The member hash is part of the key, so a change in cluster
            # membership results in a different cache entry.
            key = 'sparerib_api.clustering.cluster-%s-%s-%s' % (docket_id, cluster_id, hash(members))

            metadatas = cache.get(key)
            if not metadatas:
                metadatas = dict(self.corpus.doc_metadatas(cluster['members']))
                cache.set(key, metadatas)
        else:
            # it's little; don't cache
            metadatas = dict(self.corpus.doc_metadatas(cluster['members']))

        documents = []
        for doc_id in cluster['members']:
            meta = metadatas[doc_id]
            submitter_parts = [
                meta[field]
                for field in ['submitter_name', 'submitter_organization']
                if field in meta and meta[field]
            ]
            documents.append({
                'id': doc_id,
                # Title is optional metadata; a missing title must not fail
                # the whole listing.
                'title': meta.get('title'),
                'submitter': ', '.join(submitter_parts)
            })

        return Response({
            'id': cluster['name'],
            'documents': documents
        })
コード例 #5
0
    def get(self, request, item_id, item_type="docket"):
        """Build the clustering teaser payload for a docket or a document.

        For ``item_type == "document"`` the owning docket is resolved from
        the document record; otherwise ``item_id`` is used as the docket id.
        ``docket_teaser`` holds the cluster counts at cutoffs 0.5 and 0.8.
        For documents, ``document_teaser`` is added: ``None`` if the corpus
        does not know the document, else a dict keyed by cutoff with the
        size of the cluster containing it (empty if it is unclustered).
        """
        for_document = (item_type == "document")

        if for_document:
            parent = Doc.objects.only("docket_id").get(id=item_id)
            self.kwargs['docket_id'] = parent.docket_id
        else:
            self.kwargs['docket_id'] = item_id

        tree = self.corpus.hierarchy()

        count_05 = self._count_clusters(tree, 0.5)
        count_08 = self._count_clusters(tree, 0.8)
        payload = {
            'docket_teaser': {
                '0.5': {'count': count_05},
                '0.8': {'count': count_08},
            }
        }

        if for_document:
            doc_teaser = None
            matches = self.corpus.docs_by_metadata('document_id', item_id)
            if matches:
                doc_id = matches[0]
                doc_teaser = {}

                at_05 = find_doc_in_hierarchy(tree, doc_id, 0.5)
                if at_05:
                    doc_teaser['0.5'] = {
                        'count': at_05['size'],
                        'id': doc_id,
                    }
                    # Tighter cutoff is only checked after a 0.5 match.
                    at_08 = find_doc_in_hierarchy(tree, doc_id, 0.8)
                    if at_08:
                        doc_teaser['0.8'] = {
                            'count': at_08['size'],
                            'id': doc_id,
                        }
            payload['document_teaser'] = doc_teaser

        return Response(payload)
コード例 #6
0
    def get(self, request, docket_id, cluster_id, document_id):
        """Return a document with its text shaded by phrase overlap.

        The document text is split into runs of characters that share the
        same overlap count (the highest ``count`` of any phrase covering the
        character, as reported by ``self.corpus.phrase_overlap`` against the
        cluster's members).  Each run becomes a ``<span>`` whose background
        alpha is that count divided by the cluster size, rounded to two
        decimals.  Newlines are rendered as ``<br />``.

        The response holds ``metadata`` (title, submitter, document_id),
        ``frequency_html`` and ``truncated``.
        """
        # Imported locally (the local name ``html`` is used below); available
        # in the standard library on both Python 2 and 3.
        from xml.sax.saxutils import escape

        document_id = int(document_id)
        cluster_id = int(cluster_id)

        h = self.corpus.hierarchy()
        cluster = find_doc_in_hierarchy(h, cluster_id, self.cutoff)['members']

        doc = self.corpus.doc(document_id)
        text = doc['text']
        raw_phrases = self.corpus.phrase_overlap(document_id, cluster)

        # One counter per character of the text.
        frequencies = numpy.zeros(len(text), 'l')
        for phrase in raw_phrases.values():
            for occurrence in phrase['indexes']:
                # Overlapping phrases: keep the larger count per character.
                frequencies[occurrence.start:occurrence.end] = numpy.maximum(
                    frequencies[occurrence.start:occurrence.end],
                    phrase['count'])

        # Collapse the per-character counts into (count, run_length) pairs.
        freq_ranges = [(f[0], len(list(f[1])))
                       for f in itertools.groupby(frequencies)]
        cluster_size = float(len(cluster))

        spans = []
        cursor = 0
        for count, length in freq_ranges:
            chunk = text[cursor:cursor + length]
            cursor += length
            # The document text is not markup: escape &, < and > so it cannot
            # inject tags or scripts into the generated HTML.
            spans.append(
                '<span style="background-color:rgba(160,211,216,%s)">%s</span>'
                % (round(count / cluster_size, 2), escape(chunk)))

        html = ''.join(spans)
        html = html.replace("\n", "<br />")

        metadata = doc['metadata']
        return Response({
            'metadata': {
                'title':
                metadata.get('title', None),
                'submitter':
                ', '.join([
                    metadata[field]
                    for field in ['submitter_name', 'submitter_organization']
                    if field in metadata and metadata[field]
                ]),
                'document_id':
                metadata.get('document_id', None),
            },
            'frequency_html': html,
            # NOTE(review): presumably the corpus caps stored text at 10000
            # characters, so hitting exactly that length signals truncation
            # -- confirm.
            'truncated': len(doc['text']) == 10000
        })
コード例 #7
0
ファイル: clustering.py プロジェクト: pombredanne/sparerib
    def get(self, request, docket_id, cluster_id):
        """List the documents belonging to one cluster of a docket.

        ``cluster_id`` is converted to ``int`` and resolved with
        ``find_doc_in_hierarchy`` at ``self.cutoff``.  The response carries
        the cluster's ``id`` (its ``name``) and, for every member, the
        document id, its title (``None`` when the metadata has no title) and
        a comma-separated submitter string built from whichever of
        ``submitter_name`` / ``submitter_organization`` are present and
        non-empty.
        """
        cluster_id = int(cluster_id)

        h = self.corpus.hierarchy()
        cluster = find_doc_in_hierarchy(h, cluster_id, self.cutoff)

        metadatas = dict(self.corpus.doc_metadatas(cluster['members']))

        documents = []
        for doc_id in cluster['members']:
            meta = metadatas[doc_id]
            submitter_parts = [
                meta[field]
                for field in ['submitter_name', 'submitter_organization']
                if field in meta and meta[field]
            ]
            documents.append({
                'id': doc_id,
                # Title is optional metadata (the single-document view also
                # reads it with .get); a missing title must not fail the
                # whole listing.
                'title': meta.get('title'),
                'submitter': ', '.join(submitter_parts)
            })

        return Response({
            'id': cluster['name'],
            'documents': documents
        })
コード例 #8
0
    def get(self, request, docket_id, cluster_id):
        """List the documents belonging to one cluster of a docket.

        ``cluster_id`` is converted to ``int`` and resolved with
        ``find_doc_in_hierarchy`` at ``self.cutoff``.  Metadata for clusters
        with more than 1000 members is read through ``cache``; smaller
        clusters are fetched directly.  The response carries the cluster's
        ``id`` (its ``name``) and, per member, the document id, its title
        (``None`` when the metadata has no title) and a comma-separated
        submitter string.
        """
        cluster_id = int(cluster_id)

        h = self.corpus.hierarchy()
        cluster = find_doc_in_hierarchy(h, cluster_id, self.cutoff)

        # consider caching for very large clusters
        members = tuple(cluster['members'])
        if len(members) > 1000:
            # The member hash is part of the key, so a change in cluster
            # membership results in a different cache entry.
            key = 'sparerib_api.clustering.cluster-%s-%s-%s' % (
                docket_id, cluster_id, hash(members))

            metadatas = cache.get(key)
            if not metadatas:
                metadatas = dict(
                    self.corpus.doc_metadatas(cluster['members']))
                cache.set(key, metadatas)
        else:
            # it's little; don't cache
            metadatas = dict(self.corpus.doc_metadatas(cluster['members']))

        documents = []
        for doc_id in cluster['members']:
            meta = metadatas[doc_id]
            submitter_parts = [
                meta[field]
                for field in ['submitter_name', 'submitter_organization']
                if field in meta and meta[field]
            ]
            documents.append({
                'id': doc_id,
                # Title is optional metadata; a missing title must not fail
                # the whole listing.
                'title': meta.get('title'),
                'submitter': ', '.join(submitter_parts)
            })

        return Response({
            'id': cluster['name'],
            'documents': documents
        })
コード例 #9
0
ファイル: clustering.py プロジェクト: pombredanne/sparerib
    def get(self, request, docket_id, cluster_id, document_id):
        """Return a document with its text shaded by phrase overlap.

        The document text is split into runs of characters that share the
        same overlap count (the highest ``count`` of any phrase covering the
        character, as reported by ``self.corpus.phrase_overlap`` against the
        cluster's members).  Each run becomes a ``<span>`` whose background
        alpha is that count divided by the cluster size, rounded to two
        decimals.  Newlines are rendered as ``<br />``.

        The response holds ``metadata`` (title, submitter, document_id),
        ``frequency_html`` and ``truncated``.
        """
        # Imported locally (the local name ``html`` is used below); available
        # in the standard library on both Python 2 and 3.
        from xml.sax.saxutils import escape

        document_id = int(document_id)
        cluster_id = int(cluster_id)

        h = self.corpus.hierarchy()
        cluster = find_doc_in_hierarchy(h, cluster_id, self.cutoff)['members']

        doc = self.corpus.doc(document_id)
        text = doc['text']
        raw_phrases = self.corpus.phrase_overlap(document_id, cluster)

        # One counter per character of the text.
        frequencies = numpy.zeros(len(text), 'l')
        for phrase in raw_phrases.values():
            for occurrence in phrase['indexes']:
                # Overlapping phrases: keep the larger count per character.
                covered = frequencies[occurrence.start:occurrence.end]
                frequencies[occurrence.start:occurrence.end] = numpy.maximum(covered, phrase['count'])

        # Collapse the per-character counts into (count, run_length) pairs.
        freq_ranges = [(f[0], len(list(f[1]))) for f in itertools.groupby(frequencies)]
        cluster_size = float(len(cluster))

        span_template = '<span style="background-color:rgba(160,211,216,%s)">%s</span>'
        spans = []
        cursor = 0
        for count, length in freq_ranges:
            chunk = text[cursor:cursor + length]
            cursor += length
            # The document text is not markup: escape &, < and > so it cannot
            # inject tags or scripts into the generated HTML.
            spans.append(span_template % (round(count / cluster_size, 2), escape(chunk)))

        html = ''.join(spans)
        html = html.replace("\n", "<br />")

        metadata = doc['metadata']
        submitter_fields = ['submitter_name', 'submitter_organization']
        return Response({
            'metadata': {
                'title': metadata.get('title', None),
                'submitter': ', '.join([metadata[field] for field in submitter_fields if field in metadata and metadata[field]]),
                'document_id': metadata.get('document_id', None),
            },
            'frequency_html': html,
            # NOTE(review): presumably the corpus caps stored text at 10000
            # characters, so hitting exactly that length signals truncation
            # -- confirm.
            'truncated': len(doc['text']) == 10000
        })