Esempio n. 1
0
    def handle(self, *args, **options):
        """Validate the --rate option, open read-only Solr connections, and
        send the alert emails for the requested frequency.

        Exits the process with status 1 when the rate is missing or not one
        of the keys in FREQUENCY.
        """
        self.verbosity = int(options.get('verbosity', 1))
        self.options = options
        self.rate = options.get('rate')
        if not self.rate:
            self.stderr.write("You must specify a rate")
            exit(1)
        if self.rate not in dict(FREQUENCY).keys():
            self.stderr.write("Invalid rate. Rate must be one of: %s" %
                              ', '.join(dict(FREQUENCY).keys()))
            exit(1)

        # Read-only Solr connections keyed by search type:
        # 'o' == opinions, 'oa' == oral arguments.
        self.connections = {
            'o': sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r'),
            'oa': sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r'),
        }

        # Real-time alerts only consider items queued since the last run.
        if self.rate == 'rt':
            self.valid_ids = self.get_new_ids()

        if self.options['simulate']:
            logger.info("******************************************\n"
                        "* SIMULATE MODE - NO EMAILS WILL BE SENT *\n"
                        "******************************************\n")

        self.send_emails()
        self.clean_rt_queue()
Esempio n. 2
0
    def run_query(self, alert, cut_off_date):
        results = None
        error = False
        try:
            if self.verbosity >= 1:
                print "Now running the query: %s" % alert.alertText

            # Set up the data
            data = search_utils.get_string_to_dict(alert.alertText)
            try:
                del data['filed_before']
            except KeyError:
                pass
            data['order_by'] = 'score desc'
            if self.verbosity >= 1:
                print "  Data sent to SearchForm is: %s" % data
            search_form = SearchForm(data)
            if search_form.is_valid():
                cd = search_form.cleaned_data
                if cd['type'] == 'o':
                    cd['filed_after'] = cut_off_date
                elif cd['type'] == 'oa':
                    cd['argued_after'] = cut_off_date
                main_params = search_utils.build_main_query(cd)
                main_params.update({
                    'rows': '20',
                    'start': '0',
                    'hl.tag.pre': '<em><strong>',
                    'hl.tag.post': '</strong></em>',
                    'caller': 'cl_send_alerts',
                })
                if cd['type'] == 'o':
                    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                  mode='r')
                elif cd['type'] == 'oa':
                    conn = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                                  mode='r')
                results = conn.raw_query(**main_params).execute()
            else:
                print "  Query for alert %s was invalid" % alert.alertText
                print "  Errors from the SearchForm: %s" % search_form.errors
                error = True
        except:
            traceback.print_exc()
            print "  Search for this alert failed: %s" % alert.alertText
            error = True

        if self.verbosity >= 1:
            if results:
                print "  There were %s results" % len(results)
            else:
                print "  There were no results"
        if self.verbosity >= 2:
            print "  The value of results is: %s" % results

        return error, cd['type'], results,
Esempio n. 3
0
def add_or_update_items(items, solr_url=settings.SOLR_OPINION_URL):
    """Adds an item to a solr index.

    This function is for use with the update_index command. It's slightly
    different than the commands below because it expects a Django object,
    rather than a primary key. This rejects the standard Celery advice about
    not passing objects around, but thread safety shouldn't be an issue since
    this is only used by the update_index command, and we want to query and
    build the SearchDocument objects in the task, not in its caller.
    """
    si = sunburnt.SolrInterface(solr_url, mode='w')
    if hasattr(items, "items") or not hasattr(items, "__iter__"):
        # If it's a dict or a single item make it a list
        items = [items]
    search_item_list = []
    for item in items:
        try:
            if type(item) == Audio:
                search_item_list.append(SearchAudioFile(item))
            elif type(item) == Document:
                search_item_list.append(SearchDocument(item))
        except AttributeError:
            print "AttributeError trying to add doc.pk: %s" % item.pk
        except InvalidDocumentError:
            print "Unable to parse document %s" % item.pk

    try:
        si.add(search_item_list)
    except socket.error, exc:
        add_or_update_items.retry(exc=exc, countdown=120)
Esempio n. 4
0
def add_or_update_audio_files(item_pks):
    """Adds or updates the Audio items with the given pks in the Solr index.

    :param item_pks: Iterable of Audio primary keys to (re)index.
    """
    # Bug fix: this previously wrote to SOLR_OPINION_URL and wrapped items in
    # SearchDocument. Audio items belong in the audio core wrapped in
    # SearchAudioFile, matching add_or_update_audio_file().
    si = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='w')
    item_list = []
    for pk in item_pks:
        item = Audio.objects.get(pk=pk)
        item_list.append(SearchAudioFile(item))
    si.add(item_list)
    si.commit()
Esempio n. 5
0
def opinion_sitemap_maker(request):
    """Generate one page of the opinion sitemap.

    The 1-based page number is read from the `p` GET parameter. A missing or
    non-numeric value now falls back to page 1; previously `int(None)` raised
    an uncaught TypeError, producing a 500.
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    try:
        page = int(request.GET.get('p', 1))
    except (TypeError, ValueError):
        page = 1
    start = (page - 1) * items_per_sitemap
    params = {
        'q': '*:*',
        'rows': items_per_sitemap,
        'start': start,
        'fl': ','.join([
            'absolute_url',
            'dateFiled',
            'local_path',
            'citeCount',
            'timestamp',
        ]),
        'sort': 'dateFiled asc',
        'caller': 'opinion_sitemap_maker',
    }
    search_results_object = conn.raw_query(**params).execute()

    # Translate Solr object into something Django's template can use
    urls = []
    for result in search_results_object:
        url_strs = ['https://www.courtlistener.com%s' % result['absolute_url']]
        if int(result['citeCount']) > 0:
            # Only include this page if there are citations.
            url_strs.append('https://www.courtlistener.com%scited-by/' %
                            result['absolute_url'])
        url_strs.append('https://www.courtlistener.com%sauthorities/' %
                        result['absolute_url'])
        if result.get('local_path') and result.get('local_path') != '':
            url_strs.append('https://www.courtlistener.com/%s' %
                            result['local_path'])

        sitemap_item = {}
        for url_str in url_strs:
            sitemap_item['location'] = url_str
            sitemap_item['changefreq'] = 'yearly'
            sitemap_item['lastmod'] = result['timestamp']
            # Ancillary pages (cited-by, authorities, raw files) get a lower
            # crawl priority than the opinion pages themselves.
            if any(s in url_str
                   for s in ['authorities', 'cited-by', 'pdf', 'doc', 'wpd']):
                sitemap_item['priority'] = '0.3'
            else:
                sitemap_item['priority'] = '0.5'
            urls.append(dict(sitemap_item))

    xml = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urls}))
    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, mimetype='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
Esempio n. 6
0
    def setUp(self):
        """Create a throw-away Solr core for this test run and swap it in
        for the live 'collection1' core.
        """
        # Set up a testing core in Solr and swap it in
        # time.time() in the name keeps concurrent/repeated runs distinct.
        self.core_name = '%s.test-%s' % (self.__module__, time.time())
        create_solr_core(self.core_name)
        swap_solr_core('collection1', self.core_name)
        self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

        # Set up a handy court object
        self.court = Court.objects.get(pk='test')
Esempio n. 7
0
 def items(self, obj):
     """Return the 20 most recently argued items from the audio index."""
     query = {
         'caller': 'AllJurisdictionsPodcast',
         'q': '*:*',
         'rows': '20',
         'sort': 'dateArgued desc',
         'start': '0',
     }
     solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     return solr.raw_query(**query).execute()
Esempio n. 8
0
def add_or_update_audio_file(pk, force_commit=True):
    """Updates the document in the index. Called by Document save function.
    """
    si = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='w')
    try:
        si.add(SearchAudioFile(Audio.objects.get(pk=pk)))
        if force_commit:
            si.commit()
    except SolrError, exc:
        add_or_update_audio_file.retry(exc=exc, countdown=30)
Esempio n. 9
0
def add_or_update_doc(pk, commit=True):
    """Updates the document in the index. Called by Document save function.
    """
    si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    try:
        si.add(SearchDocument(Document.objects.get(pk=pk)))
        if commit:
            si.commit()
    except SolrError, exc:
        add_or_update_doc.retry(exc=exc, countdown=30)
Esempio n. 10
0
def place_facet_queries(cd, conn=None):
    """Get facet values for the status filters

    Using the search form, query Solr and get the values for the status filters.

    :param cd: A SearchForm's cleaned_data dictionary.
    :param conn: Optional read-mode SolrInterface. When omitted, a fresh
        connection to the opinion index is created for the call. (The old
        default built a single shared connection at module import time — a
        side effect of merely importing this module, and the classic mutable
        default-argument pitfall.)
    """
    if conn is None:
        conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')

    # Build up all the queries needed
    facet_params = {
        'rows': '0',
        'facet': 'true',
        'facet.mincount': 0,
        'facet.field': '{!ex=dt}status_exact',
        'q': cd['q'] or '*:*',
        'caller': 'facet_parameters',
    }
    fq = []

    # Case Name and judges
    if cd['case_name']:
        fq.append(make_fq(cd, 'caseName', 'case_name'))
    if cd['judge']:
        fq.append(make_fq(cd, 'judge', 'judge'))

    # Citations
    if cd['citation']:
        fq.append(make_fq(cd, 'citation', 'citation'))
    if cd['docket_number']:
        fq.append(make_fq(cd, 'docketNumber', 'docket_number'))
    if cd['neutral_cite']:
        fq.append(make_fq(cd, 'neutralCite', 'neutral_cite'))

    fq.append(
        make_date_query('dateFiled', cd['filed_before'], cd['filed_after']))
    fq.append(make_cite_count_query(cd))

    # Faceting
    selected_courts_string = get_selected_field_string(
        cd, 'court_')  # Status facets depend on court checkboxes
    selected_stats_string = get_selected_field_string(cd, 'stat_')
    if len(selected_stats_string) > 0:
        fq.extend([
            '{!tag=dt}status_exact:(%s)' % selected_stats_string,
            'court_exact:(%s)' % selected_courts_string
        ])

    # If a param has been added to the fq variables, then we add them to the
    # main_params var. Otherwise, we don't, as doing so throws an error.
    if len(fq) > 0:
        facet_params['fq'] = fq

    stat_facet_fields = conn.raw_query(
        **facet_params).execute().facet_counts.facet_fields

    return stat_facet_fields
Esempio n. 11
0
def update_cite(citation_id, commit=True):
    """If a citation and a document are both updated simultaneously, we will
    needlessly update the index twice. No easy way around it.
    """
    interface = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    citation = Citation.objects.get(pk=citation_id)
    # Re-index every document that carries this citation.
    for parent in citation.parent_documents.all():
        interface.add(SearchDocument(parent))
    if commit:
        interface.commit()
Esempio n. 12
0
 def items(self, obj):
     """Do a Solr query here. Return the first 20 results"""
     query = {
         'caller': 'AllJurisdictionsFeed',
         'q': '*:*',
         'rows': '20',
         'sort': 'dateFiled desc',
         'start': '0',
     }
     solr = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
     return solr.raw_query(**query).execute()
Esempio n. 13
0
def match_citation(citation, citing_doc):
    """Find the Solr document that a parsed citation points at.

    Returns a two-tuple of (results, matched_by_citation): first tries an
    exact citation-string match; if that finds nothing and a defendant name
    is available, falls back to a case-name query.
    """
    # TODO: Create shared solr connection to use across multiple citations/
    # documents
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    main_params = {'fq': []}
    # Set up filter parameters
    start_year = 1750
    end_year = date.today().year
    if citation.year:
        start_year = end_year = citation.year
    else:
        if citation.lookup_index:
            # Some cases can't be disambiguated.
            # NOTE(review): assumes the REPORTERS editions' 'start'/'end'
            # entries may be either dates or non-date sentinels — confirm.
            reporter_dates = REPORTERS[citation.canonical_reporter][
                citation.lookup_index]['editions'][citation.reporter]
            if hasattr(reporter_dates['start'], 'year'):
                start_year = reporter_dates['start'].year
            else:
                start_year = 1750
            if hasattr(reporter_dates['end'], 'year'):
                end_year = reporter_dates['end'].year
            else:
                end_year = 2030
        if citing_doc.date_filed:
            # A case cannot cite one decided after its own filing date.
            end_year = min(end_year, citing_doc.date_filed.year)
    date_param = 'dateFiled:%s' % build_date_range(start_year, end_year)
    main_params['fq'].append(date_param)
    if citation.court:
        court_param = 'court_exact:%s' % citation.court
        main_params['fq'].append(court_param)

    # Non-precedential documents shouldn't be cited
    main_params['fq'].append('status:Precedential')

    # Take 1: Use citation
    citation_param = 'citation:"%s"' % citation.base_citation()
    main_params['fq'].append(citation_param)
    main_params['caller'] = 'citations'
    results = conn.raw_query(**main_params).execute()
    if len(results) == 1:
        return results, True
    if len(results) > 1:
        if citation.defendant:  # Refine using defendant, if there is one
            results = case_name_query(conn, main_params, citation, citing_doc)
        return results, True

    # Take 2: Use case name
    if not citation.defendant:
        return [], False
        # Remove citation parameter
    main_params['fq'].remove(citation_param)
    return case_name_query(conn, main_params, citation, citing_doc), False
Esempio n. 14
0
 def items(self, obj):
     """
     Returns a list of items to publish in this feed.
     """
     # Restrict results to the single jurisdiction this feed covers.
     query = {
         'caller': 'JurisdictionPodcast',
         'fq': 'court_exact:%s' % obj.pk,
         'q': '*:*',
         'rows': '20',
         'sort': 'dateArgued desc',
         'start': '0',
     }
     solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     return solr.raw_query(**query).execute()
Esempio n. 15
0
def do_search(request, rows=20, order_by=None, type=None):
    """Bind the SearchForm from the request's GET data and run the query.

    NOTE(review): only the error path's return ({'error': True}) is visible
    in this excerpt; the success-path return presumably follows below —
    confirm against the full source.
    """

    # Bind the search form.
    search_form = SearchForm(request.GET)
    if search_form.is_valid():
        cd = search_form.cleaned_data
        # Allows an override by calling methods.
        if order_by:
            cd['order_by'] = order_by
        if type:
            cd['type'] = type
        search_form = _clean_form(request, cd)

        try:
            if cd['type'] == 'o':
                # Opinions also get status facet counts for the sidebar.
                conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                              mode='r')
                stat_facet_fields = search_utils.place_facet_queries(cd, conn)
                status_facets = search_utils.make_stats_variable(
                    stat_facet_fields, search_form)
            elif cd['type'] == 'oa':
                # Oral arguments have no status facets.
                conn = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                              mode='r')
                status_facets = None
            results_si = conn.raw_query(**search_utils.build_main_query(cd))

            courts = Court.objects.filter(in_use=True).values(
                'pk', 'short_name', 'jurisdiction',
                'has_oral_argument_scraper')
            courts, court_count_human, court_count = search_utils\
                .merge_form_with_courts(courts, search_form)

        except Exception, e:
            logger.warning("Error loading search page with request: %s" %
                           request.GET)
            logger.warning("Error was %s" % e)
            return {'error': True}
Esempio n. 16
0
 def items(self, obj):
     """Run the user's saved query and return the first 20 audio results.

     Returns an empty list when the bound form does not validate.
     """
     form = SearchForm(obj.GET)
     if not form.is_valid():
         return []
     cleaned = form.cleaned_data
     solr = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL, mode='r')
     params = search_utils.build_main_query(cleaned, highlight=False)
     params.update({
         'sort': 'dateArgued desc',
         'rows': '20',
         'start': '0',
         'caller': 'SearchFeed',
     })
     return solr.raw_query(**params).execute()
Esempio n. 17
0
def index_sitemap_maker(request):
    """Generate a sitemap index page

    Counts the number of cases in the site, divides by `items_per_sitemap` and
    provides links items.
    """
    params = {
        'q': '*:*',
        'rows': '0',  # just need the count
        'start': '0',
        'caller': 'sitemap_index',
    }
    connection_string_obj_type_pairs = (
        (settings.SOLR_OPINION_URL, 'opinions'),
        (settings.SOLR_AUDIO_URL, 'oral-arguments'),
    )
    sites = []
    for connection_string, obj_type in connection_string_obj_type_pairs:
        conn = sunburnt.SolrInterface(connection_string, mode='r')
        search_results_object = conn.raw_query(**params).execute()
        count = search_results_object.result.numFound
        # Python 2 floor division. NOTE(review): when count is an exact
        # multiple of items_per_sitemap this produces one trailing empty
        # sitemap page — confirm whether that is intended.
        num_pages = count / items_per_sitemap + 1
        for i in range(1, num_pages + 1):
            sites.append(
                'https://www.courtlistener.com/sitemap-%s.xml?p=%s' % (obj_type, i)
            )

    # Random additional sitemaps.
    sites.extend([
        'https://www.courtlistener.com/sitemap-donate.xml',
    ])

    xml = loader.render_to_string('sitemap_index.xml', {'sitemaps': sites})

    # These links contain case names, so they should get crawled but not
    # indexed
    response = HttpResponse(xml, mimetype='application/xml')
    response['X-Robots-Tag'] = 'noindex, noodp, noarchive, noimageindex'
    return response
Esempio n. 18
0
    def handle(self, *args, **options):
        """Build a Document queryset from the CLI options and reindex it.

        Selection modes: a single doc_id, an id range (start_id/end_id), a
        filed_after date, or --all.
        """
        both_list_and_endpoints = (options.get('doc_id') is not None and
                                   (options.get('start_id') is not None or
                                    options.get('end_id') is not None or
                                    options.get('filed_after') is not None))
        # NOTE(review): this expression is True only when *every* option is
        # provided (no "is None" test succeeds); as a "no option given"
        # detector it looks inverted — confirm against the command's tests
        # before relying on it.
        no_option = (not any([options.get('doc_id') is None,
                              options.get('start_id') is None,
                              options.get('end_id') is None,
                              options.get('filed_after') is None,
                              options.get('all') is False]))
        if both_list_and_endpoints or no_option:
            raise CommandError('Please specify either a list of documents, a '
                               'range of ids, a range of dates, or '
                               'everything.')

        if options.get('filed_after'):
            # Naive date parsed from the CLI is made timezone-aware in UTC.
            start_date = make_aware(datetime.strptime(options['filed_after'], '%Y-%m-%d'), utc)

        self.index = options['index'].lower()
        self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

        # Use query chaining to build the query
        query = Document.objects.all()
        if options.get('doc_id'):
            query = query.filter(pk=options.get('doc_id'))
        if options.get('end_id'):
            query = query.filter(pk__lte=options.get('end_id'))
        if options.get('start_id'):
            query = query.filter(pk__gte=options.get('start_id'))
        if options.get('filed_after'):
            query = query.filter(date_filed__gte=start_date)
        if options.get('all'):
            # --all discards any filters applied above.
            query = Document.objects.all()
        count = query.count()
        # Stream in chunks so the whole result set is never held in memory.
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs, count)
Esempio n. 19
0
    def setUp(self):
        """Create throw-away opinion and audio Solr cores, swap them in, and
        populate them with a small cross-citing set of test documents plus
        scraped audio files.
        """
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        # time.time() in the names keeps concurrent/repeated runs distinct.
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                 mode='rw')
        self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                               mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            # index=False: don't index yet; documents are saved (and thus
            # indexed) below once fully assembled.
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        # Re-save so the new cited-case relations reach the index.
        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        # Commit so the test queries below can actually see the new items.
        self.si_opinion.commit()
        self.si_audio.commit()
Esempio n. 20
0
def get_dup_stats(doc):
    """The heart of the duplicate algorithm. Returns stats about the case as
    compared to other cases already in the system. Other methods can call this
    one, and can make decisions based on the stats generated here.

    If no likely duplicates are encountered, stats are returned as zeroes.

    Process:
        1. Refine the possible result set down to just a few candidates.
        2. Determine their likelihood of being duplicates according to a
           number of measures:
            - Similarity of case name
            - Similarity of docket number
            - Comparison of content length

    Returns a two-tuple of (stats dict, Solr candidates).
    """
    conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')
    DEBUG = True

    ##########################################
    # 1: Refine by date, court and case name #
    ##########################################
    main_params = make_case_name_solr_query(
        doc.citation.case_name,
        doc.docket.court_id,
        doc.date_filed,
        DEBUG=DEBUG,
    )
    main_params['caller'] = 'corpus_importer'
    if DEBUG:
        print "    - main_params are: %s" % main_params
    candidates = conn.raw_query(**main_params).execute()

    # Fallback 1: no case-name hits; try the docket number instead.
    if not len(candidates) and doc.citation.docket_number is not None:
        # Try by docket number rather than case name
        clean_docket_number_words = []
        for word in doc.citation.docket_number.split():
            if not re.search('\d', word):
                # Must have numbers.
                continue
            word = word.strip(string.punctuation)
            regex = re.compile('[%s]' % re.escape(string.punctuation))
            if regex.search(re.sub('-', '', word)):
                # Can only have hyphens after stripping
                continue
            clean_docket_number_words.append(word)
        docket_q = ' OR '.join(clean_docket_number_words)
        if docket_q:
            main_params = {
                'fq': [
                    'court_exact:%s' % doc.docket.court_id,
                    'dateFiled:%s' %
                    build_date_range(doc.date_filed, range=15),
                    'docketNumber:(%s)' % docket_q
                ],
                'rows':
                100,
                'caller':
                'corpus_importer',
            }
            if DEBUG:
                print "    - main_params are: %s" % main_params
            candidates = conn.raw_query(**main_params).execute()

    # Fallback 2: still nothing and it's a SCOTUS case; try the citation.
    if not len(candidates) and doc.docket.court_id == 'scotus':
        if doc.citation.federal_cite_one:
            # Scotus case, try by citation.
            main_params = {
                'fq': [
                    'court_exact:%s' % doc.docket.court_id,
                    'dateFiled:%s' % build_date_range(
                        doc.date_filed, range=90),  # Creates ~6 month span.
                    'citation:(%s)' % ' '.join([
                        re.sub(r"\D", '', w)
                        for w in doc.citation.federal_cite_one.split()
                    ])
                ],
                'rows':
                100,
                'caller':
                'corpus_importer',
            }
            if DEBUG:
                print "    - main_params are: %s" % main_params
            candidates = conn.raw_query(**main_params).execute()

    stats = {'candidate_count': len(candidates)}
    if not len(candidates):
        # Nothing plausible found; bail with just the zero count.
        return stats, candidates

    #########################################
    # 2: Attempt filtering by docket number #
    #########################################
    # Two-step process. First we see if we have any exact hits.
    # Second, if there were exact hits, we forward those onwards. If not, we
    # forward everything.
    remaining_candidates = []
    if doc.citation.docket_number:
        # Strip non-digits and zeroes before comparing docket numbers.
        new_docket_number = re.sub("(\D|0)", "", doc.citation.docket_number)
        for candidate in candidates:
            if candidate.get('docketNumber'):
                # Get rid of anything in the docket numbers that's not a digit
                result_docket_number = re.sub("(\D|0)", "",
                                              candidate['docketNumber'])
                # Get rid of zeroes too.
                if new_docket_number == result_docket_number:
                    remaining_candidates.append(candidate)

    if len(remaining_candidates) > 0:
        # We had one or more exact hits! Use those.
        candidates = remaining_candidates
    else:
        # We just let candidates from step one get passed through by doing nothing.
        pass

    # Rebuilt here since the filter above may have shrunk the candidate list.
    stats = {'candidate_count': len(candidates)}

    ##############################
    # 3: Find the best case name #
    ##############################
    confidences = find_confidences(candidates, doc.citation.case_name)
    stats['case_name_similarities'] = confidences

    #####################################################################
    # 4: Check content length, gestalt difference and cosine similarity #
    #####################################################################
    percent_diffs, gestalt_diffs, cos_sims = [], [], []
    new_stripped_content = re.sub('\W', '', doc.body_text).lower()
    for candidate in candidates:
        candidate_stripped_content = re.sub('\W', '',
                                            candidate['text']).lower()

        # Calculate the difference in text length and their gestalt difference
        # NOTE(review): this subtraction cannot raise ZeroDivisionError; the
        # guard appears vestigial — confirm before removing.
        try:
            length_diff = abs(
                len(candidate_stripped_content) - len(new_stripped_content))
        except ZeroDivisionError:
            length_diff = 0
        try:
            percent_diff = float(length_diff) / len(new_stripped_content)
        except ZeroDivisionError:
            percent_diff = 0
        cos_sim = get_cosine_similarity(doc.body_text, candidate['text'])
        percent_diffs.append(percent_diff)
        gestalt_diffs.append(
            gen_diff_ratio(candidate_stripped_content, new_stripped_content))
        cos_sims.append(cos_sim)

    stats['length_diffs'] = percent_diffs
    stats['gestalt_diffs'] = gestalt_diffs
    stats['cos_sims'] = cos_sims

    return stats, candidates
Esempio n. 21
0
    def handle(self, *args, **options):
        """Dispatch the index-maintenance command: update, delete, commit or
        optimize the Solr index named by --solr-url, for either opinions or
        audio.
        """
        self.verbosity = int(options.get('verbosity', 1))
        if options.get('solr_url'):
            self.solr_url = options.get('solr_url')
            self.si = sunburnt.SolrInterface(options.get('solr_url'),
                                             mode='rw')
        else:
            self.stderr.write("solr-url is a required parameter.\n")
            exit(1)

        # NOTE(review): 'opinions' is compared case-insensitively but
        # 'audio' is not — confirm whether that asymmetry is intentional.
        t = options.get('type')
        if t is not None and t.lower() == 'opinions':
            self.type = Document
        elif t is not None and t == 'audio':
            self.type = Audio
        else:
            self.stderr.write('Unable to parse --type argument. See help for '
                              'details.')
            exit(1)

        if options.get('datetime'):
            # NOTE(review): the flag is read from options but the value is
            # parsed from args[0] — confirm the option carries no value of
            # its own.
            try:
                # Parse the date string into a datetime object
                dt = make_aware(datetime.datetime(
                    *time.strptime(args[0], '%Y-%m-%d %H:%M:%S')[0:6]), utc)
            except ValueError:
                try:
                    # NOTE(review): [0:5] here vs [0:6] above looks like an
                    # inconsistency; for a date-only string the extra fields
                    # are zero either way — confirm.
                    dt = make_aware(datetime.datetime(
                        *time.strptime(args[0], '%Y-%m-%d')[0:5]), utc)
                except ValueError:
                    self.stderr.write('Unable to parse time. Please use '
                                      'format: YYYY-MM-DD HH:MM:SS or '
                                      'YYYY-MM-DD.\n')
                    sys.exit(1)

        if options.get('update_mode'):
            if self.verbosity >= 1:
                self.stdout.write('Running in update mode...\n')
            if options.get('everything'):
                self.add_or_update_all()
            elif options.get('datetime'):
                self.add_or_update_by_datetime(dt)
            elif options.get('query'):
                self.stderr.write("Updating by query not yet implemented.")
                sys.exit(1)
            elif options.get('item'):
                # Validate every id before touching the index.
                for item in args:
                    try:
                        int(item)
                    except ValueError:
                        self.stderr.write('Error: Item "%s" could not be '
                                          'converted to an ID.\n' % item)
                        sys.exit(1)
                self.add_or_update(*args)
            else:
                self.stderr.write('Error: You must specify what you wish to '
                                  'update.\n')
                sys.exit(1)

        elif options.get('delete_mode'):
            if self.verbosity >= 1:
                self.stdout.write('Running in deletion mode...\n')
            if options.get('everything'):
                self.delete_all()
            elif options.get('datetime'):
                self.delete_by_datetime(dt)
            elif options.get('query'):
                self.delete_by_query(options.get('query'))
            elif options.get('item'):
                # Validate every id before touching the index.
                for item in args:
                    try:
                        int(item)
                    except ValueError:
                        self.stderr.write('Error: Item "%s" could not be '
                                          'converted to an ID.\n' % item)
                        sys.exit(1)
                self.delete(*args)
            else:
                self.stderr.write('Error: You must specify what you wish to '
                                  'delete.\n')
                sys.exit(1)

        elif options.get('do_commit'):
            self.commit()

        elif options.get('optimize_mode'):
            self.optimize()

        else:
            self.stderr.write('Error: You must specify whether you wish to '
                              'update, delete, commit, or optimize your '
                              'index.\n')
            sys.exit(1)
Esempio n. 22
0
def delete_item(pk, solr_url):
    """Deletes the item from the index.
    """
    interface = sunburnt.SolrInterface(solr_url, mode='w')
    interface.delete(pk)
    interface.commit()
# -*- coding: utf-8 -*-
import os
import sys

execfile('/etc/courtlistener')
sys.path.append(INSTALL_ROOT)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
from django.conf import settings

from alert.lib.mojibake import fix_mojibake
from alert.lib import sunburnt
from alert.search.models import Document

from optparse import OptionParser

conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r')


def cleaner(simulate=False, verbose=True):
    """Fix cases that have mojibake as a result of pdffactory 3.51.

    NOTE(review): `simulate` is unused in the portion shown here; the
    function body appears truncated in this excerpt — confirm against the
    full source.
    """

    # Find all the cases using Solr
    # NOTE(review): other call sites chain .execute() on raw_query before
    # iterating; confirm iteration works without it here.
    results_si = conn.raw_query(**{
        'q': u'ÚÑÎ',
        'caller': 'mojibake',
    })
    for result in results_si:
        # For each document
        doc = Document.objects.get(pk=result['id'])
        if verbose:
            print "https://www.courtlistener.com" + doc.get_absolute_url()
Esempio n. 24
0
def delete_items(items):
    """Remove every item in ``items`` from the opinion index and commit."""
    interface = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='w')
    interface.delete(list(items))
    interface.commit()