def _download(self, request_dict={}):
        """Methods for downloading the latest version of Site
        """
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" %
                        (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)

        # Set up verify here and remove it from request_dict so you don't send
        # it to s.get or s.post in two kwargs.
        if request_dict.get('verify') is not None:
            verify = request_dict['verify']
            del request_dict['verify']
        else:
            verify = certifi.where()

        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        s.mount('https://', self._get_adapter_instance())
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      verify=verify,
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       verify=verify,
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # Tweak or set the encoding if needed
        r = self._set_encoding(r)

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        if 'json' in r.headers.get('content-type', ''):
            return r.json()
        else:
            text = self._clean_text(r.text)
            html_tree = self._make_html_tree(text)
            html_tree.rewrite_links(self._link_repl)
            return html_tree
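
# A minimal sketch of what the _set_encoding() helper used above might do,
# based on the inline encoding handling in the older _download() examples
# further down this page (ISO-8859-1 swapped for its cp1252 superset, chardet
# used as a fallback when the HTTP headers carry no charset). The real
# Juriscraper implementation may differ.
import chardet


def _set_encoding(self, r):
    """Normalize the encoding on a requests.Response before r.text is read."""
    if r.encoding == 'ISO-8859-1':
        # cp1252 is a superset of ISO-8859-1, so prefer it.
        r.encoding = 'cp1252'
    elif r.encoding is None:
        # No charset header; detect from the body so r.text decodes correctly.
        r.encoding = chardet.detect(r.content)['encoding']
    return r
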
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality, such
    as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating some
    basic pitfalls that a caller will run into.
    """
    exceptions = defaultdict(list)
    for item in site:
        # First turn the download urls into a utf-8 byte string
        item_download_urls = item["download_urls"].encode("utf-8")
        # Percent encode URLs (this is a Python wart)
        download_url = six_parse.quote(item_download_urls,
                                       safe="%/:=&?~#+!$,;'@()*[]")

        if binaries:
            try:
                opener = six_request.build_opener()
                for cookie_dict in site.cookies:
                    opener.addheaders.append((
                        "Cookie",
                        "%s=%s" % (cookie_dict["name"], cookie_dict["value"]),
                    ))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    exceptions["EmptyFileError"].append(download_url)
                    v_print(3, "EmptyFileError: %s" % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                exceptions["DownloadingError"].append(download_url)
                v_print(3, "DownloadingError: %s" % download_url)
                v_print(3, traceback.format_exc())
                continue

            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)

        # Normally, you'd do your save routines here...
        v_print(1, "\nAdding new item:")
        for k, v in item.items():
            if isinstance(v, six.text_type):
                value = trunc(v, 200, ellipsis="...")
                v_print(1, '    %s: "%s"' % (k, value.encode("utf-8")))
            else:
                # Dates and such...
                v_print(1, "    %s: %s" % (k, v))

    v_print(
        3, "\n%s: Successfully crawled %d items." % (site.court_id, len(site)))
    return {"count": len(site), "exceptions": exceptions}
    def _download(self, request_dict={}):
        """Methods for downloading the latest version of Site
        """
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v, 50, elipsize=True, elipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)
        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        if r.encoding is None:
            # Requests detects the encoding when the item is GET'ed using
            # HTTP headers, and then when r.text is accessed, if the encoding
            # hasn't been set by that point. By setting the encoding here, we
            # ensure that it's done by cchardet, if it hasn't been done with
            # HTTP headers. This way it is done before r.text is accessed
            # (which would do it with vanilla chardet). This is a big
            # performance boon, and can be removed once requests is upgraded
            # (https://github.com/kennethreitz/requests/pull/814/)
            r.encoding = chardet.detect(r.content)['encoding']

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
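
# In the 'LOCAL' branch above, the URL points at a file on disk and is handed
# to MockRequest instead of the network. A hypothetical sketch, assuming the
# mock only needs to look enough like a requests.Response for the code above
# (status_code, headers, encoding, text/content, raise_for_status):
import requests


class MockRequest(object):
    def __init__(self, url):
        self.url = url

    def get(self):
        with open(self.url, 'rb') as f:
            content = f.read()
        response = requests.models.Response()
        response.status_code = 200
        response._content = content
        response.headers['content-type'] = 'text/html'
        return response
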
Example #5
    def _download(self, request_dict={}):
        """Methods for downloading the latest version of Site
        """
        if self.method == "POST":
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v, 50, ellipsis="...[truncated]")
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)

        # Set up verify here and remove it from request_dict so you don't send
        # it to s.get or s.post in two kwargs.
        if request_dict.get("verify") is not None:
            verify = request_dict["verify"]
            del request_dict["verify"]
        else:
            verify = certifi.where()

        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        s.mount("https://", self._get_adapter_instance())
        if self.method == "GET":
            r = s.get(self.url, headers={"User-Agent": "Juriscraper"}, verify=verify, **request_dict)
        elif self.method == "POST":
            r = s.post(
                self.url, headers={"User-Agent": "Juriscraper"}, verify=verify, data=self.parameters, **request_dict
            )
        elif self.method == "LOCAL":
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # Tweak or set the encoding if needed
        r = self._set_encoding(r)

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        if "json" in r.headers.get("content-type", ""):
            return r.json()
        else:
            text = self._clean_text(r.text)
            html_tree = self._make_html_tree(text)
            html_tree.rewrite_links(self._link_repl)
            return html_tree
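
# The newer _download() above delegates parsing to a _make_html_tree() helper.
# A minimal sketch, assuming it does what the older examples on this page do
# inline with lxml:
from lxml import html


def _make_html_tree(self, text):
    """Parse the cleaned response text into an lxml HTML tree."""
    return html.fromstring(text)
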
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality, such
    as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating some
    basic pitfalls that a caller will run into.
    """
    for item in site:
        # Percent encode URLs (this is a Python wart)
        download_url = urllib2.quote(item['download_urls'], safe="%/:=&?~#+!$,;'@()*[]")

        if binaries:
            try:
                opener = urllib2.build_opener()
                for cookie_dict in site.cookies:
                    opener.addheaders.append(("Cookie", "%s=%s" % (cookie_dict['name'], cookie_dict['value'])))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    v_print(3, 'EmptyFileError: %s' % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                v_print(3, 'DownloadingError: %s' % download_url)
                v_print(3, traceback.format_exc())
                continue

            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)

        # Normally, you'd do your save routines here...
        v_print(1, '\nAdding new item:')
        for k, v in item.items():
            if isinstance(v, six.text_type):
                value = trunc(v, 200, ellipsis='...')
                v_print(1, '    %s: "%s"' % (k, value.encode('utf-8')))
            else:
                # Dates and such...
                v_print(1, '    %s: %s' % (k, v))

    v_print(3, '\n%s: Successfully crawled %d items.' % (site.court_id, len(site)))
    def _download(self, request_dict={}):
        """Methods for downloading the latest version of Site
        """
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v,
                                            50,
                                            elipsize=True,
                                            elipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" %
                        (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)
        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality, such
    as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating some
    basic pitfalls that a caller will run into.
    """
    for item in site:
        # Percent encode URLs (this is a Python wart)
        download_url = urllib2.quote(item['download_urls'], safe="%/:=&?~#+!$,;'@()*[]")

        if binaries:
            try:
                opener = urllib2.build_opener()
                for cookie_dict in site.cookies:
                    opener.addheaders.append(("Cookie", "%s=%s" % (cookie_dict['name'], cookie_dict['value'])))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    v_print(3, 'EmptyFileError: %s' % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                v_print(3, 'DownloadingError: %s' % download_url)
                v_print(3, traceback.format_exc())
                continue

            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)

        # Normally, you'd do your save routines here...
        v_print(1, 'Adding new item:')
        for k, v in item.items():
            if type(v) == unicode:
                value = trunc(v, 200, ellipsis='...')
                v_print(1, '    %s: "%s"' % (k, value.encode('utf-8')))
            else:
                # Dates and such...
                v_print(1, '    %s: %s' % (k, v))

    v_print(3, '%s: Successfully crawled.' % site.court_id)
Example #9
    def _download(self, request_dict={}):
        """Methods for downloading the latest version of Site
        """
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v, 50, elipsize=True, elipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)
        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
    def _download(self, request_dict={}):
        """Download the latest version of Site"""
        self.downloader_executed = True
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.items():
                truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)
        self._process_request_parameters(request_dict)
        if self.method == 'GET':
            self._request_url_get(self.url)
        elif self.method == 'POST':
            self._request_url_post(self.url)
        elif self.method == 'LOCAL':
            self._request_url_mock(self.url)
        self._post_process_response()
        return self._return_response_text_object()
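
# The refactored _download() above pushes the request mechanics into small
# helpers. Minimal sketches of what they might contain, reconstructed from the
# inline versions earlier on this page; the self.request dict used here is an
# illustrative assumption, and the real Juriscraper helpers may differ.
import certifi
import requests


def _process_request_parameters(self, parameters):
    """Pull verify out of the kwargs so it isn't passed to requests twice."""
    if parameters.get('verify') is not None:
        verify = parameters.pop('verify')
    else:
        verify = certifi.where()
    self.request = {
        'verify': verify,
        'session': requests.session(),
        'headers': {'User-Agent': 'Juriscraper'},
        'parameters': parameters,
    }


def _request_url_get(self, url):
    """Execute a GET with the prepared session, headers, and kwargs."""
    self.request['response'] = self.request['session'].get(
        url,
        headers=self.request['headers'],
        verify=self.request['verify'],
        **self.request['parameters']
    )


def _request_url_post(self, url):
    """Execute a POST, sending self.parameters as form data."""
    self.request['response'] = self.request['session'].post(
        url,
        headers=self.request['headers'],
        verify=self.request['verify'],
        data=self.parameters,
        **self.request['parameters']
    )
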
    def _download(self, request_dict={}):
        """Download the latest version of Site"""
        self.downloader_executed = True
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.items():
                truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)
        self._process_request_parameters(request_dict)
        if self.method == 'GET':
            self._request_url_get(self.url)
        elif self.method == 'POST':
            self._request_url_post(self.url)
        elif self.test_mode_enabled():
            self._request_url_mock(self.url)
        self._post_process_response()
        return self._return_response_text_object()
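
# The only difference from the previous example is the test_mode_enabled()
# check. A plausible sketch, assuming it simply wraps the old 'LOCAL' method
# comparison:
def test_mode_enabled(self):
    return self.method == 'LOCAL'
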
Example #12
def get_title(self, referer_id):
    """Get the HTML title for a page, trying again if failures occur.

    Idea here is that somebody will create a new page that embeds one of our
    maps. As soon as they do, we'll get an HTTP referer sent to us, which is
    great. Unfortunately, in many cases, the HTTP referer we receive is that of
    an in progress page or similar, NOT the page that's actually live. Thus,
    what we do is try the URL over and over, until we find success.

    If a title is found, the admins are notified.

    If not, the item is deleted (this is OK, however, b/c it'll be recreated if
    it should have existed).
    """
    # Set the exponential back off in case we need it, starting at 15 minutes,
    # then 30, 60, 120...
    countdown = 15 * 60 * (2**self.request.retries)
    retried_exceeded = (self.request.retries >= self.max_retries)

    referer = Referer.objects.get(pk=referer_id)
    if blacklisted_url(referer.url):
        return

    try:
        r = requests.get(
            referer.url,
            headers={'User-Agent': "CourtListener"},
            verify=False,  # Integrity of a referer's referent is not important.
        )
    except MissingSchema:
        return
    except TooManyRedirects:
        return

    try:
        r.raise_for_status()
    except HTTPError as exc:
        if retried_exceeded:
            # We're not wanted here. Maybe we'll have better luck another time.
            return
        raise self.retry(exc=exc, countdown=countdown)

    html_tree = html.fromstring(r.text)
    try:
        title = getattr(html_tree.xpath('//title')[0], 'text', '')
        if title is not None:
            title = title.strip()
    except IndexError:
        title = ''

    if title:
        referer.page_title = trunc(
            title,
            referer._meta.get_field('page_title').max_length,
        )
        referer.save()

        if new_title_for_viz(referer):
            # Only send the email if we haven't seen this page title before for
            # this visualization.
            email = emails['referer_detected']
            email_body = email['body'] % (
                referer.url, referer.page_title,
                reverse('admin:visualizations_referer_change',
                        args=(referer.pk, )))
            send_mail(email['subject'], email_body, email['from'], email['to'])
    else:
        try:
            # Create an exception to catch.
            raise Exception("Couldn't get title from HTML")
        except Exception as exc:
            if retried_exceeded:
                # We couldn't get the title. Let it go.
                return
            raise self.retry(exc=exc, countdown=countdown)
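
# For reference, the exponential back-off used above grows exactly as the
# comment describes (15 minutes, then 30, 60, 120...):
for retries in range(4):
    print(15 * 60 * (2 ** retries))  # 900, 1800, 3600, 7200 seconds
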
Example #13
def scrape_and_parse():
    """Traverses the bulk data from public.resource.org, and puts them in the
    DB.

    Probably lots of ways to go about this, but I think the easiest will be the following:
     - look at the index page of all volumes, and follow all the links it has.
     - for each volume, look at its index page, and follow the link to all cases
     - for each case, collect information wisely.
     - put it all in the DB
    """

    # begin by loading up the fix files into memory
    court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files()

    results = []
    DEBUG = 4
    # Set to False to disable automatic browser usage. Else, set to the
    # command you want to run, e.g. 'firefox'
    BROWSER = False
    court_fix_file = open('../logs/f2_court_fix_file.txt', 'a')
    date_fix_file = open('../logs/f2_date_fix_file.txt', 'a')
    case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt', 'a')
    vol_file = open('../logs/vol_file.txt', 'r+')
    case_file = open('../logs/case_file.txt', 'r+')

    url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT
    openedURL = urllib2.urlopen(url)
    content = openedURL.read()
    openedURL.close()
    tree = fromstring(content)

    volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a')

    try:
        i = int(vol_file.readline())
    except ValueError:
        # the volume file is empty or otherwise failing.
        i = 0
    vol_file.close()

    if DEBUG >= 1:
        print "Number of remaining volumes is: %d" % (len(volumeLinks) - i)

    # used later, needs a default value.
    saved_caseDate = None
    saved_court = None
    while i < len(volumeLinks):
        # we iterate over every case in the volume
        volumeURL = volumeLinks[i].text + "/index.html"
        volumeURL = urljoin(url, volumeURL)
        if DEBUG >= 1:
            print "Current volumeURL is: %s" % volumeURL

        openedVolumeURL = urllib2.urlopen(volumeURL)
        content = openedVolumeURL.read()
        volumeTree = fromstring(content)
        openedVolumeURL.close()
        caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a')
        caseDates = volumeTree.xpath('//table/tbody/tr/td[2]')
        sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a')

        # The following loads a serialized placeholder from disk.
        try:
            j = int(case_file.readline())
        except ValueError:
            j = 0
        case_file.close()
        while j < len(caseLinks):
            # iterate over each case, throwing it in the DB
            if DEBUG >= 1:
                print ''
            # like the scraper, we begin with the caseLink field (relative for
            # now, not absolute)
            caseLink = caseLinks[j].get('href')

            # sha1 is easy
            sha1Hash = sha1Hashes[j].text
            if DEBUG >= 4:
                print "SHA1 is: %s" % sha1Hash

            # using the caselink from above, and the volumeURL, we can get the
            # html
            absCaseLink = urljoin(volumeURL, caseLink)
            html = urllib2.urlopen(absCaseLink).read()
            htmlTree = fromstring(html)
            bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]')

            body = ""
            bodyText = ""
            for element in bodyContents:
                body += tostring(element)
                try:
                    bodyText += tostring(element, method='text')
                except UnicodeEncodeError:
                    # Happens with odd characters. Simply pass this iteration.
                    pass
            if DEBUG >= 5:
                print body
                print bodyText

            # need to figure out the court ID
            try:
                courtPs = htmlTree.xpath('//p[@class = "court"]')
                # Often the court ends up in the parties field.
                partiesPs = htmlTree.xpath("//p[@class= 'parties']")
                court = ""
                for courtP in courtPs:
                    court += tostring(courtP).lower()
                for party in partiesPs:
                    court += tostring(party).lower()
            except IndexError:
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                    court = raw_input("Please input court name (e.g. \"First Circuit of Appeals\"): ").lower()
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))
            if ('first' in court) or ('ca1' == court):
                court = 'ca1'
            elif ('second' in court) or ('ca2' == court):
                court = 'ca2'
            elif ('third' in court) or ('ca3' == court):
                court = 'ca3'
            elif ('fourth' in court) or ('ca4' == court):
                court = 'ca4'
            elif ('fifth' in court) or ('ca5' == court):
                court = 'ca5'
            elif ('sixth' in court) or ('ca6' == court):
                court = 'ca6'
            elif ('seventh' in court) or ('ca7' == court):
                court = 'ca7'
            elif ('eighth' in court) or ('ca8' == court):
                court = 'ca8'
            elif ('ninth' in court) or ('ca9' == court):
                court = 'ca9'
            elif ("tenth" in court) or ('ca10' == court):
                court = 'ca10'
            elif ("eleventh" in court) or ('ca11' == court):
                court = 'ca11'
            elif ('columbia' in court) or ('cadc' == court):
                court = 'cadc'
            elif ('federal' in court) or ('cafc' == court):
                court = 'cafc'
            elif ('patent' in court) or ('ccpa' == court):
                court = 'ccpa'
            elif (('emergency' in court) and ('temporary' not in court)) or ('eca' == court):
                court = 'eca'
            elif ('claims' in court) or ('uscfc' == court):
                court = 'uscfc'
            else:
                # No luck extracting the court name. Try the fix file.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    # Not yet in the fix file. Check if it's a crazy ca5 case
                    court = ''
                    ca5courtPs = htmlTree.xpath('//p[@class = "center"]')
                    for ca5courtP in ca5courtPs:
                        court += tostring(ca5courtP).lower()
                    if 'fifth circuit' in court:
                        court = 'ca5'
                    else:
                        court = False

                    if not court:
                        # Still no luck. Ask for input, then append it to
                        # the fix file.
                        print absCaseLink
                        if BROWSER:
                            subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                        court = raw_input("Unknown court. Input the court code to proceed successfully [%s]: " % saved_court)
                        court = court or saved_court
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))

            saved_court = court
            court = Court.objects.get(pk=court)
            if DEBUG >= 4:
                print "Court is: %s" % court

            # next: west_cite, docket_number and caseName. Full casename is gotten later.
            west_cite = caseLinks[j].text
            docket_number = absCaseLink.split('.')[-2]
            caseName = caseLinks[j].get('title')

            caseName, precedential_status = exceptional_cleaner(caseName)
            cite, new = hasDuplicate(caseName, west_cite, docket_number)
            if cite.caseNameShort == '':
                # No luck getting the case name
                savedCaseNameShort = check_fix_list(sha1Hash, case_name_short_dict)
                if not savedCaseNameShort:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                    caseName = raw_input("Short casename: ")
                    cite.caseNameShort = trunc(caseName, 100)
                    cite.caseNameFull = caseName
                    case_name_short_fix_file.write("%s|%s\n" % (sha1Hash, caseName))
                else:
                    # We got both the values from the save files. Use 'em.
                    cite.caseNameShort = trunc(savedCaseNameShort, 100)
                    cite.caseNameFull = savedCaseNameShort

                # The slug needs to be done here, b/c it is only done automatically
                # the first time the citation is saved, and this will be
                # at least the second.
                cite.slug = slugify(trunc(cite.caseNameShort, 75))
                cite.save()

            if DEBUG >= 4:
                print "precedential_status: " + precedential_status
                print "west_cite: " + cite.west_cite
                print "caseName: " + cite.caseNameFull

            # date is kinda tricky...details here:
            # http://pleac.sourceforge.net/pleac_python/datesandtimes.html
            rawDate = caseDates[j].find('a')
            try:
                if rawDate is not None:
                    # Special cases
                    if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6':
                        date_text = 'August 28, 1980'
                    elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932':
                        date_text = 'August 16, 1985'
                    elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7':
                        date_text = 'October 12, 1979'
                    elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff':
                        date_text = 'July 30, 1980'
                    elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640':
                        date_text = 'November 3, 1981'
                    elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2':
                        date_text = 'July 28, 1983'
                    else:
                        date_text = rawDate.text
                    try:
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B, %Y")[0:5])
                    except (ValueError, TypeError):
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B %d, %Y")[0:5])
                else:
                    # No value was found. Throw an exception.
                    raise ValueError
            except:
                # No date provided.
                try:
                    # Try to get it from the saved list
                    caseDate = datetime.datetime(*time.strptime(check_fix_list(sha1Hash, date_fix_dict), "%B %d, %Y")[0:5])
                except:
                    caseDate = False
                if not caseDate:
                    # Parse out the dates with debug set to false.
                    try:
                        dates = parse_dates(bodyText, False)
                    except OverflowError:
                        # Happens when we try to make a date from a very large number
                        dates = []
                    try:
                        first_date_found = dates[0]
                    except IndexError:
                        # No dates found.
                        first_date_found = False
                    if first_date_found == saved_caseDate:
                        # High likelihood of date being correct. Use it.
                        caseDate = saved_caseDate
                    else:
                        print absCaseLink
                        if BROWSER:
                            subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                        print "Unknown date. Possible options are:"
                        try:
                            print "  1) %s" % saved_caseDate.strftime("%B %d, %Y")
                        except AttributeError:
                            # Happens on first iteration when saved_caseDate has no strftime attribute.
                            try:
                                saved_caseDate = dates[0]
                                print "  1) %s" % saved_caseDate.strftime(
                                    "%B %d, %Y")
                            except IndexError:
                                # Happens when dates has no values.
                                print "  No options available."
                        for k, date in enumerate(dates[0:4]):
                            if date.year >= 1900:
                                # strftime can't handle dates before 1900.
                                print "  %s) %s" % (k + 2,
                                                    date.strftime("%B %d, %Y"))
                        choice = raw_input("Enter the date or an option to proceed [1]: ")
                        choice = choice or 1
                        if str(choice) == '1':
                            # The user chose the default. Use the saved value from the last case
                            caseDate = saved_caseDate
                        elif choice in ['2', '3', '4', '5']:
                            # The user chose an option between 2 and 5. Use it.
                            caseDate = dates[int(choice) - 2]
                        else:
                            # The user typed a new date. Use it.
                            caseDate = datetime.datetime(*time.strptime(choice, "%B %d, %Y")[0:5])
                    date_fix_file.write("%s|%s\n" % (sha1Hash, caseDate.strftime("%B %d, %Y")))

            # Used during the next iteration as the default value
            saved_caseDate = caseDate

            if DEBUG >= 3:
                print "caseDate is: %s" % caseDate

            try:
                doc, created = Document.objects.get_or_create(
                    sha1=sha1Hash, court=court)
            except MultipleObjectsReturned:
                # this shouldn't happen now that we're using SHA1 as the dup
                # check, but the old data is problematic, so we must catch this.
                created = False

            if created:
                # we only do this if it's new
                doc.html = body
                doc.sha1 = sha1Hash
                doc.download_url = "http://bulk.resource.org/courts.gov/c/F2/"\
                    + str(i + 178) + "/" + caseLink
                doc.date_filed = caseDate
                doc.source = "R"

                doc.precedential_status = precedential_status
                doc.citation = cite
                doc.save()

            if not created:
                # something is afoot. Throw a big error.
                print "Duplicate found at volume " + str(i + 1) + \
                    " and row " + str(j + 1) + "!!!!"
                print "Found document %s in the database with doc id of %d!" % (doc, doc.pk)
                exit(1)

            # save our location within the volume.
            j += 1
            case_file = open('../logs/case_file.txt', 'w')
            case_file.write(str(j))
            case_file.close()

        # save the last volume completed.
        i += 1
        vol_file = open('../logs/vol_file.txt', 'w')
        vol_file.write(str(i))
        vol_file.close()

        # Clear query cache, as it presents a memory leak
        db.reset_queries()

    return 0
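
# scrape_and_parse() assumes helpers for the pipe-delimited fix files it writes
# ("sha1|value" per line). Minimal sketches, assuming the obvious round-trip of
# that format (load_fix_files() presumably builds its three dicts this way; the
# real helpers may differ):
def load_fix_file(location):
    """Read a 'sha1|value' fix file into a dict keyed by sha1."""
    fix_dict = {}
    with open(location) as f:
        for line in f:
            sha1, value = line.rstrip('\n').split('|', 1)
            fix_dict[sha1] = value
    return fix_dict


def check_fix_list(sha1, fix_dict):
    """Return the saved value for a sha1, or False if there is none."""
    return fix_dict.get(sha1, False)
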
Example #14
def scrape_and_parse():
    """Traverses the dumps from resource.org, and puts them in the DB.

    Probably lots of ways to go about this, but I think the easiest will be the following:
     - look at the index page of all volumes, and follow all the links it has.
     - for each volume, look at its index page, and follow the link to all cases
     - for each case, collect information wisely.
     - put it all in the DB
    """

    # begin by loading up the fix files into memory
    court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files()

    results = []
    DEBUG = 4
    # Set to False to disable automatic browser usage. Else, set to the
    # command you want to run, e.g. 'firefox'
    BROWSER = False
    court_fix_file = open('../logs/f2_court_fix_file.txt', 'a')
    date_fix_file = open('../logs/f2_date_fix_file.txt', 'a')
    case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt', 'a')
    vol_file = open('../logs/vol_file.txt', 'r+')
    case_file = open('../logs/case_file.txt', 'r+')

    url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT
    openedURL = urllib2.urlopen(url)
    content = openedURL.read()
    openedURL.close()
    tree = fromstring(content)

    volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a')

    try:
        i = int(vol_file.readline())
    except ValueError:
        # the volume file is empty or otherwise failing.
        i = 0
    vol_file.close()

    if DEBUG >= 1:
        print "Number of remaining volumes is: %d" % (len(volumeLinks) - i)

    # used later, needs a default value.
    saved_caseDate = None
    saved_court = None
    while i < len(volumeLinks):
        # we iterate over every case in the volume
        volumeURL = volumeLinks[i].text + "/index.html"
        volumeURL = urljoin(url, volumeURL)
        if DEBUG >= 1:
            print "Current volumeURL is: %s" % volumeURL

        openedVolumeURL = urllib2.urlopen(volumeURL)
        content = openedVolumeURL.read()
        volumeTree = fromstring(content)
        openedVolumeURL.close()
        caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a')
        caseDates = volumeTree.xpath('//table/tbody/tr/td[2]')
        sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a')

        # The following loads a serialized placeholder from disk.
        try:
            j = int(case_file.readline())
        except ValueError:
            j = 0
        case_file.close()
        while j < len(caseLinks):
            # iterate over each case, throwing it in the DB
            if DEBUG >= 1:
                print ''
            # like the scraper, we begin with the caseLink field (relative for
            # now, not absolute)
            caseLink = caseLinks[j].get('href')

            # sha1 is easy
            sha1Hash = sha1Hashes[j].text
            if DEBUG >= 4:
                print "SHA1 is: %s" % sha1Hash

            # using the caselink from above, and the volumeURL, we can get the
            # html
            absCaseLink = urljoin(volumeURL, caseLink)
            html = urllib2.urlopen(absCaseLink).read()
            htmlTree = fromstring(html)
            bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]')

            body = ""
            bodyText = ""
            for element in bodyContents:
                body += tostring(element)
                try:
                    bodyText += tostring(element, method='text')
                except UnicodeEncodeError:
                    # Happens with odd characters. Simply pass this iteration.
                    pass
            if DEBUG >= 5:
                print body
                print bodyText

            # need to figure out the court ID
            try:
                courtPs = htmlTree.xpath('//p[@class = "court"]')
                # Often the court ends up in the parties field.
                partiesPs = htmlTree.xpath("//p[@class= 'parties']")
                court = ""
                for courtP in courtPs:
                    court += tostring(courtP).lower()
                for party in partiesPs:
                    court += tostring(party).lower()
            except IndexError:
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                    court = raw_input("Please input court name (e.g. \"First Circuit of Appeals\"): ").lower()
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))
            if ('first' in court) or ('ca1' == court):
                court = 'ca1'
            elif ('second' in court) or ('ca2' == court):
                court = 'ca2'
            elif ('third' in court) or ('ca3' == court):
                court = 'ca3'
            elif ('fourth' in court) or ('ca4' == court):
                court = 'ca4'
            elif ('fifth' in court) or ('ca5' == court):
                court = 'ca5'
            elif ('sixth' in court) or ('ca6' == court):
                court = 'ca6'
            elif ('seventh' in court) or ('ca7' == court):
                court = 'ca7'
            elif ('eighth' in court) or ('ca8' == court):
                court = 'ca8'
            elif ('ninth' in court) or ('ca9' == court):
                court = 'ca9'
            elif ("tenth" in court) or ('ca10' == court):
                court = 'ca10'
            elif ("eleventh" in court) or ('ca11' == court):
                court = 'ca11'
            elif ('columbia' in court) or ('cadc' == court):
                court = 'cadc'
            elif ('federal' in court) or ('cafc' == court):
                court = 'cafc'
            elif ('patent' in court) or ('ccpa' == court):
                court = 'ccpa'
            elif (('emergency' in court) and ('temporary' not in court)) or ('eca' == court):
                court = 'eca'
            elif ('claims' in court) or ('uscfc' == court):
                court = 'uscfc'
            else:
                # No luck extracting the court name. Try the fix file.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    # Not yet in the fix file. Check if it's a crazy ca5 case
                    court = ''
                    ca5courtPs = htmlTree.xpath('//p[@class = "center"]')
                    for ca5courtP in ca5courtPs:
                        court += tostring(ca5courtP).lower()
                    if 'fifth circuit' in court:
                        court = 'ca5'
                    else:
                        court = False

                    if not court:
                        # Still no luck. Ask for input, then append it to
                        # the fix file.
                        print absCaseLink
                        if BROWSER:
                            subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                        court = raw_input("Unknown court. Input the court code to proceed successfully [%s]: " % saved_court)
                        court = court or saved_court
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))

            saved_court = court
            court = Court.objects.get(pk=court)
            if DEBUG >= 4:
                print "Court is: %s" % court

            # next: west_cite, docket_number and caseName. Full casename is gotten later.
            west_cite = caseLinks[j].text
            docket_number = absCaseLink.split('.')[-2]
            caseName = caseLinks[j].get('title')

            caseName, precedential_status = exceptional_cleaner(caseName)
            cite, new = hasDuplicate(caseName, west_cite, docket_number)
            if cite.caseNameShort == '':
                # No luck getting the case name
                savedCaseNameShort = check_fix_list(sha1Hash, case_name_short_dict)
                if not savedCaseNameShort:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate()
                    caseName = raw_input("Short casename: ")
                    cite.caseNameShort = trunc(caseName, 100)
                    cite.caseNameFull = caseName
                    case_name_short_fix_file.write("%s|%s\n" % (sha1Hash, caseName))
                else:
                    # We got both the values from the save files. Use 'em.
                    cite.caseNameShort = trunc(savedCaseNameShort, 100)
                    cite.caseNameFull = savedCaseNameShort

                # The slug needs to be done here, b/c it is only done automatically
                # the first time the citation is saved, and this will be
                # at least the second.
                cite.slug = trunc(slugify(cite.caseNameShort), 50)
                cite.save()

            if DEBUG >= 4:
                print "precedential_status: " + precedential_status
                print "west_cite: " + cite.west_cite
                print "docket_number: " + cite.docket_number
                print "caseName: " + cite.caseNameFull

            # date is kinda tricky...details here:
            # http://pleac.sourceforge.net/pleac_python/datesandtimes.html
            rawDate = caseDates[j].find('a')
            try:
                if rawDate is not None:
                    # Special cases
                    if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6':
                        date_text = 'August 28, 1980'
                    elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932':
                        date_text = 'August 16, 1985'
                    elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7':
                        date_text = 'October 12, 1979'
                    elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff':
                        date_text = 'July 30, 1980'
                    elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640':
                        date_text = 'November 3, 1981'
                    elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2':
                        date_text = 'July 28, 1983'
                    else:
                        date_text = rawDate.text
                    try:
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B, %Y")[0:5])
                    except (ValueError, TypeError):
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B %d, %Y")[0:5])
                else:
                    # No value was found. Throw an exception.
                    raise ValueError
            except:
Example #15
def get_title(self, referer_id):
    """Get the HTML title for a page, trying again if failures occur.

    Idea here is that somebody will create a new page that embeds one of our
    maps. As soon as they do, we'll get an HTTP referer sent to us, which is
    great. Unfortunately, in many cases, the HTTP referer we receive is that of
    an in progress page or similar, NOT the page that's actually live. Thus,
    what we do is try the URL over and over, until we find success.

    If a title is found, the admins are notified.

    If not, the item is deleted (this is OK, however, b/c it'll be recreated if
    it should have existed).
    """
    # Set the exponential back off in case we need it, starting at 15 minutes,
    # then 30, 60, 120...
    countdown = 15 * 60 * (2 ** self.request.retries)
    retried_exceeded = (self.request.retries >= self.max_retries)

    referer = Referer.objects.get(pk=referer_id)
    if blacklisted_url(referer.url):
        return

    try:
        r = requests.get(
            referer.url,
            headers={'User-Agent': "CourtListener"},
            verify=False,  # Integrity of a referer's referent is not important.
        )
    except MissingSchema:
        return
    except TooManyRedirects:
        return

    try:
        r.raise_for_status()
    except HTTPError as exc:
        if retried_exceeded:
            # We're not wanted here. Maybe we'll have better luck another time.
            return
        raise self.retry(exc=exc, countdown=countdown)

    html_tree = html.fromstring(r.text)
    try:
        title = getattr(html_tree.xpath('//title')[0], 'text', '')
        if title is not None:
            title = title.strip()
    except IndexError:
        title = ''

    if title:
        referer.page_title = trunc(
            title,
            referer._meta.get_field('page_title').max_length,
        )
        referer.save()

        if new_title_for_viz(referer):
            # Only send the email if we haven't seen this page title before for
            # this visualization.
            email = emails['referer_detected']
            email_body = email['body'] % (referer.url, referer.page_title, reverse(
                    'admin:visualizations_referer_change',
                    args=(referer.pk,)
            ))
            send_mail(email['subject'], email_body, email['from'],
                      email['to'])
    else:
        try:
            # Create an exception to catch.
            raise Exception("Couldn't get title from HTML")
        except Exception as exc:
            if retried_exceeded:
                # We couldn't get the title. Let it go.
                return
            raise self.retry(exc=exc, countdown=countdown)
def scrape_and_parse():
    """Traverses the bulk data from public.resource.org, and puts them in the
    DB.

    Probably lots of ways to go about this, but I think the easiest will be the following:
     - look at the index page of all volumes, and follow all the links it has.
     - for each volume, look at its index page, and follow the link to all cases
     - for each case, collect information wisely.
     - put it all in the DB
    """

    # begin by loading up the fix files into memory
    court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files()

    results = []
    DEBUG = 4
    # Set to False to disable automatic browser usage. Else, set to the
    # command you want to run, e.g. 'firefox'
    BROWSER = False
    court_fix_file = open('../logs/f2_court_fix_file.txt', 'a')
    date_fix_file = open('../logs/f2_date_fix_file.txt', 'a')
    case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt',
                                    'a')
    vol_file = open('../logs/vol_file.txt', 'r+')
    case_file = open('../logs/case_file.txt', 'r+')

    url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT
    openedURL = urllib2.urlopen(url)
    content = openedURL.read()
    openedURL.close()
    tree = fromstring(content)

    volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a')

    try:
        i = int(vol_file.readline())
    except ValueError:
        # the volume file is empty or otherwise failing.
        i = 0
    vol_file.close()

    if DEBUG >= 1:
        print "Number of remaining volumes is: %d" % (len(volumeLinks) - i)

    # used later, needs a default value.
    saved_caseDate = None
    saved_court = None
    while i < len(volumeLinks):
        # we iterate over every case in the volume
        volumeURL = volumeLinks[i].text + "/index.html"
        volumeURL = urljoin(url, volumeURL)
        if DEBUG >= 1:
            print "Current volumeURL is: %s" % volumeURL

        openedVolumeURL = urllib2.urlopen(volumeURL)
        content = openedVolumeURL.read()
        volumeTree = fromstring(content)
        openedVolumeURL.close()
        caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a')
        caseDates = volumeTree.xpath('//table/tbody/tr/td[2]')
        sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a')

        # The following loads a serialized placeholder from disk.
        try:
            j = int(case_file.readline())
        except ValueError:
            j = 0
        case_file.close()
        while j < len(caseLinks):
            # iterate over each case, throwing it in the DB
            if DEBUG >= 1:
                print ''
            # like the scraper, we begin with the caseLink field (relative for
            # now, not absolute)
            caseLink = caseLinks[j].get('href')

            # sha1 is easy
            sha1Hash = sha1Hashes[j].text
            if DEBUG >= 4:
                print "SHA1 is: %s" % sha1Hash

            # using the caselink from above, and the volumeURL, we can get the
            # html
            absCaseLink = urljoin(volumeURL, caseLink)
            html = urllib2.urlopen(absCaseLink).read()
            htmlTree = fromstring(html)
            bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]')

            body = ""
            bodyText = ""
            for element in bodyContents:
                body += tostring(element)
                try:
                    bodyText += tostring(element, method='text')
                except UnicodeEncodeError:
                    # Happens with odd characters. Simply pass this iteration.
                    pass
            if DEBUG >= 5:
                print body
                print bodyText

            # need to figure out the court ID
            try:
                courtPs = htmlTree.xpath('//p[@class = "court"]')
                # Often the court ends up in the parties field.
                partiesPs = htmlTree.xpath("//p[@class= 'parties']")
                court = ""
                for courtP in courtPs:
                    court += tostring(courtP).lower()
                for party in partiesPs:
                    court += tostring(party).lower()
            except IndexError:
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    court = raw_input(
                        "Please input court name (e.g. \"First Circuit of Appeals\"): "
                    ).lower()
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))
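            # Normalize the court text (or a manually entered code) to a court
            # ID.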
            if ('first' in court) or ('ca1' == court):
                court = 'ca1'
            elif ('second' in court) or ('ca2' == court):
                court = 'ca2'
            elif ('third' in court) or ('ca3' == court):
                court = 'ca3'
            elif ('fourth' in court) or ('ca4' == court):
                court = 'ca4'
            elif ('fifth' in court) or ('ca5' == court):
                court = 'ca5'
            elif ('sixth' in court) or ('ca6' == court):
                court = 'ca6'
            elif ('seventh' in court) or ('ca7' == court):
                court = 'ca7'
            elif ('eighth' in court) or ('ca8' == court):
                court = 'ca8'
            elif ('ninth' in court) or ('ca9' == court):
                court = 'ca9'
            elif ("tenth" in court) or ('ca10' == court):
                court = 'ca10'
            elif ("eleventh" in court) or ('ca11' == court):
                court = 'ca11'
            elif ('columbia' in court) or ('cadc' == court):
                court = 'cadc'
            elif ('federal' in court) or ('cafc' == court):
                court = 'cafc'
            elif ('patent' in court) or ('ccpa' == court):
                court = 'ccpa'
            elif (('emergency' in court) and
                  ('temporary' not in court)) or ('eca' == court):
                court = 'eca'
            elif ('claims' in court) or ('uscfc' == court):
                court = 'uscfc'
            else:
                # No luck extracting the court name. Try the fix file.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    # Not yet in the fix file. Check if it's a crazy ca5 case
                    court = ''
                    ca5courtPs = htmlTree.xpath('//p[@class = "center"]')
                    for ca5courtP in ca5courtPs:
                        court += tostring(ca5courtP).lower()
                    if 'fifth circuit' in court:
                        court = 'ca5'
                    else:
                        court = False

                    if not court:
                        # Still no luck. Ask for input, then append it to
                        # the fix file.
                        print absCaseLink
                        if BROWSER:
                            subprocess.Popen([BROWSER, absCaseLink],
                                             shell=False).communicate()
                        court = raw_input(
                            "Unknown court. Input the court code to proceed successfully [%s]: "
                            % saved_court)
                        court = court or saved_court
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))

            saved_court = court
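            # Swap the court ID for its Court object in the DB.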
            court = Court.objects.get(pk=court)
            if DEBUG >= 4:
                print "Court is: %s" % court

            # Next: west_cite, docket_number and caseName. The full case name
            # comes later.
            west_cite = caseLinks[j].text
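            # The docket number is the second-to-last dot-separated piece of
            # the case URL.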
            docket_number = absCaseLink.split('.')[-2]
            caseName = caseLinks[j].get('title')

            caseName, precedential_status = exceptional_cleaner(caseName)
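            # Look for an existing citation with these values; the flag says
            # whether a new one had to be created.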
            cite, new = hasDuplicate(caseName, west_cite, docket_number)
            if cite.caseNameShort == '':
                # No luck getting the case name
                savedCaseNameShort = check_fix_list(sha1Hash,
                                                    case_name_short_dict)
                if not savedCaseNameShort:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    caseName = raw_input("Short casename: ")
                    cite.caseNameShort = trunc(caseName, 100)
                    cite.caseNameFull = caseName
                    case_name_short_fix_file.write("%s|%s\n" %
                                                   (sha1Hash, caseName))
                else:
                    # We got the value from the fix file. Use it for both the
                    # short and full names.
                    cite.caseNameShort = trunc(savedCaseNameShort, 100)
                    cite.caseNameFull = savedCaseNameShort

                # The slug needs to be done here, b/c it is only done automatically
                # the first time the citation is saved, and this will be
                # at least the second.
                cite.slug = slugify(trunc(cite.caseNameShort, 75))
                cite.save()

            if DEBUG >= 4:
                print "precedential_status: " + precedential_status
                print "west_cite: " + cite.west_cite
                print "caseName: " + cite.caseNameFull

            # date is kinda tricky...details here:
            # http://pleac.sourceforge.net/pleac_python/datesandtimes.html
            rawDate = caseDates[j].find('a')
            try:
                if rawDate is not None:
                    # Special cases
                    if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6':
                        date_text = 'August 28, 1980'
                    elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932':
                        date_text = 'August 16, 1985'
                    elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7':
                        date_text = 'October 12, 1979'
                    elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff':
                        date_text = 'July 30, 1980'
                    elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640':
                        date_text = 'November 3, 1981'
                    elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2':
                        date_text = 'July 28, 1983'
                    else:
                        date_text = rawDate.text
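                    # Some dates in the index omit the day (e.g. "August, 1980"),
                    # so try the month-year format first and fall back to the
                    # full date.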
                    try:
                        caseDate = datetime.datetime(
                            *time.strptime(date_text, "%B, %Y")[0:5])
                    except (ValueError, TypeError):
                        caseDate = datetime.datetime(
                            *time.strptime(date_text, "%B %d, %Y")[0:5])
                else:
                    # No value was found. Throw an exception.
                    raise ValueError
            except: