Example #1
def test_anonymize(self):
        """Can we properly anonymize SSNs, EINs, and A-Numbers?"""
        # Simple cases. Anonymize them.
        self.assertEqual(anonymize('111-11-1111'), ('XXX-XX-XXXX', True))
        self.assertEqual(anonymize('11-1111111'), ('XX-XXXXXXX', True))
        self.assertEqual(anonymize('A11111111'), ('AXXXXXXXX', True))
        self.assertEqual(anonymize('A111111111'), ('AXXXXXXXX', True))

        # Starting or ending with letters isn't an SSN
        self.assertEqual(anonymize('A111-11-1111'), ('A111-11-1111', False))
        self.assertEqual(anonymize('111-11-1111A'), ('111-11-1111A', False))

        # Matches in a sentence
        self.assertEqual(
            anonymize('Term 111-11-1111 Term'),
            ('Term XXX-XX-XXXX Term', True),
        )
        self.assertEqual(
            anonymize('Term 11-1111111 Term'),
            ('Term XX-XXXXXXX Term', True),
        )
        self.assertEqual(
            anonymize('Term A11111111 Term'),
            ('Term AXXXXXXXX Term', True),
        )

        # Multiple matches
        self.assertEqual(
            anonymize("Term 111-11-1111 Term 111-11-1111 Term"),
            ('Term XXX-XX-XXXX Term XXX-XX-XXXX Term', True),
        )
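
The assertions above pin down the expected behavior: SSNs, EINs, and A-Numbers are replaced with X-masked placeholders, and the second element of the returned tuple reports whether anything was replaced. A minimal regex-based sketch that satisfies these cases (an assumption inferred from the tests, not the project's actual implementation) could look like this:

import re

def anonymize(text):
    # Replace SSNs (123-45-6789), EINs (12-3456789), and A-Numbers
    # (A followed by 8 or 9 digits) with X-masked placeholders.
    # Returns (cleaned_text, modified), where modified is True if at
    # least one substitution was made.
    patterns = [
        # SSN, not butted up against letters, digits, or extra dashes
        (re.compile(r"(?<![A-Za-z\d-])\d{3}-\d{2}-\d{4}(?![A-Za-z\d-])"),
         "XXX-XX-XXXX"),
        # EIN
        (re.compile(r"(?<![A-Za-z\d-])\d{2}-\d{7}(?![A-Za-z\d-])"),
         "XX-XXXXXXX"),
        # A-Number: both the 8- and 9-digit forms collapse to the same mask
        (re.compile(r"(?<![A-Za-z\d])A\d{8,9}(?![A-Za-z\d])"),
         "AXXXXXXXX"),
    ]
    modified = False
    for pattern, replacement in patterns:
        text, n = pattern.subn(replacement, text)
        modified = modified or n > 0
    return text, modified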
Example #2
    def test_anonymize(self) -> None:
        """Can we properly anonymize SSNs, EINs, and A-Numbers?"""
        # Simple cases. Anonymize them.
        self.assertEqual(anonymize("111-11-1111"), ("XXX-XX-XXXX", True))
        self.assertEqual(anonymize("11-1111111"), ("XX-XXXXXXX", True))
        self.assertEqual(anonymize("A11111111"), ("AXXXXXXXX", True))
        self.assertEqual(anonymize("A111111111"), ("AXXXXXXXX", True))

        # Starting or ending with letters isn't an SSN
        self.assertEqual(anonymize("A111-11-1111"), ("A111-11-1111", False))
        self.assertEqual(anonymize("111-11-1111A"), ("111-11-1111A", False))

        # Matches in a sentence
        self.assertEqual(
            anonymize("Term 111-11-1111 Term"),
            ("Term XXX-XX-XXXX Term", True),
        )
        self.assertEqual(
            anonymize("Term 11-1111111 Term"), ("Term XX-XXXXXXX Term", True)
        )
        self.assertEqual(
            anonymize("Term A11111111 Term"), ("Term AXXXXXXXX Term", True)
        )

        # Multiple matches
        self.assertEqual(
            anonymize("Term 111-11-1111 Term 111-11-1111 Term"),
            ("Term XXX-XX-XXXX Term XXX-XX-XXXX Term", True),
        )
Example #4
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        opinion, content, err = extract_from_pdf(opinion, path, DEVNULL,
                                                 callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        opinion, content, err = extract_from_wpd(opinion, path, DEVNULL)
    else:
        print(
            '*****Unable to extract content due to unknown extension: %s '
            'on opinion: %s****' % (extension, opinion))
        return 2

    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)

    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return opinion

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception as e:
        print("****Error saving text to the db for: %s****" % opinion)
        print(traceback.format_exc())
        return opinion
Example #5
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path, DEVNULL)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        opinion, content, err = extract_from_pdf(opinion, path, DEVNULL, callback)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        opinion, content, err = extract_from_wpd(opinion, path, DEVNULL)
    else:
        print ('*****Unable to extract content due to unknown extension: %s '
               'on opinion: %s****' % (extension, opinion))
        return 2

    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)

    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print ("****Error extracting text from %s: %s****" %
               (extension, opinion))
        return opinion

    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception as e:
        print("****Error saving text to the db for: %s****" % opinion)
        print(traceback.format_exc())
        return opinion
Example #6
def extract_recap_pdf(pk, skip_ocr=False):
    doc = RECAPDocument.objects.get(pk=pk)
    path = doc.filepath_local.path
    process = make_pdftotext_process(path)
    content, err = process.communicate()

    if needs_ocr(content):
        if not skip_ocr:
            # probably an image PDF. Send it to OCR.
            success, content = extract_by_ocr(path)
            if success:
                doc.ocr_status = RECAPDocument.OCR_COMPLETE
            elif content == u"" or not success:
                content = u"Unable to extract document content."
                doc.ocr_status = RECAPDocument.OCR_FAILED
        else:
            content = u""
            doc.ocr_status = RECAPDocument.OCR_NEEDED
    else:
        doc.ocr_status = RECAPDocument.OCR_UNNECESSARY

    doc.plain_text, _ = anonymize(content)
    doc.save()

    return path
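
The needs_ocr and extract_by_ocr helpers are referenced here but not shown. A plausible stand-in for the check (a hedged assumption, not necessarily the project's heuristic) is to treat pdftotext output that contains almost no alphanumeric text as a scanned image:

def needs_ocr(content):
    # If the extracted text has essentially no letters or digits, the PDF
    # is probably an image-only scan and should be routed to OCR instead.
    if isinstance(content, bytes):  # older callers pass raw bytes
        content = content.decode("utf-8", errors="ignore")
    return sum(c.isalnum() for c in content) < 10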
Example #7
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    docket = Docket(
        docket_number=get_docket_number(case_location),
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        case_name=case_name,
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    docket.save()
    doc.docket = docket
    doc.save()

    # Update the citation graph
    from cl.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #8
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    docket = Docket(
        docket_number=get_docket_number(case_location),
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        case_name=case_name,
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    docket.save()
    doc.docket = docket
    doc.save()

    # Update the citation graph
    from cl.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #9
def extract_recap_pdf(
    pks: Union[int, List[int]],
    skip_ocr: bool = False,
    check_if_needed: bool = True,
) -> List[int]:
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Abort early if the item doesn't need extraction and the caller
            # hasn't disabled this early-abort check.
            processed.append(pk)
            continue

        with NamedTemporaryFile(
            prefix="extract_file_",
            suffix=".pdf",
            buffering=0,  # Make sure it's on disk when we try to use it
        ) as tmp:
            tmp.write(rd.filepath_local.read())
            process = make_pdftotext_process(tmp.name)
            content, err = process.communicate()
            content = content.decode()

            if needs_ocr(content):
                if not skip_ocr:
                    # probably an image PDF. Send it to OCR.
                    success, content = extract_by_ocr(tmp.name)
                    if success:
                        rd.ocr_status = RECAPDocument.OCR_COMPLETE
                    elif content == "" or not success:
                        content = "Unable to extract document content."
                        rd.ocr_status = RECAPDocument.OCR_FAILED
                else:
                    content = ""
                    rd.ocr_status = RECAPDocument.OCR_NEEDED
            else:
                rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)
        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
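
The is_iter helper that lets this task accept either a single pk or a list of pks is not defined in this listing; a small stand-in (assumed, not the project's code) would be:

def is_iter(item):
    # Treat lists, tuples, and sets as collections of pks; strings and
    # single ints fall through and get wrapped in a list by the caller.
    return isinstance(item, (list, tuple, set))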
Example #10
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Abort early if the item doesn't need extraction and the caller
            # hasn't disabled this early-abort check.
            processed.append(pk)
            continue
        path = rd.filepath_local.path
        process = make_pdftotext_process(path)
        content, err = process.communicate()
        content = content.decode()

        if needs_ocr(content):
            if not skip_ocr:
                # probably an image PDF. Send it to OCR.
                success, content = extract_by_ocr(path)
                if success:
                    rd.ocr_status = RECAPDocument.OCR_COMPLETE
                elif content == "" or not success:
                    content = "Unable to extract document content."
                    rd.ocr_status = RECAPDocument.OCR_FAILED
            else:
                content = ""
                rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)
        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
Example #11
def extract_recap_pdf(pks, skip_ocr=False, check_if_needed=True):
    """Extract the contents from a RECAP PDF if necessary."""
    if not is_iter(pks):
        pks = [pks]

    processed = []
    for pk in pks:
        rd = RECAPDocument.objects.get(pk=pk)
        if check_if_needed and not rd.needs_extraction:
            # Abort early if the item doesn't need extraction and the caller
            # hasn't disabled this early-abort check.
            processed.append(pk)
            continue
        path = rd.filepath_local.path
        process = make_pdftotext_process(path)
        content, err = process.communicate()

        if needs_ocr(content):
            if not skip_ocr:
                # probably an image PDF. Send it to OCR.
                success, content = extract_by_ocr(path)
                if success:
                    rd.ocr_status = RECAPDocument.OCR_COMPLETE
                elif content == u'' or not success:
                    content = u'Unable to extract document content.'
                    rd.ocr_status = RECAPDocument.OCR_FAILED
            else:
                content = u''
                rd.ocr_status = RECAPDocument.OCR_NEEDED
        else:
            rd.ocr_status = RECAPDocument.OCR_UNNECESSARY

        rd.plain_text, _ = anonymize(content)
        # Do not do indexing here. Creates race condition in celery.
        rd.save(index=False, do_extraction=False)
        processed.append(pk)

    return processed
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Import the corpus provided by lawbox')
    parser.add_argument(
        '-s',
        '--simulate',
        default=False,
        required=False,
        action='store_true',
        help='Run the code in simulate mode, making no permanent changes.')
    parser.add_argument(
        '-d',
        '--dir',
        type=readable_dir,
        help='The directory where the lawbox bulk data can be found.')
    parser.add_argument(
        '-f',
        '--file',
        type=str,
        default="index.txt",
        required=False,
        dest="file_name",
        help="The file that has all the URLs to import, one per line.")
    parser.add_argument(
        '-l',
        '--line',
        type=int,
        default=1,
        required=False,
        help=
        'If provided, this will be the line number in the index file where we resume processing.'
    )
    parser.add_argument(
        '-r',
        '--resume',
        default=False,
        required=False,
        action='store_true',
        help='Use the saved marker to resume operation where it last failed.')
    parser.add_argument('-x',
                        '--random',
                        default=False,
                        required=False,
                        action='store_true',
                        help='Pick cases randomly rather than serially.')
    parser.add_argument(
        '-m',
        '--marker',
        type=str,
        default='lawbox_progress_marker.txt',
        required=False,
        help=
        "The name of the file that tracks the progress (useful if multiple versions run at same time)"
    )
    parser.add_argument('-e',
                        '--end',
                        type=int,
                        required=False,
                        default=2000000,
                        help="An optional endpoint for an importer.")
    args = parser.parse_args()

    if args.dir:

        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, '*'):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)  # the flag is --dir, so argparse stores it as args.dir
        i = 0
    else:

        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            enumerated_line_number = line_number - 1  # The enumeration is zero-index, but files are one-index.
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if 'counter' in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" %
                      (datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ''
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                        # Save nothing to the index for now (it'll get done
                        # when we find citations)
                    doc.save(index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if 'log_multimerge' in DEBUG:
                        with open('index_multimerge.txt', 'a') as log:
                            log.write('%s\n' % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode
                with open(args.marker, 'w') as marker:
                    marker.write(str(i +
                                     1))  # Files are one-index, not zero-index
            with open('lawbox_fix_file.pkl', 'wb') as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print(
                    "Hit the endpoint after importing number %s. Breaking." %
                    i)
                break
        except Exception as err:
            log_print(traceback.format_exc())
            exit(1)
Example #13
def update_docket_appellate_metadata(d, docket_data):
    """Update the metadata specific to appellate cases."""
    if not any([
            docket_data.get('originating_court_information'),
            docket_data.get('appeal_from'),
            docket_data.get('panel')
    ]):
        # Probably not appellate.
        return d, None

    d.panel_str = ', '.join(docket_data.get('panel', [])) or d.panel_str
    d.appellate_fee_status = docket_data.get('fee_status',
                                             '') or d.appellate_fee_status
    d.appellate_case_type_information = docket_data.get(
        'case_type_information', '') or d.appellate_case_type_information
    d.appeal_from_str = docket_data.get('appeal_from', '') or d.appeal_from_str

    # Do originating court information dict
    og_info = docket_data.get('originating_court_information')
    if not og_info:
        return d, None

    if og_info.get('court_id'):
        cl_id = map_pacer_to_cl_id(og_info['court_id'])
        if Court.objects.filter(pk=cl_id).exists():
            # Ensure the court exists. Sometimes PACER does weird things,
            # like in 14-1743 in CA3, where it says the court_id is 'uspci'.
            # If we don't do this check, the court ID could be invalid, and
            # our whole save of the docket fails.
            d.appeal_from_id = cl_id

    if d.originating_court_information:
        d_og_info = d.originating_court_information
    else:
        d_og_info = OriginatingCourtInformation()

    # Ensure we don't share A-Numbers, which can sometimes be in the docket
    # number field.
    docket_number = og_info.get('docket_number', '') or d_og_info.docket_number
    docket_number, _ = anonymize(docket_number)
    d_og_info.docket_number = docket_number
    d_og_info.court_reporter = og_info.get('court_reporter',
                                           '') or d_og_info.court_reporter
    d_og_info.date_disposed = og_info.get(
        'date_disposed') or d_og_info.date_disposed
    d_og_info.date_filed = og_info.get('date_filed') or d_og_info.date_filed
    d_og_info.date_judgment = og_info.get(
        'date_judgment') or d_og_info.date_judgment
    d_og_info.date_judgment_eod = og_info.get(
        'date_judgment_eod') or d_og_info.date_judgment_eod
    d_og_info.date_filed_noa = og_info.get(
        'date_filed_noa') or d_og_info.date_filed_noa
    d_og_info.date_received_coa = og_info.get(
        'date_received_coa') or d_og_info.date_received_coa
    d_og_info.assigned_to_str = og_info.get(
        'assigned_to') or d_og_info.assigned_to_str
    d_og_info.ordering_judge_str = og_info.get(
        'ordering_judge') or d_og_info.ordering_judge_str

    if not all([d.appeal_from_id, d_og_info.date_filed]):
        # Can't do judge lookups. Call it quits.
        return d, d_og_info

    if og_info.get('assigned_to'):
        judges = get_candidate_judges(og_info['assigned_to'], d.appeal_from_id,
                                      d_og_info.date_filed)
        if judges is not None and len(judges) == 1:
            d_og_info.assigned_to = judges[0]

    if og_info.get('ordering_judge'):
        judges = get_candidate_judges(og_info['ordering_judge'],
                                      d.appeal_from_id, d_og_info.date_filed)
        if judges is not None and len(judges) == 1:
            d_og_info.ordering_judge = judges[0]

    return d, d_og_info
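
The originating-court record is returned unsaved, so a caller has to persist it and attach it to the docket. A hedged caller-side sketch (assumed wiring, not shown in this listing):

d, d_og_info = update_docket_appellate_metadata(d, docket_data)
if d_og_info is not None:
    d_og_info.save()
    d.originating_court_information = d_og_info
d.save()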
Example #14
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the citation
    parsing code. This can be useful to spread these tasks out when doing a
    larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "docx":
        content, err = extract_from_docx(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        content, err = extract_from_wpd(path, opinion)
    else:
        print("*****Unable to extract content due to unknown extension: %s "
              "on opinion: %s****" % (extension, opinion))
        return

    assert isinstance(
        content, str), "content must be of type str, not %s" % type(content)

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    update_document_from_text(opinion)

    if err:
        print(err)
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk], ), countdown=random.randint(0, 3600))
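
If this function is registered as a Celery task, as the apply_async calls in these examples suggest, it would typically be queued rather than called inline; a hedged usage sketch (the task registration itself is assumed):

extract_doc_content.delay(opinion.pk, do_ocr=True, citation_jitter=True)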
Example #15
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

     Merging is done by picking the best fields from each item.
    """
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # !!  THIS CODE IS OUT OF DATE AND UNMAINTAINED. FEEL FREE TO FIX IT, BUT !!
    # !!                 DO NOT TRUST IT IN ITS CURRENT STATE.                !!
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    target = OpinionCluster.objects.get(pk=target_id)
    print "Merging %s with" % new.case_name
    print "        %s" % target.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing public.resource.org's
    # info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.slug = slugify(trunc(new.case_name, 75))

    # Take the case name from the new item; they tend to be pretty good
    target.case_name = new.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.docket.docket_number:
        target.docket.docket_number = new.docket.docket_number

    # Get the citations from the new item (ditch the old).
    target.federal_cite_one = new.federal_cite_one
    target.federal_cite_two = new.federal_cite_two
    target.federal_cite_three = new.federal_cite_three
    target.state_cite_one = new.state_cite_one
    target.state_cite_two = new.state_cite_two
    target.state_cite_three = new.state_cite_three
    target.state_cite_regional = new.state_cite_regional
    target.specialty_cite_one = new.specialty_cite_one
    target.scotus_early_cite = new.scotus_early_cite
    target.lexis_cite = new.lexis_cite
    target.westlaw_cite = new.westlaw_cite
    target.neutral_cite = new.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.
Example #16
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        opinion, content, err = extract_from_pdf(opinion, path, callback)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        opinion, content, err = extract_from_wpd(opinion, path)
    else:
        print(
            "*****Unable to extract content due to unknown extension: %s "
            "on opinion: %s****" % (extension, opinion)
        )
        return 2

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print ("****Error extracting text from %s: %s****" % (extension, opinion))
        return opinion

    # Save item, and index Solr if needed.
    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        print "****Error saving text to the db for: %s****" % opinion
        print traceback.format_exc()
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async((opinion.pk,), countdown=citation_countdown)

    return opinion
Example #17
def merge_cases_simple(new, target_id):
    """Add `new` to the database, merging with target_id

     Merging is done by picking the best fields from each item.
    """
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # !!  THIS CODE IS OUT OF DATE AND UNMAINTAINED. FEEL FREE TO FIX IT, BUT !!
    # !!                 DO NOT TRUST IT IN ITS CURRENT STATE.                !!
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    target = OpinionCluster.objects.get(pk=target_id)
    print "Merging %s with" % new.case_name
    print "        %s" % target.case_name

    cached_source = target.source  # Original value is needed below.
    if target.source == 'C':
        target.source = 'LC'
    elif target.source == 'R':
        target.source = 'LR'
    elif target.source == 'CR':
        target.source = 'LCR'

    # Add the URL if it's not a court one, replacing public.resource.org's
    # info in some cases.
    if cached_source == 'R':
        target.download_url = new.download_url

    # Recreate the slug from the new case name (this changes the URL, but the
    # old will continue working)
    target.slug = slugify(trunc(new.case_name, 75))

    # Take the case name from the new item; they tend to be pretty good
    target.case_name = new.case_name

    # Add the docket number if the old doesn't exist, but keep the old if one
    # does.
    if not target.docket.docket_number:
        target.docket.docket_number = new.docket.docket_number

    # Get the citations from the new item (ditch the old).
    target.federal_cite_one = new.federal_cite_one
    target.federal_cite_two = new.federal_cite_two
    target.federal_cite_three = new.federal_cite_three
    target.state_cite_one = new.state_cite_one
    target.state_cite_two = new.state_cite_two
    target.state_cite_three = new.state_cite_three
    target.state_cite_regional = new.state_cite_regional
    target.specialty_cite_one = new.specialty_cite_one
    target.scotus_early_cite = new.scotus_early_cite
    target.lexis_cite = new.lexis_cite
    target.westlaw_cite = new.westlaw_cite
    target.neutral_cite = new.neutral_cite

    # Add judge information if lacking. New is dirty, but better than none.
    if not target.judges:
        target.judges = new.judges

    # Add the text.
    target.html_lawbox, blocked = anonymize(new.html)
    if blocked:
        target.blocked = True
        target.date_blocked = now()

    target.extracted_by_ocr = False  # No longer true for any LB case.
Example #18
def main():
    parser = argparse.ArgumentParser(
        description='Import the corpus provided by lawbox')
    parser.add_argument('-s', '--simulate', default=False, required=False,
                        action='store_true',
                        help='Run the code in simulate mode, making no permanent changes.')
    parser.add_argument('-d', '--dir', type=readable_dir,
                        help='The directory where the lawbox bulk data can be found.')
    parser.add_argument('-f', '--file', type=str, default="index.txt",
                        required=False, dest="file_name",
                        help="The file that has all the URLs to import, one per line.")
    parser.add_argument('-l', '--line', type=int, default=1, required=False,
                        help='If provided, this will be the line number in the index file where we resume processing.')
    parser.add_argument('-r', '--resume', default=False, required=False,
                        action='store_true',
                        help='Use the saved marker to resume operation where it last failed.')
    parser.add_argument('-x', '--random', default=False, required=False,
                        action='store_true',
                        help='Pick cases randomly rather than serially.')
    parser.add_argument('-m', '--marker', type=str,
                        default='lawbox_progress_marker.txt', required=False,
                        help="The name of the file that tracks the progress (useful if multiple versions run at same time)")
    parser.add_argument('-e', '--end', type=int, required=False,
                        default=2000000,
                        help="An optional endpoint for an importer.")
    args = parser.parse_args()

    if args.dir:
        def case_generator(dir_root):
            """Yield cases, one by one to the importer by recursing and iterating the import directory"""
            for root, dirnames, filenames in os.walk(dir_root):
                for filename in fnmatch.filter(filenames, '*'):
                    yield os.path.join(root, filename)

        cases = case_generator(args.dir)  # the flag is --dir, so argparse stores it as args.dir
        i = 0
    else:
        def generate_random_line(file_name):
            while True:
                total_bytes = os.stat(file_name).st_size
                random_point = random.randint(0, total_bytes)
                f = open(file_name)
                f.seek(random_point)
                f.readline()  # skip this line to clear the partial line
                yield f.readline().strip()

        def case_generator(line_number):
            """Yield cases from the index file."""
            enumerated_line_number = line_number - 1  # The enumeration is zero-index, but files are one-index.
            index_file = open(args.file_name)
            for i, line in enumerate(index_file):
                if i >= enumerated_line_number:
                    yield line.strip()

        if args.random:
            cases = generate_random_line(args.file_name)
            i = 0
        elif args.resume:
            with open(args.marker) as marker:
                resume_point = int(marker.read().strip())
            cases = case_generator(resume_point)
            i = resume_point
        else:
            cases = case_generator(args.line)
            i = args.line

    for case_path in cases:
        if i % 1000 == 0:
            db.reset_queries()  # Else we leak memory when DEBUG is True

        if 'counter' in DEBUG:  # and i % 1000 == 0:
            log_print("\n%s: Doing case (%s): file://%s" % (
                datetime.datetime.now(), i, case_path))
        try:
            doc = import_law_box_case(case_path)
            duplicates = find_duplicates(doc, case_path)
            if not args.simulate:
                if len(duplicates) == 0:
                    doc.html_lawbox, blocked = anonymize(doc.html)
                    doc.html = ''
                    if blocked:
                        doc.blocked = True
                        doc.date_blocked = now()
                        # Save nothing to the index for now (it'll get done
                        # when we find citations)
                    doc.save(index=False)
                if len(duplicates) == 1:
                    dup_helpers.merge_cases_simple(doc, duplicates[0])
                if len(duplicates) > 1:
                    # complex_merge
                    if 'log_multimerge' in DEBUG:
                        with open('index_multimerge.txt', 'a') as log:
                            log.write('%s\n' % case_path)
            if args.resume:
                # Don't change the progress marker unless you're in resume mode
                with open(args.marker, 'w') as marker:
                    marker.write(
                        str(i + 1))  # Files are one-index, not zero-index
            with open('lawbox_fix_file.pkl', 'wb') as fix_file:
                pickle.dump(fixes, fix_file)
            i += 1
            if i == args.end:
                log_print(
                    "Hit the endpoint after importing number %s. Breaking." % i)
                break
        except Exception as err:
            log_print(traceback.format_exc())
            exit(1)
Example #19
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the citation
    parsing code. This can be useful to spread these tasks out when doing a
    larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path)
    elif extension == 'docx':
        content, err = extract_from_docx(path)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        content, err = extract_from_wpd(path, opinion)
    else:
        print ('*****Unable to extract content due to unknown extension: %s '
               'on opinion: %s****' % (extension, opinion))
        return

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print ("****Error extracting text from %s: %s****" %
               (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],),
        countdown=random.randint(0, 3600)
    )