Example no. 1
def get_doc_path(doc_id):
    '''
    Get path for a single document, if it exists
    Note: not very performant for a large number of calls; if checking multiple documents, use get_doc_path_many
    Input:
        - doc_id (str): a single document id e.g. 'ilnd;;1-16-cv-02872_110'
    Output:
        - (Path) returns the path to the document pdf if it exists e.g. Path('.../ilnd/docs/ilnd;;1-16-cv-02872_110_u123131_t123123.pdf'),
            otherwise returns None if the document does not exist
    '''
    # Get the court and case year from the doc_id
    ucid, _ = doc_id.split('_', maxsplit=1)
    ucid_data = dtools.parse_ucid(ucid)
    court = ucid_data['court']
    year_part = decompose_caseno(ucid_data['case_no'])['year']

    # Use glob to get candidate list (will also return attachments with subindexes e.g. "_3...", "_3_1...", "_3_2...")
    cand = (settings.PACER_PATH / court / 'docs' / year_part).glob(doc_id + '*')
    # Filter to the correct doc id
    for fpath in cand:

        cand_id = parse_document_fname(fpath.name).get('doc_id') or None
        if cand_id == doc_id:
            return fpath
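
A minimal usage sketch (assuming the surrounding module's dependencies, e.g. settings and dtools, are importable and the PACER docs directory is populated; the doc_id is the one from the docstring):

# Hypothetical usage: resolve the PDF path for a single document id
doc_path = get_doc_path('ilnd;;1-16-cv-02872_110')
if doc_path is not None:
    print(f'Found document at {doc_path}')
else:
    print('Document not found on disk')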
Example no. 2
def get_expected_path(ucid,
                      subdir='json',
                      pacer_path=settings.PACER_PATH,
                      def_no=None,
                      update_ind=None):
    '''
    Find the expected path of case-level data files

    Inputs:
        - ucid (str): case ucid
        - subdir (str): the subdirectory to look in (see scrapers.PacerCourtDir), one of 'html', 'json', 'docs', 'summaries', 'members'
        - pacer_path (Path): path to pacer data directory
        - def_no (str or int): the defendant no., if specifying a defendant-specific docket
        - update_ind (int): update index (for html files), passed through to generate_docket_filename
    Output:
        (Path) the path to where the file should exist (regardless of whether it does or not)
    '''
    # Get the caseno from the ucid
    ucid_data = dtools.parse_ucid(ucid)
    court, case_no = ucid_data['court'], ucid_data['case_no']
    year_part = decompose_caseno(case_no)['year']

    # Build the filepath
    ext = SUBDIR_EXTENSIONS[subdir]
    fname = generate_docket_filename(case_no,
                                     ext=ext,
                                     def_no=def_no,
                                     ind=update_ind)

    return pacer_path / court / subdir / year_part / fname
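
A hedged usage sketch, assuming a ucid in the 'court;;case_no' form seen in the other examples:

# Hypothetical usage: where the case-level JSON is expected to live
# (the path is returned whether or not the file actually exists)
expected = get_expected_path('ilnd;;1:16-cv-02872', subdir='json')
print(expected)  # <pacer_path>/ilnd/json/<year_part>/<generated filename>.json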
Example no. 3
def parse_document_fname(fname, parse_ucid_data=False):
    '''
    Parse a document filename, return the component parts as a dict.
    Note: this also handles the old document filename format (which did not include the user and timestamp parts)

    Inputs:
        - fname (str): a document filename e.g. "ilnd;;1-16-cv-11315_20_u7905a347_t201007.pdf"
        - parse_ucid_data (bool): whether to also parse the ucid data, and store that under the 'ucid_data' key
    Output:
        (dict) metadata parsed from the filename
    '''

    res = {}

    re_doc_id = r"(?P<ucid_no_colon>[a-z0-9;\-]+)_(?P<index>\d+)(_(?P<att_index>\d+))?"
    re_download_name = rf"(?P<doc_id>{re_doc_id})_u(?P<user_hash>[a-z0-9]+)_t(?P<download_time>[0-9\-]+)\.(?P<ext>.+)"
    re_old = rf"(?P<doc_id>{re_doc_id})(?P<ext>.+)"  #old format

    # Try standard name first
    match = re.match(re_download_name, fname)
    # If not, try the old naming system
    if not match:
        match = re.match(re_old, fname)

    if match:
        res = match.groupdict()
        res['ucid'] = res['ucid_no_colon'].replace('-', ':', 1)
        del res['ucid_no_colon']

    # Parse the date
    if res.get('download_time'):
        res['download_time'] = datetime.strptime(res['download_time'],
                                                 FMT_TIME_FNAME)

    if res and parse_ucid_data:
        parsed_ucid = dtools.parse_ucid(res['ucid'])
        res['ucid_data'] = {
            'court': parsed_ucid['court'],
            **decompose_caseno(parsed_ucid['case_no'])
        }
    return res
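
A minimal sketch using the filename from the docstring (FMT_TIME_FNAME and dtools are assumed to come from the surrounding module):

# Hypothetical usage: pull the metadata out of a document filename
meta = parse_document_fname('ilnd;;1-16-cv-11315_20_u7905a347_t201007.pdf')
# meta['ucid']      -> 'ilnd;;1:16-cv-11315'
# meta['index']     -> '20'
# meta['user_hash'] -> '7905a347'
# meta['ext']       -> 'pdf'
# meta['download_time'] is parsed to a datetime via FMT_TIME_FNAME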
Example no. 4
def get_expected_path(ucid,
                      ext='json',
                      pacer_path=settings.PACER_PATH,
                      def_no=None):
    '''
    Find the expected path of the json or html file for the case

    Inputs:
        - ucid (str): case ucid
        - ext (str): 'json' or 'html'
        - pacer_path (Path): path to pacer data directory
        - def_no (str or int): the defendant no., if specifying a defendant-specific docket
    Output:
        (Path) the path to where the file should exist (regardless of whether it does or not)
    '''
    if ext not in ('html', 'json'):
        raise ValueError("ext must be either 'html' or 'json'")

    ucid_data = dtools.parse_ucid(ucid)
    court, case_no = ucid_data['court'], ucid_data['case_no']
    fname = generate_docket_filename(case_no, ext=ext, def_no=def_no)

    return pacer_path / court / ext / fname
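
A hedged usage sketch for this simpler variant (note it has no year_part subdirectory, unlike Example no. 2):

# Hypothetical usage: expected location of the case's html docket
html_path = get_expected_path('ilnd;;1:16-cv-02872', ext='html')
print(html_path)  # <pacer_path>/ilnd/html/<generated filename>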
Example no. 5
def parse_docket_input(query_results, docket_input, case_type, court):
    '''
    Figure out the input for the docket module
    Inputs:
        - query_results (list): list of query results (paths to htmls) from the query module; will be [] if the query scraper didn't run
        - docket_input (Path): the docket input argument
        - case_type (str)
        - court (str): court abbreviation
    Outputs:
        - input_data (list of dicts): data to be fed into the docket scraper
            [{'case_no': 'caseA', 'latest_date': '...'}, ...] ('latest_date' may or may not be present)
    '''
    if len(query_results) == 0 and docket_input is None:
        raise ValueError('Please provide a docket_input')

    # Check all html scenarios first
    is_html = False

    # If there are results from query scraper, use those
    if len(query_results):
        is_html = True
        query_htmls = query_results

    elif not docket_input.exists():
        logging.info(f'docket_input does not exist ({docket_input})')
        return []
    # If the input is a directory, get all query htmls in directory
    elif docket_input.is_dir():
        is_html = True
        query_htmls = list(docket_input.glob('*.html'))

    # If single html query, then singleton list for query_htmls
    elif docket_input.suffix == '.html':
        is_html = True
        query_htmls = [docket_input]

    # If any of the html scenarios reached, build case list from query_htmls
    if is_html:
        case_nos = build_case_list_from_queries(query_htmls, case_type, court)
        input_data = [{'case_no': cn} for cn in case_nos]

    # CSV case
    else:
        if docket_input.suffix != '.csv':
            raise ValueError('Cannot interpret docket_input')
        df = pd.read_csv(docket_input, dtype={'def_no': str})

        # Parse ucid and create court and case_no columns
        df = df.assign(**dtools.parse_ucid(df.ucid))
        # Restrict to just ucids in this court and drop duplicates
        df.query("court==@court", inplace=True)
        df.drop_duplicates('case_no', inplace=True)

        # Fill na for def_no before to_dict
        if 'def_no' in df.columns:
            df['def_no'] = df['def_no'].fillna('')

        # Keep just case_no and get latest_date if it's there
        keepcols = [
            col for col in ('case_no', 'latest_date', 'def_no')
            if col in df.columns
        ]

        input_data = df[keepcols].to_dict('records')

    return input_data
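
A hedged usage sketch of the CSV branch (the file name and case_type value are illustrative; the CSV is assumed to have a 'ucid' column plus optional 'def_no'/'latest_date' columns, as read above):

from pathlib import Path

# Hypothetical usage: no query results, so a CSV of ucids drives the docket scraper
input_data = parse_docket_input(
    query_results=[],
    docket_input=Path('cases_to_scrape.csv'),  # hypothetical file
    case_type='cv',                            # illustrative value
    court='ilnd',
)
# -> e.g. [{'case_no': '1:16-cv-02872', 'def_no': ''}, ...]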