Esempio n. 1
0
def get_records(count=1, timeout=60):
    """Retrieve the requested number of records.

    Submits a wildcard search against the ``Clinton_Email`` collection and
    decodes the JSON payload the server sends back.

    Keyword Arguments:
        count (int): The number of records to retrieve. Sent to the server
            as the ``limit`` query parameter.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight through to ``requests.get``. Pass ``None`` to wait
            indefinitely.

    Returns:
        dict: The retrieved records, as decoded from the JSON response.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
        ValueError: If the cleaned response body is not valid JSON.
    """
    url = get_foia_url('Search/SubmitSimpleQuery')
    params = {
        'collectionMatch': 'Clinton_Email',
        'searchText': '*',
        # NOTE(review): the literal string 'false' presumably tells the
        # endpoint that a filter is unset -- confirm against the API.
        'beginDate': 'false',
        'endDate': 'false',
        'postedBeginDate': 'false',
        'postedEndDate': 'false',
        'caseNumber': 'false',
        'page': 1,
        'start': 0,
        'limit': count,
    }

    # SSL certificate verification fails. To get around this,
    # ignore verification of the SSL certificate. Without an explicit
    # timeout the call could block forever on a stalled connection.
    response = requests.get(url, params=params, verify=False, timeout=timeout)

    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as if it were the record listing.
    response.raise_for_status()

    # The raw body is passed through clean_timestamps() before decoding;
    # presumably it contains timestamp values that are not valid JSON as
    # sent -- confirm against that helper.
    text = clean_timestamps(response.text)
    records = json.loads(text)

    return records
Esempio n. 2
0
def download(email, timeout=60):
    """Process the provided dictionary of email metadata.

    Download the corresponding PDF and extract plain text from it.

    The provided dictionary is updated in place, but only once the download
    and text extraction have succeeded. If any step raises, ``email`` is left
    exactly as it was passed in, so the call can safely be retried.

    Arguments:
        email (dict): A dictionary of email metadata. For example,
            {
                'from': 'H',
                'pdfLink': 'DOCUMENTS/HRCEmail_August_Web/IPS-0128/DOC_0C05775316/C05775316.pdf',
                'docDate': 1277956800000,
                'documentClass': 'Clinton_Email_August_Release',
                'messageNumber': '',
                'to': 'preines',
                'caseNumber': 'F-2014-20439',
                'subject': 'TEST',
                'originalLink': None,
                'postedDate': 1440993600000
            }

    Keyword Arguments:
        timeout (float or tuple): Seconds to wait for the server, passed
            straight through to ``requests.get``. Pass ``None`` to wait
            indefinitely.

    Returns:
        dict: Containing the provided metadata, transformed if necessary,
            in addition to text from the downloaded PDF. The ``docDate``,
            ``postedDate`` and ``pdfLink`` keys are replaced by ``sent``,
            ``pdf_posted`` and ``pdf_link``; ``document_id``, ``pdf_path``,
            ``body`` and ``is_redacted`` are added.
        None: If the sender is not in ``INTERESTING_SENDERS``; nothing is
            downloaded in that case.

    Raises:
        KeyError: If ``email`` lacks one of the ``from``, ``docDate``,
            ``postedDate`` or ``pdfLink`` keys.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
    """
    if email['from'] not in INTERESTING_SENDERS:
        return None

    # TODO: These timestamps only give dates, not times. However, the emails
    # themselves contain dates and times. Extract these.
    sent = datetime_from_timestamp(email['docDate'])
    pdf_posted = datetime_from_timestamp(email['postedDate'])

    # TODO: Don't download the email if it's present on disk. Return None
    # so that a duplicate record isn't written to the database.
    url = get_foia_url(email['pdfLink'])

    # SSL certificate verification fails. To get around this,
    # ignore verification of the SSL certificate. Without an explicit
    # timeout the call could block forever on a stalled connection.
    response = requests.get(url, verify=False, timeout=timeout)

    # Refuse to store an HTTP error page on disk as though it were the PDF.
    response.raise_for_status()
    pdf = response.content

    filename = get_filename(url)
    pdf_path, text = save_and_extract(filename, pdf)
    body, is_redacted = get_body(text)

    # Every fallible step has succeeded; only now rewrite the caller's dict.
    # Raw keys are dropped first and the derived ones added in a fixed order
    # so the resulting key order does not depend on the input.
    for raw_key in ('docDate', 'postedDate', 'pdfLink'):
        del email[raw_key]

    email['sent'] = sent
    email['pdf_posted'] = pdf_posted
    email['pdf_link'] = url
    email['document_id'] = filename
    email['pdf_path'] = pdf_path
    email['body'] = body
    email['is_redacted'] = is_redacted

    return email