Example #1
0
#!/usr/bin/env python
"""This module does blah blah."""
import datetime
import time
import urllib.parse
from newspaper import Article
from newspaper.article import ArticleException
from gensim.summarization import keywords as gen_kwds
from common import airtab_articles as airtab, wrap_from_module

wrap_it_up = wrap_from_module('reading_list/muh_news.py')


def scrape_pages():
    t0, i = time.time(), 0
    # records = airtab.get_all(view='needs news_scraper.py')
    records = airtab.get_all(formula="script_ran = ''")
    for record in records:
        this_dict = {}
        url = record['fields']['clean url']
        this_dict['script_ran'] = time.strftime('%c')
        try:
            article = Article(url)
            article.download()
            article.parse()
            this_dict['author2'] = ', '.join(article.authors)
            this_dict['body'] = article.text
            article.nlp()
            this_dict['art_kwds'] = ', '.join(article.keywords)
            this_dict['excerpt2'] = article.summary
            i += 1
Example #2
0
             ['kcdc', 'bk'],
             ['ccdc', 'bk'],
             ['acdc', 'bk'],
             ['hcdc', 'bk'],
             ['jcdc', 'bk']]


def ensure_dir(dir_path):
    """Make sure *dir_path* exists as a directory, creating parent dirs as needed.

    Existing directories are left untouched (no error); a regular file
    already sitting at the path still raises.
    """
    target = Path(dir_path)
    target.mkdir(parents=True, exist_ok=True)


wrap_it_up = wrap_from_module('jail_scrapers/pdf_stuff.py')


def web_to_pdf():
    # filters for recently verified intakes w/out dc_id.
    # for records meeting that criteria, create pdf & store locally
    t0, i = time.time(), 0
    # pdf_formula = "AND(dc_id = '', hours_since_verification < 6, jail != 'jcj')"
    records = airtab.get_all(view='needs pdf')
    i = len(records)
    for record in records:
        url = record['fields']['link']
        jail = record['fields']['jail']
        if jail in {'mcdc', 'prcdf', 'lcdc', 'jcadc'}:
            fn = f"./output/{jail}/{record['fields']['intake_number']}.pdf"
        else:
Example #3
0
#!/usr/bin/env python
import random
import sys
import time
import urllib.parse
from datetime import date, datetime, timezone

import requests
from bs4 import BeautifulSoup
from nameparser import HumanName

import standardize
from common import airtab_intakes as airtab
from common import muh_headers, wrap_from_module

wrap_it_up = wrap_from_module('jail_scrapers/scrapers.py')


def get_name(raw_name, this_dict):
    """Parse *raw_name* into capitalized name parts and store them in *this_dict*.

    Mutates *this_dict* in place, setting the keys 'first_name',
    'last_name', 'middle_name', and 'suffix'; returns nothing.
    """
    parsed = HumanName(raw_name)
    parsed.capitalize()
    this_dict.update({
        'first_name': parsed.first,
        'last_name': parsed.last,
        'middle_name': parsed.middle,
        'suffix': parsed.suffix,
    })


def update_record(this_dict, soup, m, lea_parser=None, raw_lea=''):
    if this_dict['recent_text'] != m['fields']['recent_text']:
        this_dict['updated'] = True
        this_dict['html'] = soup.prettify()
Example #4
0
#!/usr/bin/env python
"""This module imports each item from my Safari readinglist into an Airtable base.
Afterwards, I manually remove all links from the readinglist.
Ideally, the code is run a couple times a week. From the command line, it's:
python3 ~/code/reading_list/get_reading_list.py
"""
import os
import plistlib
import time
from common import airtab_articles as airtab, wrap_from_module

wrap_it_up = wrap_from_module('reading_list/get_reading_list.py')

input_file = os.path.join(os.environ['HOME'], 'Library/Safari/Bookmarks.plist')

t0 = time.time()


def get_reading_list(plist_path=None):
    """Return the Safari Reading List entries from a Bookmarks.plist file.

    Args:
        plist_path: path to a Safari Bookmarks.plist. Defaults to the
            module-level ``input_file`` (the current user's Safari plist),
            preserving the original zero-argument call.

    Returns:
        The list of bookmark dicts under the 'com.apple.ReadingList'
        section, or None if no such section exists.
    """
    path = input_file if plist_path is None else plist_path
    with open(path, 'rb') as plist_file:
        plist = plistlib.load(plist_file)
    for child in plist['Children']:
        if child.get('Title') == 'com.apple.ReadingList':
            return child.get('Children')
    # Explicit None when the Reading List section is absent (was implicit).
    return None


def parse_reading_list(bookmarks):
    new_links = 0
    if bookmarks:
Example #5
0
#!/usr/bin/env python
"""This module does blah blah."""
import re
import time
import unicodedata

from io import BytesIO

import requests

from PyPDF2 import PdfFileReader

from common import airtab_mdoc, airtab_mdoc2, dc, tw, muh_headers, wrap_from_module

wrap_it_up = wrap_from_module('mdoc_scraper/mdoc_covid.py')


def tweet_it(obj, tweet_txt):
    """Post *tweet_txt* with up to two images attached from *obj*.

    Downloads the first two of obj.normal_image_url_list, uploads each to
    Twitter, then sends the status update. Returns the tweet's id string.
    """
    media_ids = []
    for image_url in obj.normal_image_url_list[:2]:
        resp = requests.get(image_url)
        resp.raise_for_status()
        upload = tw.upload_media(media=BytesIO(resp.content))
        media_ids.append(upload['media_id'])
    posted = tw.update_status(status=tweet_txt, media_ids=media_ids)
    return posted['id_str']


def web_to_dc(this_dict):
Example #6
0
#!/usr/bin/env python
"""This module does blah blah."""
import time
from io import BytesIO
import requests
from common import airtab_sos as airtab, dc, tw, muh_headers, wrap_from_module

wrap_it_up = wrap_from_module('sos_scraper.py')


def get_images(dc_id):
    """Upload up to four page images of DocumentCloud doc *dc_id* to Twitter.

    Fetches the document, downloads the first four of its
    normal_image_url_list, uploads each, and returns the list of media ids.
    """
    doc = dc.documents.get(dc_id)
    ids = []
    for image_url in doc.normal_image_url_list[:4]:
        page = requests.get(image_url)
        page.raise_for_status()
        upload = tw.upload_media(media=BytesIO(page.content))
        ids.append(upload['media_id'])
    return ids


def scrape_exec_orders():
    """This function does blah blah."""
    t0, new, total = time.time(), 0, 0
    url = 'https://www.sos.ms.gov/content/executiveorders/EOFunctions.asmx/ListExecutiveOrders'
    r = requests.post(url, headers=muh_headers)
    annoying_blob = r.json()['d']
    rows = annoying_blob.replace('~~', '').split('^')
    for row in rows[:10]:
Example #7
0
#!/usr/bin/env python
"""This module does blah blah."""
import csv
import time
import requests
import tweepy

from common import airtab_homicides_by_cop as airtab, tw, wrap_from_module

wrap_it_up = wrap_from_module(module='police_shootings.py')


def wapo_fatal_shootings_by_ms_leos():
    """This function does blah blah."""
    t0, i = time.time(), 0
    ms_list = []
    url = 'https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv'
    with requests.Session() as s:
        r = s.get(url)
        data = r.content.decode('utf-8')
        csv_reader = csv.reader(data.splitlines(), delimiter=',')
        full_list = list(csv_reader)
        for row in full_list:
            if row[9] == "MS":
                ms_list.append(row)
    print(f'total ms fatalities: {len(ms_list)}')
    for row in ms_list:
        this_dict = {}
        this_dict['id'] = row[0]
        this_dict['name'] = row[1]
        this_dict['date'] = row[2]
Example #8
0
#!/usr/bin/env python
"""This module accesses several airtable 'views' that contain records that need some additional processing."""
import re
import time

import requests

from bs4 import BeautifulSoup
from cloudinary import uploader

from common import airtab_intakes as airtab
from common import cloudinary, dc, wrap_from_module

wrap_it_up = wrap_from_module('jail_scrapers/polish_data.py')


def polish_data():
    """Run each of the module's post-processing functions, in order.

    Each helper pulls records from its own Airtable view and updates them;
    this is just the sequential driver, so ordering is preserved as written.
    """
    get_pixelated_mug()
    update_summary()
    get_charges_from_recent_text()
    retry_getting_mugshot()
    remove_weird_character()
    parse_charge_1()
    fix_charges_to_by_lines()
    get_full_text()
    get_all_intake_deets()
    update_dc_fields()


def get_pixelated_mug():