#!/usr/bin/env python """This module does blah blah.""" import datetime import time import urllib.parse from newspaper import Article from newspaper.article import ArticleException from gensim.summarization import keywords as gen_kwds from common import airtab_articles as airtab, wrap_from_module wrap_it_up = wrap_from_module('reading_list/muh_news.py') def scrape_pages(): t0, i = time.time(), 0 # records = airtab.get_all(view='needs news_scraper.py') records = airtab.get_all(formula="script_ran = ''") for record in records: this_dict = {} url = record['fields']['clean url'] this_dict['script_ran'] = time.strftime('%c') try: article = Article(url) article.download() article.parse() this_dict['author2'] = ', '.join(article.authors) this_dict['body'] = article.text article.nlp() this_dict['art_kwds'] = ', '.join(article.keywords) this_dict['excerpt2'] = article.summary i += 1
['kcdc', 'bk'], ['ccdc', 'bk'], ['acdc', 'bk'], ['hcdc', 'bk'], ['jcdc', 'bk']] def ensure_dir(dir_path): """Create a directory at the given path, including parents. Raises exception if path specifies a file, but not if directory exists. """ Path(dir_path).mkdir(parents=True, exist_ok=True) wrap_it_up = wrap_from_module('jail_scrapers/pdf_stuff.py') def web_to_pdf(): # filters for recently verified intakes w/out dc_id. # for records meeting that criteria, create pdf & store locally t0, i = time.time(), 0 # pdf_formula = "AND(dc_id = '', hours_since_verification < 6, jail != 'jcj')" records = airtab.get_all(view='needs pdf') i = len(records) for record in records: url = record['fields']['link'] jail = record['fields']['jail'] if jail in {'mcdc', 'prcdf', 'lcdc', 'jcadc'}: fn = f"./output/{jail}/{record['fields']['intake_number']}.pdf" else:
#!/usr/bin/env python import random import sys import time import urllib.parse from datetime import date, datetime, timezone import requests from bs4 import BeautifulSoup from nameparser import HumanName import standardize from common import airtab_intakes as airtab from common import muh_headers, wrap_from_module wrap_it_up = wrap_from_module('jail_scrapers/scrapers.py') def get_name(raw_name, this_dict): name = HumanName(raw_name) name.capitalize() this_dict['first_name'] = name.first this_dict['last_name'] = name.last this_dict['middle_name'] = name.middle this_dict['suffix'] = name.suffix def update_record(this_dict, soup, m, lea_parser=None, raw_lea=''): if this_dict['recent_text'] != m['fields']['recent_text']: this_dict['updated'] = True this_dict['html'] = soup.prettify()
#!/usr/bin/env python """This module imports each item from my Safari readinglist into an Airtable base. Aftwerwards, I manually remove all links from the readinglist. Ideally, the code is run a couple times a week. From the command line, it's: python3 ~/code/reading_list/get_reading_list.py """ import os import plistlib import time from common import airtab_articles as airtab, wrap_from_module wrap_it_up = wrap_from_module('reading_list/get_reading_list.py') input_file = os.path.join(os.environ['HOME'], 'Library/Safari/Bookmarks.plist') t0 = time.time() def get_reading_list(): with open(input_file, 'rb') as plist_file: plist = plistlib.load(plist_file) children = plist['Children'] for child in children: if child.get('Title', None) == 'com.apple.ReadingList': bookmarks = child.get('Children') return bookmarks def parse_reading_list(bookmarks): new_links = 0 if bookmarks:
# !/usr/bin/env python """This module does blah blah.""" import re import time import unicodedata from io import BytesIO import requests from PyPDF2 import PdfFileReader from common import airtab_mdoc, airtab_mdoc2, dc, tw, muh_headers, wrap_from_module wrap_it_up = wrap_from_module('mdoc_scraper/mdoc_covid.py') def tweet_it(obj, tweet_txt): media_ids = [] image_list = obj.normal_image_url_list[:2] for image in image_list: r = requests.get(image) r.raise_for_status() uploadable = BytesIO(r.content) response = tw.upload_media(media=uploadable) media_ids.append(response['media_id']) tweet = tw.update_status(status=tweet_txt, media_ids=media_ids) return tweet['id_str'] def web_to_dc(this_dict):
# !/usr/bin/env python """This module does blah blah.""" import time from io import BytesIO import requests from common import airtab_sos as airtab, dc, tw, muh_headers, wrap_from_module wrap_it_up = wrap_from_module('sos_scraper.py') def get_images(dc_id): media_ids = [] obj = dc.documents.get(dc_id) image_list = obj.normal_image_url_list[:4] for image in image_list: res = requests.get(image) res.raise_for_status() uploadable = BytesIO(res.content) response = tw.upload_media(media=uploadable) media_ids.append(response['media_id']) return media_ids def scrape_exec_orders(): """This function does blah blah.""" t0, new, total = time.time(), 0, 0 url = 'https://www.sos.ms.gov/content/executiveorders/EOFunctions.asmx/ListExecutiveOrders' r = requests.post(url, headers=muh_headers) annoying_blob = r.json()['d'] rows = annoying_blob.replace('~~', '').split('^') for row in rows[:10]:
# !/usr/bin/env python """This module does blah blah.""" import csv import time import requests import tweepy from common import airtab_homicides_by_cop as airtab, tw, wrap_from_module wrap_it_up = wrap_from_module(module='police_shootings.py') def wapo_fatal_shootings_by_ms_leos(): """This function does blah blah.""" t0, i = time.time(), 0 ms_list = [] url = 'https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv' with requests.Session() as s: r = s.get(url) data = r.content.decode('utf-8') csv_reader = csv.reader(data.splitlines(), delimiter=',') full_list = list(csv_reader) for row in full_list: if row[9] == "MS": ms_list.append(row) print(f'total ms fatalities: {len(ms_list)}') for row in ms_list: this_dict = {} this_dict['id'] = row[0] this_dict['name'] = row[1] this_dict['date'] = row[2]
#!/usr/bin/env python """This module accesses several airtable 'views' that contain records that need some additional processing.""" import re import time import requests from bs4 import BeautifulSoup from cloudinary import uploader from common import airtab_intakes as airtab from common import cloudinary, dc, wrap_from_module wrap_it_up = wrap_from_module('jail_scrapers/polish_data.py') def polish_data(): """This function does runs each of the module's functions.""" get_pixelated_mug() update_summary() get_charges_from_recent_text() retry_getting_mugshot() remove_weird_character() parse_charge_1() fix_charges_to_by_lines() get_full_text() get_all_intake_deets() update_dc_fields() def get_pixelated_mug():