def get_pixelated_mug():
    """Attach a pixelated (blurred) mugshot to Airtable records.

    Uploads each record's raw PHOTO to Cloudinary with a heavy blur and
    stores the resulting secure URL in the record's PIXELATED_IMG field.
    Only records with a PHOTO, no PIXELATED_IMG yet, verification within
    the last 24 hours, and not from 'jcdc' are processed.
    """
    t0, i = time.time(), 0
    needs_pix_img_formula = "AND(PHOTO != '', PIXELATED_IMG = '', hours_since_verification < 24, jail != 'jcdc')"
    records = airtab.get_all(formula=needs_pix_img_formula)
    for record in records:
        this_dict = {}
        url = record["fields"]["PHOTO"][0]["url"]
        r = requests.get(url)
        content_type = r.headers['Content-Type']
        if content_type == 'image/jpeg':
            try:
                upload_response = uploader.upload(url, opacity=40, effect="blur:400")
                time.sleep(1)
                this_dict["PIXELATED_IMG"] = [{
                    "url": upload_response['secure_url']
                }]
                airtab.update(record['id'], this_dict)
                # BUG FIX: i was never incremented, so wrap_it_up always
                # reported 0 new pixelated images.
                i += 1
            except cloudinary.exceptions.Error as err1:
                print("cloudinary can't accept that shit: ", err1)
            except AttributeError as err2:
                print('Attribute Error for cloudinary upload: ', err2)
        else:
            print('this shit was some really weird content type:', content_type)
    wrap_it_up(t0, new=i, total=len(records), function='get_pixelated_mug')
def update_record(this_dict, soup, m, lea_parser=None, raw_lea=''):
    """Write *this_dict* to the matched Airtable record *m*.

    When the freshly scraped recent_text differs from what is already on
    file, the record is flagged as updated, its stored HTML is refreshed,
    and (if a parser is supplied) the arresting agency is re-standardized.

    Args:
        this_dict: field payload for the update; must contain 'recent_text'.
        soup: BeautifulSoup node whose prettified HTML is stored on change.
        m: the matched Airtable record (dict with 'id' and 'fields').
        lea_parser: optional standardize.*_lea callable.
        raw_lea: raw agency string fed to lea_parser.
    """
    if this_dict['recent_text'] != m['fields']['recent_text']:
        this_dict['updated'] = True
        this_dict['html'] = soup.prettify()
        if lea_parser:
            # BUG FIX: the parser's return value was previously discarded;
            # store the standardized agency, matching how the scrapers use
            # standardize.*_lea elsewhere in this module.
            this_dict['LEA'] = lea_parser(raw_lea)
    airtab.update(m['id'], this_dict, typecast=True)
def retry_getting_mugshot():
    """Re-scrape intake pages that still lack a PHOTO.

    Each jail publishes its mugshot differently, so there is one branch
    per jail. Only records verified within the last 12 hours (and not
    from 'lcdc') are retried; every processed record gets its dict pushed
    back to Airtable even when no photo was found.
    """
    t0, i = time.time(), 0
    needs_pic_formula = "AND(PHOTO = '', hours_since_verification < 12, jail != 'lcdc')"
    records = airtab.get_all(formula=needs_pic_formula)
    print("we're gonna retry getting mugs for", len(records), "records...")
    for record in records:
        this_dict = {}
        r = requests.get(record['fields']['link'])
        if record['fields']['jail'] == 'jcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            img_tag = soup.find('div', class_='inmate_profile_image').img
            # (sic) the site really misspells it 'Availble'
            if img_tag['alt'] != 'Image Not Availble':
                this_dict['img_src'] = f"https://www.jonesso.com/{img_tag['src']}"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif record['fields']['jail'] == 'hcdc':
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                img_src = 'http://www.co.hinds.ms.us' + soup.find(
                    'img', {'align': 'middle'})['src']
                if requests.get(img_src).headers['Content-Type'] == 'image/jpeg':
                    this_dict['img_src'] = img_src
                    this_dict['PHOTO'] = [{'url': img_src}]
                else:
                    print('image source isn\'t actually an image')
            except TypeError:
                print('no img tag in intake html')
        elif record['fields']['jail'] == 'kcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            # The page may lack a cms-content div entirely.
            if soup is None:
                continue
            if soup.img:
                img_src_raw = soup.img['src']
                if img_src_raw.startswith(
                        'templates/kempercountysheriff.com/images/inmates'):
                    this_dict['img_src'] = f"https://www.kempercountysheriff.com/{img_src_raw}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif record['fields']['jail'] == 'acdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(
                'div', class_='blog-content-container')
            try:
                img_tag = soup.find('img')
                this_dict['img_src'] = img_tag.get('src')
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
            except AttributeError:
                # no container div or no img tag in intake html
                continue
        else:
            print(
                f"awww hell... this one is from the {record['fields']['jail']} docket/scraper..."
            )
        airtab.update(record['id'], this_dict)
        # BUG FIX: i was never incremented; count records that actually
        # picked up a mugshot on this retry.
        if 'PHOTO' in this_dict:
            i += 1
    wrap_it_up(t0, new=i, total=len(records), function='retry_getting_mugshot')
def get_all_intake_deets():
    """Parse demographics and charges from recent_text for jcadc records.

    The jcadc roster text splits into a demographic chunk and a criminal-
    details chunk at the 'Request Victim Notification' divider. Each
    demographic field is optional on the page, so every regex lookup is
    guarded; missing fields are logged and skipped.
    """
    t0, i = time.time(), 0
    jcadc_deets_formula = "AND(jail = 'jcadc', charges = '', recent_text != '')"
    records = airtab.get_all(formula=jcadc_deets_formula, fields='recent_text')
    for record in records:
        charges = []
        bond_ammts = []
        classifications = []
        this_dict = {}
        txt_str = record['fields']['recent_text']
        chunks = txt_str.split('\nRequest Victim Notification\n')
        match_1 = re.search(r"(\w+)\s+(Male|Female)", chunks[0])
        try:
            raw_race = match_1.group(1)
            if raw_race == 'AVAILABLE':
                # Page said "NOT AVAILABLE" — record race as unknown.
                this_dict['race'] = 'U'
            else:
                this_dict['race'] = raw_race[0]
            this_dict['sex'] = match_1.group(2)[0]
        except AttributeError:
            print('there isnt race/sex info')
        try:
            this_dict['intake_weight'] = re.search(r"(\d+) Pounds",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('there isnt weight info')
        try:
            this_dict['intake_height'] = re.search(r"(\d Ft. \d+ In.)",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('idk how tall this person is')
        try:
            this_dict['intake_eye'] = re.search(r"(\w+)\s+Eyes",
                                                chunks[0]).group(1)
        except AttributeError:
            print('eye color is a mystery')
        try:
            # BUG FIX: this lookup was the only unguarded one; a missing
            # age line raised AttributeError and killed the whole run.
            this_dict['intake_age'] = re.search(r"(\d\d) Years Old",
                                                chunks[0]).group(1)
        except AttributeError:
            print('no age listed')
        crim_details = chunks[1].splitlines()
        for ln in crim_details:
            results = re.search(r"([MF]\w+) - Bond: (\$.*)", ln)
            if results:
                bond_ammts.append(results.group(2))
                classifications.append(results.group(1))
            elif ', ' in ln:
                # Quote charges containing commas so the comma-joined
                # field stays unambiguous.
                charges.append(f"\"{ln}\"")
            else:
                charges.append(ln)
        this_dict['charges'] = ', '.join(charges)
        this_dict['bond_ammounts'] = '\n'.join(bond_ammts)
        this_dict['charge_classifications'] = ', '.join(classifications)
        airtab.update(record['id'], this_dict, typecast=True)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_all_intake_deets')
def pdf_to_dc(quiet=True):
    """Upload locally stored intake PDFs to DocumentCloud and sync the
    resulting metadata (id, title, URLs, full text) back to Airtable.

    For each jail's output directory, every *.pdf is uploaded, polled
    until processing succeeds, made public, matched to its Airtable
    record by title, and then either archived (lcdc) or trashed.

    Args:
        quiet: when False, print per-jail and per-file progress.
    """
    # filters for recently verified intakes w/out dc_id.
    # for records meeting that criteria, create pdf & store locally
    t0, i = time.time(), 0
    for jail in jails_lst:
        # NOTE(review): jail appears to be a (slug, match-field) pair —
        # jail[0] names the directory, jail[1] the Airtable match column.
        if not quiet:
            print(f"checking {jail[0]}. . .")
        output_path = os.path.join("output", jail[0])
        try:
            ensure_dir(output_path)
        except NotADirectoryError as err:
            print(f"Skipping {jail[0]}: {err}")
            continue
        for fn in glob.glob(os.path.join(output_path, '*.pdf')):
            if not quiet:
                print(f"uploading {fn} . . .")
            try:
                obj = dc.documents.upload(fn)
            except requests.exceptions.ReadTimeout:
                time.sleep(7)
                continue
            # Poll until DocumentCloud finishes processing the upload.
            obj = dc.documents.get(obj.id)
            while obj.status != 'success':
                time.sleep(7)
                obj = dc.documents.get(obj.id)
            obj.access = "public"
            this_dict = {"jail": jail[0]}
            obj.data = this_dict
            obj.put()
            this_dict["dc_id"] = str(obj.id)
            print(f"successfully uploaded {obj.id}. . .")
            this_dict["dc_title"] = obj.title
            this_dict["dc_access"] = obj.access
            this_dict["dc_pages"] = obj.pages
            this_dict["PDF"] = obj.pdf_url
            this_dict["dc_canonical_url"] = obj.canonical_url
            this_dict["dc_resources_page_image"] = obj.normal_image_url
            # full_text may come back as bytes or already-decoded str.
            try:
                full_text = obj.full_text.decode("utf-8")
            except AttributeError as err:
                full_text = obj.full_text
            # Drop blank lines from the extracted text.
            this_dict["dc_full_text"] = os.linesep.join([s for s in full_text.splitlines() if s])
            # record = airtab.match(jail[1], this_dict["dc_title"], view='needs pdf')
            record = airtab.match(jail[1], this_dict["dc_title"], sort=[('dc_id', 'asc'), ('initial_scrape', 'desc')])
            airtab.update(record["id"], this_dict, typecast=True)
            if jail[0] == 'lcdc':
                # HACK: machine-specific archive path for Lee County PDFs.
                os.rename(fn, f'/Users/blakefeldman/code/daily-journal-jail-data/pdfs/lee/{this_dict["dc_title"]}.pdf')
            else:
                send2trash.send2trash(fn)
            i += 1
            time.sleep(7)
    # total=i because the work list is discovered on disk, not counted
    # up front.
    wrap_it_up(t0, new=i, total=i, function='pdf_to_dc')
def get_full_text():
    """Backfill dc_* metadata for records that have a DocumentCloud id
    but no stored full text yet."""
    started = time.time()
    processed = 0
    missing_text = airtab.get_all(
        formula="AND(dc_id != '', dc_full_text = '')", fields=['dc_id'])
    for rec in missing_text:
        doc = dc.documents.get(rec['fields']['dc_id'])
        payload = {
            "dc_title": doc.title,
            "dc_access": doc.access,
            "dc_pages": doc.pages,
            "dc_full_text": doc.full_text.decode("utf-8"),
        }
        airtab.update(rec["id"], payload)
        processed += 1
    wrap_it_up(started, new=processed, total=len(missing_text),
               function='get_full_text')
def update_dc_fields():
    """Refresh stored DocumentCloud URLs for up to 100 records sitting
    in the 'need dc urls updated' view."""
    stale = airtab.get_all(view='need dc urls updated', fields='dc_id',
                           max_records=100)
    print(len(stale), ' records need updated documentcloud URLs.')
    for rec in stale:
        doc = dc.documents.get(rec['fields'].get('dc_id'))
        fresh_urls = {
            "PDF": doc.pdf_url,
            "dc_canonical_url": doc.canonical_url,
            "dc_resources_page_image": doc.normal_image_url,
        }
        airtab.update(rec['id'], fresh_urls)
        # Gentle pacing to stay under the Airtable rate limit.
        time.sleep(.3)
def update_summary(this_many=150):
    """Copy each record's computed 'blurb' into the plain-text 'summary'
    field.

    The separate text field exists because the gallery view works better
    with a text field than with a formula field. The view is regularly
    packed full of records, so *this_many* caps a single run.
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view='needs updated summary', fields="blurb",
                             max_records=this_many)
    for record in records:
        this_dict = {}
        this_dict["summary"] = record["fields"]["blurb"]
        airtab.update(record["id"], this_dict)
        # BUG FIX: i was never incremented, so wrap_it_up always reported
        # 0 records updated.
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='update_summary')
def remove_weird_character():
    """Strip a stray 'ã' character and the remainder of its line from
    recent_text on records verified more than 12 hours ago."""
    t0, i = time.time(), 0
    remove_wierd_character_formula = "AND(hours_since_verification > 12, FIND('ã', recent_text) > 1)"
    records = airtab.get_all(formula=remove_wierd_character_formula,
                             fields='recent_text')
    for record in records:
        this_dict = {}
        text = record['fields']['recent_text']
        x = text.find('ã')
        y = text.find('\n', x)
        if y == -1:
            # BUG FIX: when the bad character sits on the final line there
            # is no trailing newline; find() returned -1 and the old slice
            # [x:-1] left the last character behind. Remove through the
            # end of the string instead.
            y = len(text)
        this_dict['recent_text'] = text.replace(text[x:y], '')
        airtab.update(record['id'], this_dict)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='remove_weird_character')
def fix_charges_to_by_lines():
    """Convert comma-separated charge strings into newline-delimited
    lists, honoring double-quoted charges that contain commas."""
    started, touched = time.time(), 0
    todo = airtab.get_all(
        formula="AND(TEST_FORMULA != '', TEST_RESULT = '')",
        fields='charges')
    for rec in todo:
        raw = rec['fields']['charges']
        # Quoted segments keep their internal commas: turn the separators
        # adjacent to quotes into newlines before splitting.
        segments = raw.replace('", ', '"\n').replace(', "', '\n"').splitlines()
        lines = []
        for segment in segments:
            if segment.startswith('"'):
                lines.append(segment.replace('"', ''))
            else:
                lines.extend(segment.split(', '))
        airtab.update(rec['id'], {'TEST_RESULT': '\n'.join(lines)})
        touched += 1
    wrap_it_up(started, new=touched, total=len(todo),
               function='fix_charges_to_by_lines')
def get_dor_if_possible(this_many=50):
    """Look for a release date on intake pages of jails that publish one.

    For eligible records (no DOR yet, verified 6-48 hours ago), fetch the
    intake page; when a 'Release Date:' label is present, archive a final
    PDF of the page and store the parsed DOR on the record.
    """
    started, found = time.time(), 0
    dor_formula = "AND(OR(jail = 'kcdc', jail = 'tcdc', jail = 'ccdc', jail = 'jcdc'), DOR = '', hours_since_verification > 6, hours_since_verification < 48)"
    candidates = airtab.get_all(formula=dor_formula, max_records=this_many)
    total = len(candidates)
    for rec in candidates:
        try:
            resp = requests.get(rec["fields"]["link"])
        except requests.ConnectionError as err:
            print(f"Skipping {rec['fields']['link']}: {err}")
            time.sleep(5)
            continue
        page = BeautifulSoup(resp.text, "html.parser")
        data = [str(s) for s in page.stripped_strings]
        if "Release Date:" not in data:
            continue
        pdf_options = {
            "quiet": "",
            "footer-font-size": 10,
            "footer-left": rec["fields"]["link"],
            "footer-right": time.strftime('%c'),
        }
        out_dir = f"./output/{rec['fields']['jail']}/updated"
        try:
            ensure_dir(out_dir)
            out_path = os.path.join(out_dir, f"{rec['fields']['bk']} (final).pdf")
            pdfkit.from_url(rec["fields"]["link"], out_path, options=pdf_options)
        except NotADirectoryError as err:
            # Best-effort archive: still record the DOR below.
            print(f"Can't write PDF: {err}")
        raw_dor = data[1 + data.index("Release Date:")]
        parsed = datetime.datetime.strptime(raw_dor, "%m-%d-%Y - %I:%M %p")
        airtab.update(rec["id"], {"DOR": parsed.strftime('%m/%d/%Y %H:%M')})
        found += 1
    wrap_it_up(started, found, total, function='get_dor_if_possible')
def parse_charge_1():
    """Split charge_1 into statute and title for mcdc/prcdf records.

    The raw field concatenates a statute and a charge title; the seam is
    the first ')' or digit immediately followed by an uppercase letter.
    Everything up to and including the ')' / digit becomes
    charge_1_statute, the rest charge_1_title.
    """
    t0, i = time.time(), 0
    needs_charge_1_parsed_formula = "AND(OR(jail = 'mcdc', jail = 'prcdf'), charge_1_statute = '', hours_since_initial_scrape < 48, charge_1 != '', charge_1 != 'HOLDHOLD', charge_1 != 'DRUGDRUG COURT', charge_1 != 'HLD Other AgencyHold for other Agency')"
    records = airtab.get_all(formula=needs_charge_1_parsed_formula)
    for record in records:
        this_dict = {}
        charge = record["fields"]["charge_1"]
        # PERF/idiom fix: each pattern used to be searched twice (once to
        # test, once to bind). Search each at most once; `or` keeps the
        # original precedence of ')X' over digit-letter.
        x = re.search(r"[)][A-Z]", charge) or re.search(r"[0-9][A-Z]", charge)
        if x:
            this_dict["charge_1_statute"] = charge[:x.start() + 1]
            this_dict["charge_1_title"] = charge[x.end() - 1:]
        try:
            airtab.update(record["id"], this_dict)
            i += 1
        except requests.exceptions.HTTPError as err:
            print(err)
            continue
    wrap_it_up(t0, new=i, total=len(records), function='parse_charge_1')
def ccj_scraper():
    """Scrape the Calhoun County jail roster page into Airtable.

    Each <td> in the roster table is one intake. Records are matched on
    recent_text: unseen intakes are inserted, known ones are updated and
    flagged as updated.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    main_url = 'http://www.calhounso.org/page.php?id=7'
    r = requests.get(main_url, headers=muh_headers)
    soup = BeautifulSoup(r.text, 'html.parser').find(id='cms_body_content')
    try:
        intakes = soup.table.tbody.find_all('td')
    except AttributeError:
        # No roster table on the page (or the container div is missing).
        return
    for intake in intakes:
        total_intakes += 1
        this_dict = {'jail': 'ccj'}
        data = []
        link = intake.find('a')
        if link:
            this_dict['link'] = link.get('href')
            # The booking number is the trailing offenderID of the
            # VINELink URL.
            this_dict['bk'] = this_dict['link'].replace(
                'http://www.vinelink.com/vinelink/servlet/SubjectSearch?siteID=25000&agency=18&offenderID=',
                '')
        for string in intake.stripped_strings:
            # Skip dot-leader filler strings like '.....'.
            if string.startswith('.') and string.endswith('.'):
                pass
            else:
                data.append(str(string))
        try:
            get_name(data[0], this_dict)
        except IndexError:
            print('skipping empty td')
            continue
        raw_doi = data[1]
        # Today's intakes get a real timestamp; older ones an end-of-day
        # placeholder.
        if raw_doi == date.today().strftime('%m/%d/%Y'):
            this_dict['DOI'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
        else:
            this_dict['DOI'] = f"{raw_doi} 11:59pm"
        remaining_data = ' '.join(data[2:])
        # Anything after the literal word 'Released' is the release date.
        DOR_index = remaining_data.find('Released')
        if DOR_index != -1:
            this_dict['charges'] = remaining_data[:DOR_index].strip()
            raw_dor = remaining_data[DOR_index + 8:].strip()
            if raw_dor == date.today().strftime('%m/%d/%Y'):
                this_dict['DOR'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
            else:
                this_dict['DOR'] = f"{raw_dor} 12:01am"
        else:
            this_dict['charges'] = remaining_data
        this_dict['recent_text'] = '\n'.join(data)
        this_dict['html'] = intake.prettify()
        this_dict['last_verified'] = (datetime.utcnow().replace(
            tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
        m = airtab.match('recent_text', this_dict['recent_text'], view='ccj')
        if not m:
            airtab.insert(this_dict, typecast=True)
            new_intakes += 1
        else:
            this_dict['updated'] = True
            airtab.update(m['id'], this_dict, typecast=True)
        # Gentle pacing between Airtable calls.
        time.sleep(0.2)
    wrap_it_up(function='ccj_scraper', t0=t0, new=new_intakes,
               total=total_intakes)
def jcadc_scraper():
    """Scrape the Jackson County jail docket (JSON API) into Airtable.

    First asks the API for the total intake count, derives the page
    count (15 intakes per page), then walks the pages in random order.
    Known bookings (matched on 'bk') just get re-verified; new ones are
    fully parsed (name, dates, agency, mugshot URL) and inserted.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    root = 'https://services.co.jackson.ms.us/jaildocket'
    r = requests.post(f"{root}/_inmateList.php?Function=count",
                      headers=muh_headers)
    total_intakes = r.json()
    last_page = int(total_intakes / 15)
    pages = range(1, last_page + 1)
    pages = list(pages)
    random.shuffle(
        pages
    )  # for some reason page 1 cannot be the first page visited sometimes
    for pg in pages:
        r = requests.post(
            f"{root}/_inmateList.php?Function=list&Page={pg}&Order=BookDesc&Search=0",
            headers=muh_headers)
        try:
            intakes = r.json()
        except ValueError as err:
            # Page returned non-JSON (error page); skip it.
            print(err)
            continue
        for intake in intakes:
            data = []
            this_dict = {'jail': 'jcadc', '_jail': ['recwShIgdZDcf4ZcJ']}
            this_dict['bk'] = intake["Book_Number"]
            this_dict['last_verified'] = (datetime.utcnow().replace(
                tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
            this_dict['intake_number'] = intake["ID_Number"].strip()
            this_dict[
                'link'] = f"{root}/inmate/_inmatedetails.php?id={this_dict['intake_number']}"
            r = requests.get(this_dict['link'], headers=muh_headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for string in soup.stripped_strings:
                data.append(string)
            # Drop the first string (page heading) from the stored text.
            this_dict['recent_text'] = '\n'.join(data[1:])
            m = airtab.match('bk', this_dict['bk'])
            if m:
                # Already on file: refresh text / verification stamp only.
                airtab.update(m['id'], this_dict, typecast=True)
            else:
                raw_name = (
                    f"{intake['Name_First_MI']} {intake['Name_Middle']} "
                    f"{intake['Name_Last']} {intake['Name_Suffix']}")
                get_name(raw_name, this_dict)
                raw_doi = intake["BookDate"]
                # Today's bookings get a real timestamp; older ones an
                # end-of-day placeholder.
                if raw_doi == date.today().strftime('%m/%d/%Y'):
                    this_dict['DOI'] = datetime.now().strftime(
                        '%m/%d/%Y %I:%M%p')
                else:
                    this_dict['DOI'] = f"{raw_doi} 11:59pm"
                this_dict['DOA'] = intake["ArrestDate"]
                this_dict['LEA'] = standardize.jcadc_lea(
                    intake["Arrest_Agency"])
                articles = soup.find_all('article')
                this_dict['html'] = (
                    "<html>\n<body>\n"
                    f"{articles[0].prettify()}\n{articles[1].prettify()}\n"
                    "</body>\n</html>")
                this_dict[
                    'img_src'] = f"{root}/inmate/{this_dict['intake_number']}.jpg"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
                airtab.insert(this_dict, typecast=True)
                new_intakes += 1
    wrap_it_up(function='jcadc_scraper', t0=t0, new=new_intakes,
               total=total_intakes)
def jcj_scraper():
    """Scrape the Jasper County roster and 48-hour release pages.

    Fields are extracted positionally from the label strings on each
    inmate card (e.g. the value after 'Arrest #:'). New bookings
    (matched on 'bk') are inserted with mugshot and standardized LEA;
    known ones are updated, flagged only when the text changed.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    urls = [
        'http://jasperso.com/inmate-roster/',
        'http://jasperso.com/48-hour-release/'
    ]
    for url in urls:
        r = requests.get(url, headers=muh_headers)
        soup = BeautifulSoup(r.text, 'html.parser').find('div', id='primary')
        intakes = soup.find_all('div', class_='col-sm-4 inmate')
        total_intakes += len(intakes)
        for x in intakes:
            this_dict = {'jail': 'jcj', '_jail': ['recwzuzsimZuPpVR5']}
            get_name(x.h1.string.strip(), this_dict)
            this_dict['link'] = url
            data = []
            for string in x.stripped_strings:
                data.append(str(string))
            # Each value sits immediately after its label string.
            this_dict['intake_number'] = data[1 + data.index('Arrest #:')]
            this_dict['bk'] = data[1 + data.index('Arrest #:')]
            raw_doi = data[1 + data.index('Arrest Date:')]
            # Today's intakes get a real timestamp; older ones an
            # end-of-day placeholder.
            if raw_doi == date.today().strftime('%m/%d/%Y'):
                this_dict['DOI'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
            else:
                this_dict['DOI'] = f"{raw_doi} 11:59pm"
            if 'Release Date:' in data:
                raw_dor = data[1 + data.index('Release Date:')]
                if raw_dor == date.today().strftime('%m/%d/%Y'):
                    this_dict['DOR'] = datetime.now().strftime(
                        '%m/%d/%Y %I:%M%p')
                else:
                    this_dict['DOR'] = f"{raw_dor} 12:01am"
            # Keep only the first letter of sex/race.
            this_dict['sex'] = data[1 + data.index('Gender:')].strip()[0:1]
            this_dict['race'] = data[1 + data.index('Race:')].strip()[0:1]
            this_dict['intake_age'] = int(data[1 + data.index('Age:')])
            cleaned_charges = []
            charges = data[1 + data.index('Charges:'):]
            for charge in charges:
                # Quote charges containing commas so the comma-joined
                # field stays unambiguous.
                if ', ' in charge:
                    cleaned_charge = f"\"{charge}\""
                else:
                    cleaned_charge = charge
                cleaned_charges.append(cleaned_charge)
            this_dict['charges'] = ', '.join(cleaned_charges)
            this_dict['recent_text'] = '\n'.join(data)
            this_dict['html'] = x.prettify()
            this_dict['last_verified'] = (datetime.utcnow().replace(
                tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
            raw_lea = data[1 + data.index('Arrest Agency:')]
            m = airtab.match('bk', this_dict['bk'], view='jcj')
            if not m:
                this_dict['img_src'] = x.find('img').get('src')
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
                this_dict['LEA'] = standardize.jcj_lea(raw_lea)
                airtab.insert(this_dict, typecast=True)
                new_intakes += 1
            else:
                if this_dict['recent_text'] != m['fields']['recent_text']:
                    this_dict['updated'] = True
                    this_dict['LEA'] = standardize.jcj_lea(raw_lea)
                else:
                    pass
                airtab.update(m['id'], this_dict, typecast=True)
            # Gentle pacing between Airtable calls.
            time.sleep(0.2)
    wrap_it_up(function='jcj_scraper', t0=t0, new=new_intakes,
               total=total_intakes)
def hcdc_scraper():
    """Scrape the paginated Hinds County inmate list into Airtable.

    Walks every page of the list; rows with exactly 7 cells are intakes.
    Known bookings (matched on 'bk') just get their last_verified stamp
    refreshed; new ones trigger a fetch of the detail page, which is
    parsed label-by-label into the record before insertion.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    main_url = 'http://www.co.hinds.ms.us/pgs/apps/inmate/inmate_list.asp'
    try:
        r = requests.get(main_url)
    except requests.ConnectionError as err:
        print(f"Skipping {main_url}: {err}")
        time.sleep(5)
        return
    if r.status_code == 500:
        print(r.text)
        return
    soup1 = BeautifulSoup(r.text, 'html.parser')
    # The 4th word of the page's <h3> header is the total page count —
    # presumably "... page X of N ..."; verify against the live page.
    total_pages = int(soup1.h3.string.split()[3])
    pages = list(range(1, total_pages + 1))
    for page in pages:
        param_str = f"name_sch=Date&SS1=1&ScrollAction=Page+{page}"
        r = requests.get(f"{main_url}?{param_str}")
        soup2 = BeautifulSoup(r.text, 'html.parser')
        rows = soup2.find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            # Inmate rows have exactly 7 cells; everything else is layout.
            if len(cells) == 7:
                total_intakes += 1
                this_dict = {'jail': 'hcdc', '_jail': ['recJLBoeZlp4IYn4I']}
                # Booking id is the detail link's ID query parameter.
                this_dict['bk'] = row.a.get('href').replace(
                    'inmate_detail.asp?ID=', '')
                this_dict['last_verified'] = (datetime.utcnow().replace(
                    tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
                m = airtab.match('bk', this_dict['bk'], view='hcdc',
                                 fields='recent_text')
                if m:
                    # Already on file: refresh the verification stamp only.
                    airtab.update(m['id'], this_dict)
                else:
                    this_dict['link'] = urllib.parse.urljoin(
                        main_url, row.a.get('href'))
                    try:
                        r = requests.get(this_dict['link'])
                    except requests.ConnectionError as err:
                        print(f"Skipping {this_dict['link']}: {err}")
                        time.sleep(5)
                        continue
                    intake_soup = BeautifulSoup(r.text, 'html.parser')
                    data = []
                    this_dict['html'] = intake_soup.find_all(
                        'table')[1].prettify()
                    for string in intake_soup.stripped_strings:
                        data.append(string)
                    try:
                        # Keep only the text between the 'Name' label and
                        # the disclaimer.
                        this_dict['recent_text'] = '\n'.join(
                            data[data.index('Name'):data.index('Disclaimer:')])
                    except ValueError:
                        this_dict['recent_text'] = ''
                    try:
                        # Each value sits immediately after its label.
                        get_name(data[1 + data.index('Name')], this_dict)
                        this_dict['intake_address_line_1'] = data[
                            1 + data.index('Address')]
                        this_dict['intake_address_line_2'] = data[
                            2 + data.index('Address')]
                        this_dict['DOB'] = data[1 + data.index('Date of Birth')]
                        this_dict['sex'] = data[1 + data.index('Sex')]
                        # When Race is blank the next label ('Height')
                        # slides into its slot; skip it in that case.
                        if data[1 + data.index('Race')] != 'Height':
                            this_dict['race'] = data[1 + data.index('Race')]
                        raw_doi = data[1 + data.index('Arrest Date')]
                        if raw_doi == date.today().strftime('%m/%d/%Y'):
                            this_dict['DOI'] = datetime.now().strftime(
                                '%m/%d/%Y %I:%M%p')
                        elif raw_doi == '//':
                            # Placeholder for a missing date; store nothing.
                            pass
                        else:
                            this_dict['DOI'] = f"{raw_doi} 11:59pm"
                        raw_lea = data[1 + data.index('Arresting Agency')]
                        this_dict['LEA'] = standardize.hcdc_lea(raw_lea)
                        this_dict['charge_1'] = data[1 + data.index('Charge 1')]
                    except ValueError as err:
                        print(err, f"({this_dict['link']})")
                    try:
                        img_src = urllib.parse.urljoin(
                            main_url,
                            intake_soup.find('img', {'align': 'middle'})['src'])
                        if requests.get(img_src).headers[
                                'Content-Type'] == 'image/jpeg':
                            this_dict['img_src'] = img_src
                            this_dict['PHOTO'] = [{'url': img_src}]
                        else:
                            print('image source isn\'t actually an image')
                    except TypeError:
                        print('no pic at this time')
                    airtab.insert(this_dict, typecast=True)
                    new_intakes += 1
    wrap_it_up(function='hcdc_scraper', t0=t0, new=new_intakes,
               total=total_intakes)
def get_charges_from_recent_text():
    """Parse the recent_text (or stored HTML) field and extract charges.

    Each jail formats its roster differently, so there is one parsing
    branch per jail. Charges containing commas are wrapped in double
    quotes so the comma-joined result stays unambiguous.
    """
    t0, i = time.time(), 0
    needs_charges_formula = "AND(charges_updated = '', html != '', recent_text != '', hours_since_verification < 72, DONT_DELETE != 'no charges')"
    records = airtab.get_all(formula=needs_charges_formula)
    for record in records:
        this_dict = {}
        if record["fields"]["jail"] == "lcdc":
            # lcdc: charges/bonds/fines live in an HTML table; totals (if
            # any) are in the tfoot.
            charges = []
            bond_ammounts = []
            fine_ammounts = []
            soup = BeautifulSoup(record["fields"]["html"], "html.parser").tbody
            rows = soup.find_all("tr")
            if soup.tfoot:
                goods = rows[:len(rows) - 1]
                this_dict["intake_bond_cash"] = soup.tfoot.find_all(
                    "td")[2].b.string.strip()
                this_dict["intake_fine_ammount"] = soup.tfoot.find_all(
                    "td")[3].b.string.strip()
            else:
                goods = rows
            for row in goods:
                cells = row.find_all("td")
                if cells[0].string.strip():
                    if "," in cells[0].string.strip():
                        charges.append('"' + cells[0].string.strip() + '"')
                    else:
                        charges.append(cells[0].string.strip())
                if cells[2].string.strip():
                    bond_ammounts.append(cells[2].string.strip().replace(",", ""))
                if cells[3].string.strip():
                    fine_ammounts.append(cells[3].string.strip().replace(",", ""))
            if charges:
                this_dict["charges"] = ", ".join(charges)
            if bond_ammounts:
                this_dict["bond_ammounts"] = "\n".join(bond_ammounts)
            if fine_ammounts:
                this_dict["fine_ammounts"] = "\n".join(fine_ammounts)
            airtab.update(record["id"], this_dict, typecast=True)
            i += 1
        elif record["fields"]["jail"] == "kcdc":
            # kcdc: charges are the lines between 'Charges:' and 'Note:'.
            charges = []
            text = record["fields"]["recent_text"]
            goods = text[text.find("Charges:"):text.find("Note:")].splitlines()
            if len(goods) > 1:
                # BUG FIX: the quoted list was built but the raw, unquoted
                # goods[1:] was stored, so comma-bearing charges corrupted
                # the comma-separated field. Store the quoted list instead,
                # skipping the 'Charges:' header line.
                for good in goods[1:]:
                    if "," in good:
                        charges.append('"' + good.strip() + '"')
                    else:
                        charges.append(good)
                this_dict["charges"] = ", ".join(charges)
                airtab.update(record["id"], this_dict)
                i += 1
        elif record["fields"]["jail"] in {"ccdc", "tcdc", "jcdc"}:
            # These jails share a format: charges sit between the
            # '\nCharges:' and '\nBond:' markers.
            charges = []
            text = record["fields"]["recent_text"]
            x = text.find("\nCharges:") + 9
            y = text.find("\nBond:")
            goods = text[x:y].strip().splitlines()
            for line in goods:
                if "," in line:
                    charges.append('"' + line + '"')
                else:
                    charges.append(line)
            this_dict["charges"] = ", ".join(charges)
            airtab.update(record["id"], this_dict)
            i += 1
        elif record["fields"]["jail"] == "hcdc":
            # hcdc: up to four 'Charge N' labels, each followed by its
            # value; a 'Felony / Misd' label means the slot was empty.
            messy = []
            goods = []
            data = record["fields"]["recent_text"].splitlines()
            messy.append(data[data.index("Charge 1") + 1].strip())
            messy.append(data[data.index("Charge 2") + 1].strip())
            messy.append(data[data.index("Charge 3") + 1].strip())
            messy.append(data[data.index("Charge 4") + 1].strip())
            for x in messy:
                if not x.startswith("Felony / Misd"):
                    if "," in x:
                        goods.append('"' + x + '"')
                    else:
                        goods.append(x)
            this_dict["charges"] = ", ".join(goods)
            airtab.update(record["id"], this_dict)
            i += 1
    wrap_it_up(t0, new=i, total=len(records),
               function='get_charges_from_recent_text')