コード例 #1
0
def get_pixelated_mug():
    """Upload each record's raw mugshot to cloudinary as a blurred image
    and attach the pixelated version to the airtable record.

    Only processes records whose PHOTO is set, PIXELATED_IMG is empty,
    that were verified within the last 24 hours, and are not from 'jcdc'.
    """
    t0, i = time.time(), 0
    needs_pix_img_formula = "AND(PHOTO != '', PIXELATED_IMG = '', hours_since_verification < 24, jail != 'jcdc')"
    records = airtab.get_all(formula=needs_pix_img_formula)
    for record in records:
        this_dict = {}
        url = record["fields"]["PHOTO"][0]["url"]
        r = requests.get(url)
        content_type = r.headers['Content-Type']
        if content_type == 'image/jpeg':
            try:
                upload_response = uploader.upload(url,
                                                  opacity=40,
                                                  effect="blur:400")
                time.sleep(1)
                this_dict["PIXELATED_IMG"] = [{
                    "url": upload_response['secure_url']
                }]
                airtab.update(record['id'], this_dict)
                # BUG FIX: i was never incremented, so wrap_it_up always
                # reported 0 new pixelated images
                i += 1
            except cloudinary.exceptions.Error as err1:
                print("cloudinary can't accept that shit: ", err1)
            except AttributeError as err2:
                print('Attribute Error for cloudinary upload: ', err2)
        else:
            print('this shit was some really weird content type:',
                  content_type)
    wrap_it_up(t0, new=i, total=len(records), function='get_pixelated_mug')
コード例 #2
0
ファイル: scrapers.py プロジェクト: bfeldman89/jail_scrapers
def update_record(this_dict, soup, m, lea_parser=None, raw_lea=''):
    """Push this_dict to airtable for matched record m.

    When the freshly scraped text differs from what is stored, the record
    is flagged as updated, the prettified html is attached, and (when a
    parser is supplied) the arresting agency is re-parsed.
    """
    text_changed = this_dict['recent_text'] != m['fields']['recent_text']
    if text_changed:
        this_dict['updated'] = True
        this_dict['html'] = soup.prettify()
        if lea_parser:
            lea_parser(raw_lea)
    airtab.update(m['id'], this_dict, typecast=True)
コード例 #3
0
def retry_getting_mugshot():
    """Retry fetching mugshots for recent intakes that still lack a PHOTO.

    Each jail's roster page lays out its mugshot differently, so the img
    lookup is branched per jail. Records are re-saved whether or not a
    photo was found (preserving prior behavior); only successful finds are
    counted.
    """
    t0, i = time.time(), 0
    needs_pic_formula = "AND(PHOTO = '', hours_since_verification < 12, jail != 'lcdc')"
    records = airtab.get_all(formula=needs_pic_formula)
    print("we're gonna retry getting mugs for", len(records), "records...")
    for record in records:
        this_dict = {}
        r = requests.get(record['fields']['link'])
        jail = record['fields']['jail']
        if jail == 'jcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            img_tag = soup.find('div', class_='inmate_profile_image').img
            # the site's own (misspelled) placeholder alt text
            if img_tag['alt'] != 'Image Not Availble':
                this_dict['img_src'] = f"https://www.jonesso.com/{img_tag['src']}"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'hcdc':
            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                img_src = 'http://www.co.hinds.ms.us' + soup.find(
                    'img', {'align': 'middle'})['src']
                if requests.get(
                        img_src).headers['Content-Type'] == 'image/jpeg':
                    this_dict['img_src'] = img_src
                    this_dict['PHOTO'] = [{'url': img_src}]
                else:
                    print('image source isn\'t actually an image')
            except TypeError:
                # soup.find returned None: no img tag in intake html
                print('no img tag in intake html')
        elif jail == 'kcdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(id='cms-content')
            try:
                img_tag = soup.find('img')
            except AttributeError:
                # the cms-content div is missing entirely
                continue
            # BUG FIX: reuse the tag we just found; the original discarded
            # img_tag and re-queried via soup.img (same lookup, done twice)
            if img_tag:
                img_src_raw = img_tag['src']
                if img_src_raw.startswith(
                        'templates/kempercountysheriff.com/images/inmates'):
                    this_dict['img_src'] = f"https://www.kempercountysheriff.com/{img_src_raw}"
                    this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
        elif jail == 'acdc':
            soup = BeautifulSoup(r.text, 'html.parser').find(
                'div', class_='blog-content-container')
            try:
                img_tag = soup.find('img')
                this_dict['img_src'] = img_tag.get('src')
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
            except AttributeError:
                continue
        else:
            print(
                f"awww hell... this one is from the {jail} docket/scraper..."
            )
        if this_dict:
            # BUG FIX: count successes so wrap_it_up reports them (i was
            # never incremented before)
            i += 1
        airtab.update(record['id'], this_dict)
    wrap_it_up(t0, new=i, total=len(records), function='retry_getting_mugshot')
コード例 #4
0
def get_all_intake_deets():
    """Parse demographics and charges out of jcadc recent_text.

    The text splits into a demographics chunk and a criminal-details chunk
    at the 'Request Victim Notification' line; each field is extracted with
    a regex, and every extraction is guarded so one malformed record cannot
    abort the whole run.
    """
    t0, i = time.time(), 0
    jcadc_deets_formula = "AND(jail = 'jcadc', charges = '', recent_text != '')"
    records = airtab.get_all(formula=jcadc_deets_formula, fields='recent_text')
    for record in records:
        charges = []
        bond_ammts = []
        classifications = []
        this_dict = {}
        txt_str = record['fields']['recent_text']
        chunks = txt_str.split('\nRequest Victim Notification\n')
        match_1 = re.search(r"(\w+)\s+(Male|Female)", chunks[0])
        try:
            raw_race = match_1.group(1)
            if raw_race == 'AVAILABLE':
                # source text read 'NOT AVAILABLE'; record race as unknown
                this_dict['race'] = 'U'
            else:
                this_dict['race'] = raw_race[0]
            this_dict['sex'] = match_1.group(2)[0]
        except AttributeError:
            print('there isnt race/sex info')
        try:
            this_dict['intake_weight'] = re.search(r"(\d+) Pounds",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('there isnt weight info')
        try:
            this_dict['intake_height'] = re.search(r"(\d Ft. \d+ In.)",
                                                   chunks[0]).group(1)
        except AttributeError:
            print('idk how tall this person is')
        try:
            this_dict['intake_eye'] = re.search(r"(\w+)\s+Eyes",
                                                chunks[0]).group(1)
        except AttributeError:
            print('eye color is a mystery')
        try:
            # BUG FIX: this was the only unguarded extraction; a missing
            # age crashed the entire run with AttributeError
            this_dict['intake_age'] = re.search(r"(\d\d) Years Old",
                                                chunks[0]).group(1)
        except AttributeError:
            print('age is a mystery')
        # BUG FIX: guard against text without the criminal-details chunk
        # (chunks[1] raised IndexError when the separator was absent)
        crim_details = chunks[1].splitlines() if len(chunks) > 1 else []
        for ln in crim_details:
            results = re.search(r"([MF]\w+) - Bond: (\$.*)", ln)
            if results:
                bond_ammts.append(results.group(2))
                classifications.append(results.group(1))
            elif ', ' in ln:
                # quote charges containing commas so the joined field parses
                charges.append(f"\"{ln}\"")
            else:
                charges.append(ln)
        this_dict['charges'] = ', '.join(charges)
        this_dict['bond_ammounts'] = '\n'.join(bond_ammts)
        this_dict['charge_classifications'] = ', '.join(classifications)
        airtab.update(record['id'], this_dict, typecast=True)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_all_intake_deets')
コード例 #5
0
def pdf_to_dc(quiet=True):
    """Upload locally stored intake PDFs to DocumentCloud and sync metadata.

    For each jail's output directory, uploads every PDF, waits for
    DocumentCloud processing, writes the resulting metadata back to the
    matching airtable record, then trashes (or, for lcdc, archives) the
    local file.

    Args:
        quiet: when False, print per-file progress.
    """
    t0, i = time.time(), 0
    for jail in jails_lst:
        if not quiet:
            print(f"checking {jail[0]}. . .")
        output_path = os.path.join("output", jail[0])
        try:
            ensure_dir(output_path)
        except NotADirectoryError as err:
            print(f"Skipping {jail[0]}: {err}")
            continue
        for fn in glob.glob(os.path.join(output_path, '*.pdf')):
            if not quiet:
                print(f"uploading {fn} . . .")
            try:
                obj = dc.documents.upload(fn)
            except requests.exceptions.ReadTimeout:
                time.sleep(7)
                continue
            obj = dc.documents.get(obj.id)
            # BUG FIX: poll until any terminal status, not just 'success' —
            # the original looped forever if processing ended in 'error'
            # or 'nofile'
            while obj.status not in ('success', 'error', 'nofile'):
                time.sleep(7)
                obj = dc.documents.get(obj.id)
            if obj.status != 'success':
                print(f"documentcloud processing failed for {fn}: {obj.status}")
                continue
            obj.access = "public"
            this_dict = {"jail": jail[0]}
            obj.data = this_dict
            obj.put()
            this_dict["dc_id"] = str(obj.id)
            print(f"successfully uploaded {obj.id}. . .")
            this_dict["dc_title"] = obj.title
            this_dict["dc_access"] = obj.access
            this_dict["dc_pages"] = obj.pages
            this_dict["PDF"] = obj.pdf_url
            this_dict["dc_canonical_url"] = obj.canonical_url
            this_dict["dc_resources_page_image"] = obj.normal_image_url
            try:
                full_text = obj.full_text.decode("utf-8")
            except AttributeError:
                # full_text may already be str on newer client versions
                full_text = obj.full_text
            # strip blank lines from the stored full text
            this_dict["dc_full_text"] = os.linesep.join(
                [s for s in full_text.splitlines() if s])
            record = airtab.match(jail[1], this_dict["dc_title"],
                                  sort=[('dc_id', 'asc'),
                                        ('initial_scrape', 'desc')])
            airtab.update(record["id"], this_dict, typecast=True)
            if jail[0] == 'lcdc':
                # lcdc PDFs are archived for the Daily Journal rather than trashed
                os.rename(fn, f'/Users/blakefeldman/code/daily-journal-jail-data/pdfs/lee/{this_dict["dc_title"]}.pdf')
            else:
                send2trash.send2trash(fn)
            i += 1
            time.sleep(7)
    wrap_it_up(t0, new=i, total=i, function='pdf_to_dc')
コード例 #6
0
def get_full_text():
    """Backfill dc_full_text (plus title/access/pages) for records that have
    a DocumentCloud id but no stored full text."""
    t0, i = time.time(), 0
    records = airtab.get_all(formula="AND(dc_id != '', dc_full_text = '')",
                             fields=['dc_id'])
    for record in records:
        this_dict = {}
        obj = dc.documents.get(record['fields']['dc_id'])
        this_dict["dc_title"] = obj.title
        this_dict["dc_access"] = obj.access
        this_dict["dc_pages"] = obj.pages
        # BUG FIX: full_text may already be str (pdf_to_dc guards the same
        # call); decoding a str raises AttributeError and aborted the run
        try:
            this_dict["dc_full_text"] = obj.full_text.decode("utf-8")
        except AttributeError:
            this_dict["dc_full_text"] = obj.full_text
        airtab.update(record["id"], this_dict)
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='get_full_text')
コード例 #7
0
def update_dc_fields():
    """Refresh the DocumentCloud URL fields for records in the
    'need dc urls updated' view (up to 100 per run)."""
    records = airtab.get_all(view='need dc urls updated',
                             fields='dc_id',
                             max_records=100)
    print(len(records), ' records need updated documentcloud URLs.')
    for record in records:
        dc_id = record['fields'].get('dc_id')
        if not dc_id:
            # BUG FIX: .get() can return None; the original passed it
            # straight to dc.documents.get and crashed
            print(f"skipping {record['id']}: no dc_id")
            continue
        obj = dc.documents.get(dc_id)
        this_dict = {
            "PDF": obj.pdf_url,
            "dc_canonical_url": obj.canonical_url,
            "dc_resources_page_image": obj.normal_image_url,
        }
        airtab.update(record['id'], this_dict)
        time.sleep(.3)
コード例 #8
0
def update_summary(this_many=150):
    """Copy each record's 'blurb' into its 'summary' text field.

    The reason we have this field, rather than just use the 'blurb' field,
    is bc the gallery view works better with a text field than it does with
    a formula field. Because this view will regularly be packed full of
    records, the run is capped at this_many records (default 150).
    """
    t0, i = time.time(), 0
    records = airtab.get_all(view='needs updated summary',
                             fields="blurb",
                             max_records=this_many)
    for record in records:
        this_dict = {"summary": record["fields"]["blurb"]}
        airtab.update(record["id"], this_dict)
        # BUG FIX: i was never incremented, so wrap_it_up always reported 0
        i += 1
    wrap_it_up(t0, new=i, total=len(records), function='update_summary')
コード例 #9
0
def remove_weird_character():
    """Strip the fragment that begins at a stray 'ã' character (through the
    next newline) out of recent_text for records verified over 12h ago."""
    started, fixed = time.time(), 0
    remove_wierd_character_formula = "AND(hours_since_verification > 12, FIND('ã', recent_text) > 1)"
    records = airtab.get_all(formula=remove_wierd_character_formula,
                             fields='recent_text')
    for record in records:
        text = record['fields']['recent_text']
        start = text.find('ã')
        stop = text.find('\n', start)
        # drop every occurrence of the garbled span
        cleaned = text.replace(text[start:stop], '')
        airtab.update(record['id'], {'recent_text': cleaned})
        fixed += 1
    wrap_it_up(started,
               new=fixed,
               total=len(records),
               function='remove_weird_character')
コード例 #10
0
def fix_charges_to_by_lines():
    """Rewrite the comma-separated charges field as one charge per line,
    un-quoting charges that contain internal commas."""
    started, fixed = time.time(), 0
    records = airtab.get_all(
        formula="AND(TEST_FORMULA != '', TEST_RESULT = '')", fields='charges')
    for record in records:
        raw = record['fields']['charges']
        # quoted charges become their own lines; the rest stay comma-joined
        lines = raw.replace('", ', '"\n').replace(', "', '\n"').splitlines()
        cleaned = []
        for line in lines:
            if line.startswith('"'):
                cleaned.append(line.replace('"', ''))
            else:
                cleaned.extend(line.split(', '))
        airtab.update(record['id'], {'TEST_RESULT': '\n'.join(cleaned)})
        fixed += 1
    wrap_it_up(started,
               new=fixed,
               total=len(records),
               function='fix_charges_to_by_lines')
コード例 #11
0
def get_dor_if_possible(this_many=50):
    """Check recently verified intakes from four jails for a posted release
    date; when found, archive a final PDF of the page and store the DOR.

    Args:
        this_many: maximum number of records to check per run.
    """
    t0, i = time.time(), 0
    dor_formula = "AND(OR(jail = 'kcdc', jail = 'tcdc', jail = 'ccdc', jail = 'jcdc'), DOR = '', hours_since_verification > 6, hours_since_verification < 48)"
    records = airtab.get_all(formula=dor_formula, max_records=this_many)
    total = len(records)
    for record in records:
        this_dict = {}
        try:
            r = requests.get(record["fields"]["link"])
        except requests.ConnectionError as err:
            print(f"Skipping {record['fields']['link']}: {err}")
            time.sleep(5)
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        data = [str(string) for string in soup.stripped_strings]
        if "Release Date:" in data:
            options = {
                "quiet": "",
                "footer-font-size": 10,
                "footer-left": record["fields"]["link"],
                "footer-right": time.strftime('%c'),
            }
            directory = f"./output/{record['fields']['jail']}/updated"
            try:
                ensure_dir(directory)
                file_name = f"{record['fields']['bk']} (final).pdf"
                fn = os.path.join(directory, file_name)
                pdfkit.from_url(record["fields"]["link"], fn, options=options)
            except NotADirectoryError as err:
                print(f"Can't write PDF: {err}")
            # the release date value sits on the line after its label
            raw_dor = data[1 + data.index("Release Date:")]
            try:
                this_dict["DOR"] = datetime.datetime.strptime(
                    raw_dor, "%m-%d-%Y - %I:%M %p").strftime('%m/%d/%Y %H:%M')
            except ValueError as err:
                # BUG FIX: a malformed date no longer aborts the whole run
                print(f"Can't parse release date {raw_dor!r}: {err}")
                continue
            airtab.update(record["id"], this_dict)
            i += 1
    wrap_it_up(t0, i, total, function='get_dor_if_possible')
コード例 #12
0
def parse_charge_1():
    """Split charge_1 into statute and title for mcdc/prcdf intakes.

    The statute/title boundary is a ')' or digit immediately followed by an
    uppercase letter; the boundary character stays with the statute and the
    uppercase letter starts the title.
    """
    t0, i = time.time(), 0
    needs_charge_1_parsed_formula = "AND(OR(jail = 'mcdc', jail = 'prcdf'), charge_1_statute = '', hours_since_initial_scrape < 48, charge_1 != '', charge_1 != 'HOLDHOLD', charge_1 != 'DRUGDRUG COURT', charge_1 != 'HLD Other AgencyHold for other Agency')"
    records = airtab.get_all(formula=needs_charge_1_parsed_formula)
    for record in records:
        this_dict = {}
        charge = record["fields"]["charge_1"]
        # run each regex once — the original evaluated every search twice
        # (once in the condition, again in the body)
        x = re.search(r"[)][A-Z]", charge) or re.search(r"[0-9][A-Z]", charge)
        if x:
            this_dict["charge_1_statute"] = charge[:x.start() + 1]
            this_dict["charge_1_title"] = charge[x.end() - 1:]
            try:
                airtab.update(record["id"], this_dict)
                i += 1
            except requests.exceptions.HTTPError as err:
                print(err)
                continue
    wrap_it_up(t0, new=i, total=len(records), function='parse_charge_1')
コード例 #13
0
ファイル: scrapers.py プロジェクト: bfeldman89/jail_scrapers
def ccj_scraper():
    """Scrape the Calhoun County Jail roster and upsert intakes to airtable.

    New intakes are inserted; intakes whose scraped text matches an existing
    record are updated and flagged as updated.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    main_url = 'http://www.calhounso.org/page.php?id=7'
    r = requests.get(main_url, headers=muh_headers)
    soup = BeautifulSoup(r.text, 'html.parser').find(id='cms_body_content')
    try:
        intakes = soup.table.tbody.find_all('td')
    except AttributeError:
        # content div/table missing (layout changed or page empty): nothing to scrape
        return
    for intake in intakes:
        total_intakes += 1
        this_dict = {'jail': 'ccj'}
        data = []
        link = intake.find('a')
        if link:
            this_dict['link'] = link.get('href')
            # the vinelink URL ends in the offender ID; strip the prefix to get bk
            this_dict['bk'] = this_dict['link'].replace(
                'http://www.vinelink.com/vinelink/servlet/SubjectSearch?siteID=25000&agency=18&offenderID=',
                '')
        for string in intake.stripped_strings:
            # skip dot-leader filler strings like '.....'
            if string.startswith('.') and string.endswith('.'):
                pass
            else:
                data.append(str(string))
        try:
            get_name(data[0], this_dict)
        except IndexError:
            print('skipping empty td')
            continue
        raw_doi = data[1]
        # intakes dated today get a real timestamp; older ones get end-of-day
        if raw_doi == date.today().strftime('%m/%d/%Y'):
            this_dict['DOI'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
        else:
            this_dict['DOI'] = f"{raw_doi} 11:59pm"
        remaining_data = ' '.join(data[2:])
        # everything before 'Released' is charges; everything after is the DOR
        DOR_index = remaining_data.find('Released')
        if DOR_index != -1:
            this_dict['charges'] = remaining_data[:DOR_index].strip()
            raw_dor = remaining_data[DOR_index + 8:].strip()
            if raw_dor == date.today().strftime('%m/%d/%Y'):
                this_dict['DOR'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
            else:
                this_dict['DOR'] = f"{raw_dor} 12:01am"
        else:
            this_dict['charges'] = remaining_data
        this_dict['recent_text'] = '\n'.join(data)
        this_dict['html'] = intake.prettify()
        this_dict['last_verified'] = (datetime.utcnow().replace(
            tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
        # dedupe on the full scraped text rather than a booking number
        m = airtab.match('recent_text', this_dict['recent_text'], view='ccj')
        if not m:
            airtab.insert(this_dict, typecast=True)
            new_intakes += 1
        else:
            this_dict['updated'] = True
            airtab.update(m['id'], this_dict, typecast=True)
        time.sleep(0.2)
    wrap_it_up(function='ccj_scraper',
               t0=t0,
               new=new_intakes,
               total=total_intakes)
コード例 #14
0
ファイル: scrapers.py プロジェクト: bfeldman89/jail_scrapers
def jcadc_scraper():
    """Scrape the Jackson County ADC jail docket.

    Fetches the intake count, pages through the list (15 intakes per page),
    updates existing records (matched on booking number) and inserts new
    ones with details scraped from each intake's detail page.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    root = 'https://services.co.jackson.ms.us/jaildocket'
    r = requests.post(f"{root}/_inmateList.php?Function=count",
                      headers=muh_headers)
    total_intakes = r.json()
    # BUG FIX: round up — int(total/15) skipped the final partial page,
    # silently dropping up to 14 intakes per run
    last_page = -(-total_intakes // 15)
    pages = list(range(1, last_page + 1))
    # for some reason page 1 cannot be the first page visited sometimes
    random.shuffle(pages)
    for pg in pages:
        r = requests.post(
            f"{root}/_inmateList.php?Function=list&Page={pg}&Order=BookDesc&Search=0",
            headers=muh_headers)
        try:
            intakes = r.json()
        except ValueError as err:
            print(err)
            continue
        for intake in intakes:
            data = []
            this_dict = {'jail': 'jcadc', '_jail': ['recwShIgdZDcf4ZcJ']}
            this_dict['bk'] = intake["Book_Number"]
            this_dict['last_verified'] = (datetime.utcnow().replace(
                tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
            this_dict['intake_number'] = intake["ID_Number"].strip()
            this_dict[
                'link'] = f"{root}/inmate/_inmatedetails.php?id={this_dict['intake_number']}"
            r = requests.get(this_dict['link'], headers=muh_headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for string in soup.stripped_strings:
                data.append(string)
            this_dict['recent_text'] = '\n'.join(data[1:])
            m = airtab.match('bk', this_dict['bk'])
            if m:
                # already known: just refresh verification time and text
                airtab.update(m['id'], this_dict, typecast=True)
            else:
                raw_name = (
                    f"{intake['Name_First_MI']} {intake['Name_Middle']} "
                    f"{intake['Name_Last']} {intake['Name_Suffix']}")
                get_name(raw_name, this_dict)
                raw_doi = intake["BookDate"]
                # intakes dated today get a real timestamp; older ones end-of-day
                if raw_doi == date.today().strftime('%m/%d/%Y'):
                    this_dict['DOI'] = datetime.now().strftime(
                        '%m/%d/%Y %I:%M%p')
                else:
                    this_dict['DOI'] = f"{raw_doi} 11:59pm"
                this_dict['DOA'] = intake["ArrestDate"]
                this_dict['LEA'] = standardize.jcadc_lea(
                    intake["Arrest_Agency"])
                articles = soup.find_all('article')
                this_dict['html'] = (
                    "<html>\n<body>\n"
                    f"{articles[0].prettify()}\n{articles[1].prettify()}\n"
                    "</body>\n</html>")
                this_dict[
                    'img_src'] = f"{root}/inmate/{this_dict['intake_number']}.jpg"
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
                airtab.insert(this_dict, typecast=True)
                new_intakes += 1
    wrap_it_up(function='jcadc_scraper',
               t0=t0,
               new=new_intakes,
               total=total_intakes)
コード例 #15
0
ファイル: scrapers.py プロジェクト: bfeldman89/jail_scrapers
def jcj_scraper():
    """Scrape the Jasper County Jail roster and 48-hour-release pages,
    inserting new intakes and updating existing ones (matched on bk)."""
    t0, new_intakes, total_intakes = time.time(), 0, 0
    urls = [
        'http://jasperso.com/inmate-roster/',
        'http://jasperso.com/48-hour-release/'
    ]
    for url in urls:
        r = requests.get(url, headers=muh_headers)
        soup = BeautifulSoup(r.text, 'html.parser').find('div', id='primary')
        intakes = soup.find_all('div', class_='col-sm-4 inmate')
        total_intakes += len(intakes)
        for x in intakes:
            this_dict = {'jail': 'jcj', '_jail': ['recwzuzsimZuPpVR5']}
            get_name(x.h1.string.strip(), this_dict)
            this_dict['link'] = url
            data = []
            for string in x.stripped_strings:
                data.append(str(string))
            # each labeled value sits on the line after its label
            this_dict['intake_number'] = data[1 + data.index('Arrest #:')]
            this_dict['bk'] = data[1 + data.index('Arrest #:')]
            raw_doi = data[1 + data.index('Arrest Date:')]
            # intakes dated today get a real timestamp; older ones get end-of-day
            if raw_doi == date.today().strftime('%m/%d/%Y'):
                this_dict['DOI'] = datetime.now().strftime('%m/%d/%Y %I:%M%p')
            else:
                this_dict['DOI'] = f"{raw_doi} 11:59pm"
            if 'Release Date:' in data:
                raw_dor = data[1 + data.index('Release Date:')]
                if raw_dor == date.today().strftime('%m/%d/%Y'):
                    this_dict['DOR'] = datetime.now().strftime(
                        '%m/%d/%Y %I:%M%p')
                else:
                    this_dict['DOR'] = f"{raw_dor} 12:01am"
            # first letter only for sex and race codes
            this_dict['sex'] = data[1 + data.index('Gender:')].strip()[0:1]
            this_dict['race'] = data[1 + data.index('Race:')].strip()[0:1]
            this_dict['intake_age'] = int(data[1 + data.index('Age:')])
            cleaned_charges = []
            # everything after the 'Charges:' label is a charge line
            charges = data[1 + data.index('Charges:'):]
            for charge in charges:
                # quote charges containing commas so the comma-joined field parses
                if ', ' in charge:
                    cleaned_charge = f"\"{charge}\""
                else:
                    cleaned_charge = charge
                cleaned_charges.append(cleaned_charge)
            this_dict['charges'] = ', '.join(cleaned_charges)
            this_dict['recent_text'] = '\n'.join(data)
            this_dict['html'] = x.prettify()
            this_dict['last_verified'] = (datetime.utcnow().replace(
                tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
            raw_lea = data[1 + data.index('Arrest Agency:')]
            m = airtab.match('bk', this_dict['bk'], view='jcj')
            if not m:
                this_dict['img_src'] = x.find('img').get('src')
                this_dict['PHOTO'] = [{'url': this_dict['img_src']}]
                this_dict['LEA'] = standardize.jcj_lea(raw_lea)
                airtab.insert(this_dict, typecast=True)
                new_intakes += 1
            else:
                # only flag as updated (and re-parse LEA) when the text changed
                if this_dict['recent_text'] != m['fields']['recent_text']:
                    this_dict['updated'] = True
                    this_dict['LEA'] = standardize.jcj_lea(raw_lea)
                else:
                    pass
                airtab.update(m['id'], this_dict, typecast=True)
            time.sleep(0.2)
    wrap_it_up(function='jcj_scraper',
               t0=t0,
               new=new_intakes,
               total=total_intakes)
コード例 #16
0
ファイル: scrapers.py プロジェクト: bfeldman89/jail_scrapers
def hcdc_scraper():
    """Scrape the Hinds County Detention Center inmate list.

    Walks every page of the list; rows already in airtable (matched on bk)
    only get last_verified refreshed, while new rows are scraped in full
    from their detail page and inserted.
    """
    t0, new_intakes, total_intakes = time.time(), 0, 0
    main_url = 'http://www.co.hinds.ms.us/pgs/apps/inmate/inmate_list.asp'
    try:
        r = requests.get(main_url)
    except requests.ConnectionError as err:
        print(f"Skipping {main_url}: {err}")
        time.sleep(5)
        return
    if r.status_code == 500:
        # the site intermittently 500s; dump the body and try again next run
        print(r.text)
        return
    soup1 = BeautifulSoup(r.text, 'html.parser')
    # the h3 header reads like 'Page 1 of N'; grab N
    total_pages = int(soup1.h3.string.split()[3])
    pages = list(range(1, total_pages + 1))
    for page in pages:
        param_str = f"name_sch=Date&SS1=1&ScrollAction=Page+{page}"
        r = requests.get(f"{main_url}?{param_str}")
        soup2 = BeautifulSoup(r.text, 'html.parser')
        rows = soup2.find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            # only 7-cell rows are actual inmate rows (headers/footers differ)
            if len(cells) == 7:
                total_intakes += 1
                this_dict = {'jail': 'hcdc', '_jail': ['recJLBoeZlp4IYn4I']}
                this_dict['bk'] = row.a.get('href').replace(
                    'inmate_detail.asp?ID=', '')
                this_dict['last_verified'] = (datetime.utcnow().replace(
                    tzinfo=timezone.utc).strftime('%m/%d/%Y %H:%M'))
                m = airtab.match('bk',
                                 this_dict['bk'],
                                 view='hcdc',
                                 fields='recent_text')
                if m:
                    # already known: just refresh last_verified
                    airtab.update(m['id'], this_dict)
                else:
                    this_dict['link'] = urllib.parse.urljoin(
                        main_url, row.a.get('href'))
                    try:
                        r = requests.get(this_dict['link'])
                    except requests.ConnectionError as err:
                        print(f"Skipping {this_dict['link']}: {err}")
                        time.sleep(5)
                        continue
                    intake_soup = BeautifulSoup(r.text, 'html.parser')
                    data = []
                    this_dict['html'] = intake_soup.find_all(
                        'table')[1].prettify()
                    for string in intake_soup.stripped_strings:
                        data.append(string)
                    try:
                        # keep only the text between the 'Name' label and the
                        # trailing disclaimer
                        this_dict['recent_text'] = '\n'.join(
                            data[data.index('Name'):data.index('Disclaimer:')])
                    except ValueError:
                        this_dict['recent_text'] = ''
                    try:
                        # each labeled value sits on the line after its label
                        get_name(data[1 + data.index('Name')], this_dict)
                        this_dict['intake_address_line_1'] = data[
                            1 + data.index('Address')]
                        this_dict['intake_address_line_2'] = data[
                            2 + data.index('Address')]
                        this_dict['DOB'] = data[1 +
                                                data.index('Date of Birth')]
                        this_dict['sex'] = data[1 + data.index('Sex')]
                        # 'Height' right after 'Race' means the race cell was blank
                        if data[1 + data.index('Race')] != 'Height':
                            this_dict['race'] = data[1 + data.index('Race')]
                        raw_doi = data[1 + data.index('Arrest Date')]
                        # today's date gets a real timestamp; '//' means no date
                        if raw_doi == date.today().strftime('%m/%d/%Y'):
                            this_dict['DOI'] = datetime.now().strftime(
                                '%m/%d/%Y %I:%M%p')
                        elif raw_doi == '//':
                            pass
                        else:
                            this_dict['DOI'] = f"{raw_doi} 11:59pm"
                        raw_lea = data[1 + data.index('Arresting Agency')]
                        this_dict['LEA'] = standardize.hcdc_lea(raw_lea)
                        this_dict['charge_1'] = data[1 +
                                                     data.index('Charge 1')]
                    except ValueError as err:
                        print(err, f"({this_dict['link']})")
                    try:
                        img_src = urllib.parse.urljoin(
                            main_url,
                            intake_soup.find('img',
                                             {'align': 'middle'})['src'])
                        # verify the link actually serves a jpeg before storing it
                        if requests.get(img_src).headers[
                                'Content-Type'] == 'image/jpeg':
                            this_dict['img_src'] = img_src
                            this_dict['PHOTO'] = [{'url': img_src}]
                        else:
                            print('image source isn\'t actually an image')
                    except TypeError:
                        # find() returned None: no mugshot on the page
                        print('no pic at this time')
                    airtab.insert(this_dict, typecast=True)
                    new_intakes += 1
    wrap_it_up(function='hcdc_scraper',
               t0=t0,
               new=new_intakes,
               total=total_intakes)
コード例 #17
0
def get_charges_from_recent_text():
    """Parse the recent_text field and extract the listed charges.

    Each jail's roster text has its own layout, so extraction is branched
    per jail. Charges containing commas are wrapped in double quotes so
    they survive the comma-joined airtable field.
    """
    t0, i = time.time(), 0
    needs_charges_formula = "AND(charges_updated = '', html != '', recent_text != '', hours_since_verification < 72, DONT_DELETE != 'no charges')"
    records = airtab.get_all(formula=needs_charges_formula)
    for record in records:
        this_dict = {}
        if record["fields"]["jail"] == "lcdc":
            charges = []
            bond_ammounts = []
            fine_ammounts = []
            soup = BeautifulSoup(record["fields"]["html"], "html.parser").tbody
            rows = soup.find_all("tr")
            if soup.tfoot:
                # the footer row carries the totals, not a charge
                goods = rows[:len(rows) - 1]
                this_dict["intake_bond_cash"] = soup.tfoot.find_all(
                    "td")[2].b.string.strip()
                this_dict["intake_fine_ammount"] = soup.tfoot.find_all(
                    "td")[3].b.string.strip()
            else:
                goods = rows
            for row in goods:
                cells = row.find_all("td")
                if cells[0].string.strip():
                    if "," in cells[0].string.strip():
                        charges.append('"' + cells[0].string.strip() + '"')
                    else:
                        charges.append(cells[0].string.strip())
                if cells[2].string.strip():
                    bond_ammounts.append(cells[2].string.strip().replace(
                        ",", ""))
                if cells[3].string.strip():
                    fine_ammounts.append(cells[3].string.strip().replace(
                        ",", ""))
            if charges:
                this_dict["charges"] = ", ".join(charges)
            if bond_ammounts:
                this_dict["bond_ammounts"] = "\n".join(bond_ammounts)
            if fine_ammounts:
                this_dict["fine_ammounts"] = "\n".join(fine_ammounts)
            airtab.update(record["id"], this_dict, typecast=True)
            i += 1
        elif record["fields"]["jail"] == "kcdc":
            charges = []
            text = record["fields"]["recent_text"]
            goods = text[text.find("Charges:"):text.find("Note:")].splitlines()
            if len(goods) > 1:
                # BUG FIX: the quoted list was built but the unquoted
                # goods[1:] was stored, so comma-bearing charges broke the
                # joined field; store the quoted list like every other branch
                for good in goods[1:]:  # goods[0] is the 'Charges:' header
                    if "," in good:
                        charges.append('"' + good.strip() + '"')
                    else:
                        charges.append(good)
                this_dict["charges"] = ", ".join(charges)
                airtab.update(record["id"], this_dict)
                i += 1
        elif record["fields"]["jail"] in {"ccdc", "tcdc", "jcdc"}:
            charges = []
            text = record["fields"]["recent_text"]
            # charges sit between the 'Charges:' and 'Bond:' labels
            x = text.find("\nCharges:") + 9
            y = text.find("\nBond:")
            goods = text[x:y].strip().splitlines()
            for line in goods:
                if "," in line:
                    charges.append('"' + line + '"')
                else:
                    charges.append(line)
            this_dict["charges"] = ", ".join(charges)
            airtab.update(record["id"], this_dict)
            i += 1
        elif record["fields"]["jail"] == "hcdc":
            messy = []
            goods = []
            data = record["fields"]["recent_text"].splitlines()
            # hcdc pages always list exactly four charge slots
            messy.append(data[data.index("Charge 1") + 1].strip())
            messy.append(data[data.index("Charge 2") + 1].strip())
            messy.append(data[data.index("Charge 3") + 1].strip())
            messy.append(data[data.index("Charge 4") + 1].strip())
            for x in messy:
                # an empty slot holds the next label instead of a charge
                if not x.startswith("Felony / Misd"):
                    if "," in x:
                        goods.append('"' + x + '"')
                    else:
                        goods.append(x)
            this_dict["charges"] = ", ".join(goods)
            airtab.update(record["id"], this_dict)
            i += 1
    wrap_it_up(t0,
               new=i,
               total=len(records),
               function='get_charges_from_recent_text')