def get_urls_to_download() -> List[Tuple[str, Dict]]:
    """Get all of the urls that should be downloaded."""
    landing_html = requests.post(LANDING_PAGE, data=_get_landing_data()).text

    # Formatting on the page is extremely weird so its easiest to just take a
    # slice of the data.
    window_start = (landing_html.index(DATE_RANGE_ANCHOR)
                    + len(DATE_RANGE_ANCHOR) + 10)
    window = landing_html[window_start:window_start + 50]

    date_from = date_to = None
    match = re.match(DATE_RANGE_RE, window)
    if match:
        date_from = str_field_utils.parse_date(match.group(1))
        date_to = str_field_utils.parse_date(match.group(2))
    if not (match and date_from and date_to):
        # Defaults used when the page's date range can't be parsed.
        date_from = datetime.date(year=1995, month=9, day=5)
        date_to = aggregate_ingest_utils.subtract_month(
            datetime.date.today().replace(day=1))

    aggregate_urls = []
    for year in range(date_from.year, date_to.year + 1):
        month_from, month_to = 1, 12
        if year == date_from.year:
            month_from = date_from.month
        elif year == date_to.year:
            month_to = date_to.month
        # NOTE(review): 1995/2002 appear to select a reporting-range era used
        # by the PDF endpoint — confirm against _get_pdf_data.
        reporting_range = 1995 if year < 2002 else 2002
        aggregate_urls.append(
            (PDF_URL,
             _get_pdf_data(year, month_from, month_to, reporting_range)))
    return aggregate_urls
def testWrite_SingleCountWithDateAndAllDemographics(self) -> None:
    """Writes a dated single count with all demographics and checks the row."""
    params = {
        "jid": "01001001",
        "ethnicity": Ethnicity.HISPANIC.value,
        "gender": Gender.FEMALE.value,
        "race": Race.BLACK.value,
        "count": 311,
        "date": "2019-01-01",
    }

    response = self.client.get(
        f"/single_count?{urlencode(params)}",
        headers={"X-Appengine-Cron": "test-cron"})
    self.assertEqual(response.status_code, 200)

    # Assert
    with SessionFactory.using_database(
            self.database_key, autocommit=False) as session:
        result = one(session.query(SingleCountAggregate).all())

        self.assertEqual(result.count, params["count"])

        date_str = params["date"]
        if not isinstance(date_str, str):
            raise ValueError(
                f"Unexpected type for date_str: [{type(date_str)}]")
        self.assertEqual(result.date, str_field_utils.parse_date(date_str))
        self.assertEqual(result.ethnicity, params["ethnicity"])
        self.assertEqual(result.gender, params["gender"])
        self.assertEqual(result.race, params["race"])
def testWrite_SingleCountWithDateAndAllDemographics(self):
    """Writes a dated single count with all demographics and checks the row."""
    params = {
        'jid': '01001001',
        'ethnicity': Ethnicity.HISPANIC.value,
        'gender': Gender.FEMALE.value,
        'race': Race.BLACK.value,
        'count': 311,
        'date': '2019-01-01',
    }

    response = self.client.get(
        f'/single_count?{urlencode(params)}',
        headers={'X-Appengine-Cron': 'test-cron'})
    self.assertEqual(response.status_code, 200)

    # Assert
    query = SessionFactory.for_schema_base(JailsBase).query(
        SingleCountAggregate)
    result = one(query.all())

    expected_date = str_field_utils.parse_date(params['date'])
    self.assertEqual(result.count, params['count'])
    self.assertEqual(result.date, expected_date)
    self.assertEqual(result.ethnicity, params['ethnicity'])
    self.assertEqual(result.gender, params['gender'])
    self.assertEqual(result.race, params['race'])
def _parse_date(filename: str) -> datetime.date:
    """Extracts the 7-character date immediately preceding '.pdf' in
    |filename| and snaps it to the last day of that month.

    Raises AggregateDateParsingError if the date cannot be parsed.
    """
    suffix_at = filename.index('.pdf')
    parsed = str_field_utils.parse_date(filename[suffix_at - 7:suffix_at])
    if parsed is None:
        raise AggregateDateParsingError("Could not extract date")
    return aggregate_ingest_utils.on_last_day_of_month(parsed)
def testWrite_SingleCountWithDateAndAllDemographics(self):
    """Writes a dated single count with all demographics and checks the row."""
    params = {
        "jid": "01001001",
        "ethnicity": Ethnicity.HISPANIC.value,
        "gender": Gender.FEMALE.value,
        "race": Race.BLACK.value,
        "count": 311,
        "date": "2019-01-01",
    }

    response = self.client.get(
        f"/single_count?{urlencode(params)}",
        headers={"X-Appengine-Cron": "test-cron"})
    self.assertEqual(response.status_code, 200)

    # Assert
    session = SessionFactory.for_schema_base(JailsBase)
    result = one(session.query(SingleCountAggregate).all())

    self.assertEqual(result.count, params["count"])
    self.assertEqual(result.date,
                     str_field_utils.parse_date(params["date"]))
    self.assertEqual(result.ethnicity, params["ethnicity"])
    self.assertEqual(result.gender, params["gender"])
    self.assertEqual(result.race, params["race"])
def _parse_date(filename: str) -> datetime.date:
    """Parses a report date out of a GCS |filename| and snaps it to the last
    day of that month.

    Raises AggregateDateParsingError if no date can be parsed.
    """
    # Slashes are converted to underscores in the GCS bucket. This
    # assumes there are no underscores in the URL basename.
    base_filename = filename.split('_')[-1].replace('female', '')
    end = base_filename.index('.pdf')
    start = 4
    d = str_field_utils.parse_date(base_filename[start:end])
    if d is None:
        # Fixed: parse_date returns None for unparseable input; previously
        # None was passed straight into on_last_day_of_month.
        raise AggregateDateParsingError(
            f"Could not extract date from filename: {filename}")
    return aggregate_ingest_utils.on_last_day_of_month(d)
def set_date_specific_lsir_fields(
        assessment: StateAssessment) -> StateAssessment:
    """Over time, US_PA has updated the mapping between an LSIR score and the
    associated assessment level. This function sets the appropriate
    assessment_level and assessment_score according to the score and the date
    of the |assessment|, as defined by _DATE_SPECIFIC_ORDERED_LSIR_LEVELS.
    Returns the updated StateAssessment object.
    """
    if not assessment.assessment_score:
        return assessment

    assessment_score = parse_int(assessment.assessment_score)

    if assessment_score == 60:
        # This value indicates the scoring was not completed
        assessment.assessment_score = None
        assessment.assessment_level = "UNKNOWN (60-ATTEMPTED_INCOMPLETE)"
    elif assessment_score == 70:
        # This person either refused to be assessed or did not need to be
        # assessed because they chose not to be released onto parole
        assessment.assessment_score = None
        assessment.assessment_level = "UNKNOWN (70-REFUSED)"
    elif assessment_score > 55:
        # Assessment score number is over the max value of 54, and isn't one
        # of the expected special-case codes (60, 70, 55)
        assessment.assessment_level = (
            f"UNKNOWN ({assessment_score}-SCORE_OUT_OF_RANGE)")
        assessment.assessment_score = None
    else:
        if assessment_score == 55:
            # This should be treated as a 54
            assessment_score = 54
            assessment.assessment_score = "54"

        assessment_date_raw = assessment.assessment_date
        assessment_date = (str_field_utils.parse_date(assessment_date_raw)
                           if assessment_date_raw else None)
        if not assessment_date:
            # At this point we need a valid assessment_date to determine the
            # date-specific LSIR level
            assessment.assessment_level = "UNKNOWN (NO_DATE)"
            return assessment

        for cutoff_date, score_level_map in (
                _DATE_SPECIFIC_ORDERED_LSIR_LEVELS.items()):
            if assessment_date <= cutoff_date:
                for cutoff_score, level in score_level_map.items():
                    if assessment_score <= cutoff_score:
                        assessment.assessment_level = level.value
                        return assessment
        # Fixed: the original raise contained a literal line break inside the
        # f-string (a syntax error); reconstructed as one well-formed message.
        raise ValueError(
            f"Unhandled assessment_score {assessment_score} with "
            f"assessment_date {assessment_date}")

    # Return path for the special-case codes (60, 70, out-of-range) handled
    # above.
    return assessment
def parse_date(filename: str) -> datetime.date:
    """
    Parse the report_date from the filename since the PDF contents can't
    easily be parsed for the date.
    """
    cleaned = filename.replace(' revised', '').replace(' new', '')
    date_str = cleaned.replace('.pdf', '')[-8:]
    result = str_field_utils.parse_date(date_str)
    if result is None:
        raise AggregateDateParsingError("Could not extract date")
    return result
def _parse_date(filename: str) -> datetime.date:
    """Extracts the 7-character date preceding '.pdf' in |filename|, falling
    back to the last 7 characters of the first whitespace-separated token,
    and snaps the result to the last day of that month.

    Raises AggregateDateParsingError if neither format yields a date.
    """
    end = filename.index(".pdf")
    start = end - 7
    try:
        d = str_field_utils.parse_date(filename[start:end])
        if d:
            return aggregate_ingest_utils.on_last_day_of_month(d)
    except Exception:
        pass
    # alternate filename format.
    try:
        d = str_field_utils.parse_date(filename.split()[0][-7:])
        if d:
            return aggregate_ingest_utils.on_last_day_of_month(d)
    except Exception:
        pass
    # Fixed: the message previously contained the literal placeholder text
    # "(unknown)" instead of interpolating the actual filename.
    raise AggregateDateParsingError(
        f"Could not extract date from filename: {filename}")
def _parse_date(filename: str) -> datetime.date:
    """Parses a report date out of a GCS |filename| and snaps it to the last
    day of that month.

    Raises ValueError if the date cannot be parsed.
    """
    # Slashes are converted to underscores in the GCS bucket. This
    # assumes there are no underscores in the URL basename.
    base_filename = filename.split("_")[-1].replace("female", "")
    end = base_filename.index(".pdf")
    start = 4
    d = str_field_utils.parse_date(base_filename[start:end])
    if d is None:
        # Fixed: the message previously contained the literal placeholder text
        # "(unknown)" instead of interpolating the actual filename.
        raise ValueError(
            f"Unexpected null date parsed from filename [{filename}]")
    return aggregate_ingest_utils.on_last_day_of_month(d)
def _date_converter(value: Any) -> datetime.date:
    """Converts |value| to a datetime.date.

    Falsy values default to today's date; dates pass through unchanged;
    anything else goes through the string date parser.
    """
    # Falsy inputs (None, '', 0) default to today's date.
    if not value:
        return datetime.date.today()
    # Already a date: nothing to convert.
    if isinstance(value, datetime.date):
        return value
    parsed = str_field_utils.parse_date(value)
    if parsed is None:
        raise ValueError(f"Failed to parse {value} as a date")
    return parsed
def parse_date(filename: str) -> datetime.date:
    # Hawaii report pdfs have names that start with `Pop-Reports-EOM-`, followed
    # by a 10-character date and possibly another number (version?). For example
    # `Pop-Reports-EOM-2019-03-21.pdf` and `Pop-Reports-EOM-2018-03-31-1.pdf`.
    match = re.search(r".*?Pop-Reports-EOM-([\d-]{10})",
                      filename, re.IGNORECASE)
    if match:
        parsed = str_field_utils.parse_date(match.group(1))
        if parsed:
            return parsed
    raise AggregateDateParsingError("Could not extract date")
def _parse_date(filename: str) -> datetime.date:
    """Parses a report month out of |filename| (normalized to day 1).

    Raises AggregateDateParsingError if no date can be extracted.
    """
    # If this doesn't work, try scraping it from the url name
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = (filename_date.index(DATE_PARSE_ANCHOR_FILENAME)
                 + len(DATE_PARSE_ANCHOR_FILENAME))
        date_str = filename_date[start:]
        # Fixed: str.strip('.pdf') removes any of the characters '.', 'p',
        # 'd', 'f' from *both* ends (e.g. it would eat the leading 'f' of
        # "feb"); only a trailing ".pdf" extension should be removed.
        if date_str.endswith('.pdf'):
            date_str = date_str[:-len('.pdf')]
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)
    raise AggregateDateParsingError("Could not extract date")
def _parse_date(filename: str) -> datetime.date:
    """Extracts the report date from the first page of the PDF at |filename|.

    Raises AggregateDateParsingError if the PDF can't be read or no date is
    found after the anchor line.
    """
    with open(filename, 'rb') as f:
        try:
            first_page = PdfFileReader(f).getPage(0)
            lines = first_page.extractText().split('\n')
        except Exception as e:
            raise AggregateDateParsingError(str(e)) from e

    for index, line in enumerate(lines):
        if DATE_PARSE_ANCHOR not in line:
            continue
        # The date is on the next line if anchor is present on the line
        parsed_date = str_field_utils.parse_date(lines[index + 1])
        if parsed_date:
            return parsed_date
    raise AggregateDateParsingError("Could not extract date")
def date_converter_or_today(value: Any) -> datetime.date:
    """Converts a value to a datetime.date, if possible.

    If the value is falsy, datetime.date.today() is returned. If the value is
    already a datetime.date, it is returned directly, otherwise the value is
    sent to the string utils dateparser.
    """
    # Falsy inputs (None, '', 0) default to today's date.
    if not value:
        return datetime.date.today()
    # Already a date: nothing to convert.
    if isinstance(value, datetime.date):
        return value
    # Otherwise defer to the shared string-based date parser.
    parsed = str_field_utils.parse_date(value)
    if parsed is None:
        raise ValueError(f"Failed to parse {value} as a date")
    return parsed
def testWrite_SingleCountWithDate(self):
    """Writes a dated single count and checks the persisted row."""
    params = {
        'jid': '01001001',
        'count': 311,
        'date': '2019-01-01',
    }

    response = self.client.get(
        f'/single_count?{urlencode(params)}',
        headers={'X-Appengine-Cron': 'test-cron'})
    self.assertEqual(response.status_code, 200)

    # Assert
    query = SessionFactory.for_schema_base(JailsBase).query(
        SingleCountAggregate)
    result = one(query.all())

    expected_date = str_field_utils.parse_date(params['date'])
    self.assertEqual(result.count, params['count'])
    self.assertEqual(result.date, expected_date)
def testWrite_SingleCountWithDate(self):
    """Writes a dated single count and checks the persisted row."""
    params = {
        "jid": "01001001",
        "count": 311,
        "date": "2019-01-01",
    }

    response = self.client.get(
        f"/single_count?{urlencode(params)}",
        headers={"X-Appengine-Cron": "test-cron"})
    self.assertEqual(response.status_code, 200)

    # Assert
    session = SessionFactory.for_schema_base(JailsBase)
    result = one(session.query(SingleCountAggregate).all())

    self.assertEqual(result.count, params["count"])
    self.assertEqual(result.date,
                     str_field_utils.parse_date(params["date"]))
def _parse_date(filename: str) -> datetime.date:
    """Parses a report month out of |filename| (normalized to day 1), falling
    back to a fixed wp-content upload-path pattern.

    Raises AggregateDateParsingError if neither strategy yields a date.
    """
    # If this doesn't work, try scraping it from the url name
    filename_date = filename.lower()
    if DATE_PARSE_ANCHOR_FILENAME in filename_date:
        # The names can be a few formats, the most robust way is to take
        # all of the text after the anchor.
        # (eg. report Jan 2017.pdf)
        start = (filename_date.index(DATE_PARSE_ANCHOR_FILENAME)
                 + len(DATE_PARSE_ANCHOR_FILENAME))
        date_str = filename_date[start:]
        # Fixed: str.strip('.pdf') removes any of the characters '.', 'p',
        # 'd', 'f' from *both* ends (e.g. it would eat the leading 'f' of
        # "feb"); only a trailing ".pdf" extension should be removed.
        if date_str.endswith('.pdf'):
            date_str = date_str[:-len('.pdf')]
        parsed_date = str_field_utils.parse_date(date_str)
        if parsed_date:
            return parsed_date.replace(day=1)
    try:
        return datetime.datetime.strptime(
            filename.split('/')[-1],
            "_wp-content_uploads_%Y_%m_abbrerptcurrent.pdf")
    except ValueError as e:
        raise AggregateDateParsingError("Could not extract date") from e
def convert_field_value(field: attr.Attribute,
                        field_value: Union[str, EnumParser]) -> Any:
    """Converts a raw |field_value| into the value expected by attrs |field|.

    Dispatch order (first match wins):
      1. None passes through unchanged.
      2. Forward-ref / list fields are returned unchanged here.
      3. Blank/whitespace-only strings become None.
      4. A converter registered for this field name in converter_overrides.
      5. EnumParser values are parsed (enum fields only).
      6. Plain strings are converted per the field's declared type
         (str / date / int / bool).

    Raises:
        ValueError: if an override converter's expected type doesn't match the
            actual value type, if an EnumParser is supplied for a non-enum
            field, or if the field's type is otherwise unsupported.
    """
    if field_value is None:
        return None
    # Forward refs and list fields are returned unchanged here — presumably
    # resolved by other machinery; confirm against the caller.
    if is_forward_ref(field) or is_list(field):
        return field_value
    # Treat empty or whitespace-only strings as missing values.
    if isinstance(field_value, str):
        if not field_value or not field_value.strip():
            return None
    # Field-specific override converters take precedence over type dispatch.
    if field.name in converter_overrides:
        converter = converter_overrides[field.name]
        if not isinstance(field_value, converter.field_type):
            raise ValueError(
                f"Found converter for field [{field.name}] in the converter_overrides, but expected "
                f"field type [{converter.field_type}] does not match actual field type "
                f"[{type(field_value)}]")
        return converter.convert(field_value)
    if isinstance(field_value, EnumParser):
        if is_enum(field):
            return field_value.parse()
        raise ValueError(
            f"Found field value [{field_value}] for field that is not an enum [{field}]."
        )
    # Plain-string conversion keyed off the field's declared type.
    if isinstance(field_value, str):
        if is_str(field):
            return normalize(field_value)
        if is_date(field):
            return parse_date(field_value)
        if is_int(field):
            return parse_int(field_value)
        if field.type in {bool, Union[bool, None]}:
            return parse_bool(field_value)
    raise ValueError(f"Unsupported field {field.name}")
def test_parseNoDate(self):
    """A non-date placeholder string parses to None."""
    result = parse_date('None set')
    assert result is None
def test_parseDate_zeroes_weird(self):
    """All-zero date strings parse to None."""
    for text in ('0 0 0', '0000-00-00'):
        assert parse_date(text) is None
def test_parseDate_zeroes(self):
    """An all-zero 8-digit string parses to None."""
    result = parse_date('00000000')
    assert result is None
def test_parseDate(self):
    """A human-readable date string parses to the exact date."""
    expected = datetime.date(year=2018, month=1, day=1)
    assert parse_date('Jan 1, 2018') == expected
def test_parseDate_zeroes(self) -> None:
    """An all-zero 8-digit string parses to None."""
    result = parse_date("00000000")
    assert result is None
def test_parseNoDate(self) -> None:
    """A non-date placeholder string parses to None."""
    result = parse_date("None set")
    assert result is None
def test_parseDate(self) -> None:
    """A human-readable date string parses to the exact date."""
    expected = datetime.date(year=2018, month=1, day=1)
    assert parse_date("Jan 1, 2018") == expected
def test_parseDate_no_separators_part_string_part_number(self) -> None:
    """Month name fused to a year parses to the first of that month."""
    expected = datetime.date(year=2016, month=6, day=1)
    assert parse_date("June2016") == expected
def test_parseDate_space_separators_part_string_part_number(self) -> None:
    """Month name and year separated by a space parse to the first of month."""
    expected = datetime.date(year=2003, month=5, day=1)
    assert parse_date("MAY 2003") == expected
def test_parseDate_no_separators(self) -> None:
    """An 8-digit MMDDYYYY string parses to the exact date."""
    expected = datetime.date(year=2008, month=3, day=12)
    assert parse_date("03122008") == expected
def test_parseDate_zeroes_weird(self) -> None:
    """All-zero date strings parse to None."""
    for text in ("0 0 0", "0000-00-00"):
        assert parse_date(text) is None