async def add_fields_from_incident_url(df, args, predicate=None):
    """Scrape each row's incident_url and merge the extracted fields into df.

    Rows whose page came back 404 are dropped from df. Rows whose scrape
    failed for any other reason are kept, flagged via the new
    'incident_url_fields_missing' column, and padded with NIL_FIELDS.

    df: stage-1 DataFrame; each row must carry an 'incident_url'.
    args: parsed CLI args — only args.conn_limit is read here.
    predicate: optional boolean mask limiting which rows get scraped.
    Returns df (mutated in place and also returned for chaining).
    """
    log_first_call()

    def field_name(lst):
        # Every Field in a zipped column should share one name if the
        # extractor aligned its output correctly.
        assert len(set(field.name for field in lst)) == 1
        return lst[0].name

    def field_values(lst):
        return [field.value for field in lst]

    subset = df if predicate is None else df.loc[predicate]
    if len(subset) == 0:
        # No work to do
        return df

    async with Stage2Session(limit_per_host=args.conn_limit) as session:
        # list of coros of tuples of Fields
        tasks = subset.apply(session.get_fields_from_incident_url, axis=1)
        # list of (tuples of Fields) and (exceptions)
        fields = await asyncio.gather(*tasks, return_exceptions=True)

    # Temporarily suppress Pandas' SettingWithCopyWarning: subset may be a
    # view of df, and writing through it is intentional here.
    pd.options.mode.chained_assignment = None
    try:
        subset['incident_url_fields_missing'] = [
            isinstance(x, Exception) for x in fields
        ]
        not_found = [
            isinstance(x, ClientResponseError) and x.code == 404
            for x in fields
        ]
        # list of tuples of Fields
        fields = [
            NIL_FIELDS if isinstance(x, Exception) else x for x in fields
        ]
        # tuple of lists of Fields, where each list's Fields should have the
        # same name if the extractor did its job correctly
        fields = zip(*fields)
        fields = [(field_name(lst), field_values(lst)) for lst in fields]
        # BUGFIX: these loop variables used to be named field_name /
        # field_values, shadowing the helper closures above; renamed so the
        # helpers stay usable and the code stops lying about what's in scope.
        for name, values in fields:
            assert subset.shape[0] == len(values)
            subset[name] = values
        subset = subset.astype(SCHEMA)
    finally:
        pd.options.mode.chained_assignment = 'warn'

    if predicate is not None:
        # subset was a filtered copy; write the scraped columns back into df.
        df.loc[subset.index] = subset
    df.drop(index=subset.index[not_found], inplace=True)
    return df
def add_incident_id(df):
    """Insert an 'incident_id' column (parsed from incident_url) at position 0."""
    log_first_call()
    prefix = 'http://www.gunviolencearchive.org/incident/'

    def url_to_id(url):
        # Every incident_url is expected to be <prefix><numeric id>.
        assert url.startswith(prefix)
        return int(url[len(prefix):])

    ids = df['incident_url'].apply(url_to_id)
    df.insert(0, 'incident_id', ids)
    return df
def get_fields_from_incident_url(self, row, driver):
    """Best-effort wrapper around _get_fields_from_incident_url (driver variant).

    Returns the extracted fields for `row`, or None if extraction failed for
    any reason other than being IP-blocked.

    row: a DataFrame row carrying (at least) 'incident_url'.
    driver: the browser/driver handle forwarded to the extractor.
    Raises IpBlocked: re-raised so the caller can stop scraping and back off.
    """
    log_first_call()
    try:
        return self._get_fields_from_incident_url(row, driver)
    except IpBlocked:
        # The caller must see this one — continuing would just burn requests.
        raise
    except Exception:
        # Swallow everything else so one bad incident page doesn't abort the
        # whole scrape; the implicit None return marks "no fields".
        # NOTE(review): the original left its old 404-handling here as an
        # unterminated '''...''' string literal (dead commented-out code);
        # that dead text has been removed — confirm nothing live followed it.
        return None
async def get_fields_from_incident_url(self, row):
    """Await _get_fields_from_incident_url, logging any non-404 failure.

    All exceptions propagate to the caller; this wrapper only adds logging.
    """
    log_first_call()
    try:
        return await self._get_fields_from_incident_url(row)
    except Exception as exc:
        # Passing return_exceptions=True to asyncio.gather() destroys the ability
        # to print them once they're caught, so do that manually here.
        was_404 = isinstance(exc, ClientResponseError) and exc.code == 404
        if not was_404:
            # 404s are handled gracefully downstream, so only noisier
            # failures get logged here.
            self._log_extraction_failed(row['incident_url'])
            tb.print_exc()
        raise
def extract_fields(self, text, ctx):
    """Parse an incident page's HTML and return its normalized Fields.

    text: raw HTML of the incident page.
    ctx: extra context forwarded to the location extractor.
    """
    log_first_call()
    soup = BeautifulSoup(text, features='html5lib')
    collected = [
        *self._extract_location_fields(soup, ctx),
        *self._extract_participant_fields(soup),
        Field('incident_characteristics',
              self._extract_incident_characteristics(soup)),
        Field('notes', self._extract_notes(soup)),
        *self._extract_guns_involved_fields(soup),
        Field('sources', self._extract_sources(soup)),
        *self._extract_district_fields(soup),
    ]
    return _normalize(collected)
async def add_fields_from_incident_url(driver, df, args, predicate=None):
    # Driver-based (synchronous-scrape) variant: rebuilds df row by row,
    # appending the fields scraped from each row's incident_url. Stops early
    # on the first exception (expected to be an IP block).
    # NOTE(review): this function mutates module-level globals `columns` and
    # `incident_ids`, and writes into `data_dict` which is used but never
    # declared global here — presumably a module-level dict that survives
    # across calls so a resumed run keeps earlier rows; confirm at the
    # definition site.
    log_first_call()
    def field_name(lst):
        # All Fields in lst should share one name.
        assert len(set([field.name for field in lst])) == 1
        return lst[0].name
    def field_values(lst):
        return [field.value for field in lst]
    # NOTE(review): the two helpers above are never called in this variant,
    # and the loop below shadows both names with its loop variables.
    subset = df if predicate is None else df.loc[predicate]
    if len(subset) == 0:
        # No work to do
        return df
    async with Stage2Session(limit_per_host=args.conn_limit) as session:
        #ip_is_blocked = False
        global columns
        global incident_ids
        # Start from the existing columns; scraped field names are appended
        # once, while processing the first row (i == 0).
        columns = subset.columns.tolist()
        for i in range(len(subset)):
            row = subset.iloc[i]
            row_to_list = row.tolist()
            try:
                extra_fields = session.get_fields_from_incident_url(
                    row, driver)
                if extra_fields:
                    for field_name, field_values in extra_fields:
                        if i == 0:
                            columns.append(field_name)
                        row_to_list.append(field_values)
                    # Keyed by positional index so from_dict(orient='index')
                    # below reproduces row order.
                    data_dict[i] = row_to_list
                    # Column 0 is assumed to be the incident id here —
                    # TODO confirm against the caller's column layout.
                    incident_ids.append(row_to_list[0])
            except Exception as exc:
                #The only exception it raises is IpBlocked
                #ip_is_blocked = True
                print(str(exc))
                break
    # Rebuild the frame from whatever rows were completed before the break.
    df = pd.DataFrame.from_dict(data_dict, orient='index', columns=columns)
    return df
def load_input(args):
    """Load the input CSV named by args.input_fname, typed per SCHEMA."""
    log_first_call()
    return pd.read_csv(args.input_fname,
                       dtype=SCHEMA,
                       encoding='utf-8',
                       parse_dates=['date'])
def _compute_wait(average_wait, rng_base):
    """Return a randomized wait, log-normally fuzzed around average_wait.

    The wait is rng_base raised to (log_base(average_wait) + gaussian noise),
    rounded up to an int — i.e. average_wait scaled by a random power of
    rng_base.
    """
    log_first_call()
    fuzz = np.random.standard_normal(size=1)[0]
    exponent = math.log(average_wait, rng_base) + fuzz
    return int(np.ceil(rng_base ** exponent))